summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMarko Mäkelä <marko.makela@mariadb.com>2023-04-18 16:19:39 +0300
committerMarko Mäkelä <marko.makela@mariadb.com>2023-04-18 16:19:39 +0300
commit821e3d52c0e7405aea7054e138604ae1a4e8c921 (patch)
tree1dbf11353bfd7ca04e176bc719e3dcec42eafe11
parent6c196090c8c265bfd93df1e2ee6b18cda2b1d2d8 (diff)
downloadmariadb-git-10.56-MDEV-30986.tar.gz
MDEV-30986 hybrid build with 10.6 InnoDB10.56-MDEV-30986
-rw-r--r--extra/CMakeLists.txt2
-rw-r--r--extra/innochecksum.cc125
-rw-r--r--extra/mariabackup/CMakeLists.txt7
-rw-r--r--extra/mariabackup/backup_copy.cc151
-rw-r--r--extra/mariabackup/backup_debug.h18
-rw-r--r--extra/mariabackup/backup_mysql.cc183
-rw-r--r--extra/mariabackup/changed_page_bitmap.cc11
-rw-r--r--extra/mariabackup/ds_stdout.cc2
-rw-r--r--extra/mariabackup/fil_cur.cc46
-rw-r--r--extra/mariabackup/xbstream_read.cc2
-rw-r--r--extra/mariabackup/xbstream_write.cc2
-rw-r--r--extra/mariabackup/xtrabackup.cc982
-rw-r--r--extra/mariabackup/xtrabackup.h12
-rw-r--r--extra/my_print_defaults.c2
-rw-r--r--extra/resolve_stack_dump.c2
-rw-r--r--extra/wolfssl/CMakeLists.txt4
-rw-r--r--include/aligned.h38
-rw-r--r--include/my_atomic_wrapper.h1
-rw-r--r--storage/innobase/CMakeLists.txt94
-rw-r--r--storage/innobase/btr/btr0btr.cc3329
-rw-r--r--storage/innobase/btr/btr0bulk.cc132
-rw-r--r--storage/innobase/btr/btr0cur.cc5713
-rw-r--r--storage/innobase/btr/btr0defragment.cc517
-rw-r--r--storage/innobase/btr/btr0pcur.cc418
-rw-r--r--storage/innobase/btr/btr0sea.cc652
-rw-r--r--storage/innobase/buf/buf0block_hint.cc16
-rw-r--r--storage/innobase/buf/buf0buddy.cc43
-rw-r--r--storage/innobase/buf/buf0buf.cc2729
-rw-r--r--storage/innobase/buf/buf0checksum.cc36
-rw-r--r--storage/innobase/buf/buf0dblwr.cc137
-rw-r--r--storage/innobase/buf/buf0dump.cc53
-rw-r--r--storage/innobase/buf/buf0flu.cc1438
-rw-r--r--storage/innobase/buf/buf0lru.cc398
-rw-r--r--storage/innobase/buf/buf0rea.cc374
-rw-r--r--storage/innobase/dict/dict0boot.cc288
-rw-r--r--storage/innobase/dict/dict0crea.cc1134
-rw-r--r--storage/innobase/dict/dict0defrag_bg.cc383
-rw-r--r--storage/innobase/dict/dict0dict.cc1523
-rw-r--r--storage/innobase/dict/dict0load.cc2699
-rw-r--r--storage/innobase/dict/dict0mem.cc133
-rw-r--r--storage/innobase/dict/dict0stats.cc2257
-rw-r--r--storage/innobase/dict/dict0stats_bg.cc370
-rw-r--r--storage/innobase/dict/drop.cc288
-rw-r--r--storage/innobase/eval/eval0eval.cc15
-rw-r--r--storage/innobase/fil/fil0crypt.cc1033
-rw-r--r--storage/innobase/fil/fil0fil.cc1918
-rw-r--r--storage/innobase/fil/fil0pagecompress.cc1
-rw-r--r--storage/innobase/fsp/fsp0file.cc386
-rw-r--r--storage/innobase/fsp/fsp0fsp.cc2371
-rw-r--r--storage/innobase/fsp/fsp0space.cc24
-rw-r--r--storage/innobase/fsp/fsp0sysspace.cc90
-rw-r--r--storage/innobase/fts/fts0ast.cc3
-rw-r--r--storage/innobase/fts/fts0config.cc8
-rw-r--r--storage/innobase/fts/fts0fts.cc1437
-rw-r--r--storage/innobase/fts/fts0opt.cc301
-rw-r--r--storage/innobase/fts/fts0que.cc42
-rw-r--r--storage/innobase/fts/fts0sql.cc62
-rw-r--r--storage/innobase/fut/fut0lst.cc232
-rw-r--r--storage/innobase/gis/gis0rtree.cc538
-rw-r--r--storage/innobase/gis/gis0sea.cc1211
-rw-r--r--storage/innobase/handler/ha_innodb.cc4523
-rw-r--r--storage/innobase/handler/ha_innodb.h77
-rw-r--r--storage/innobase/handler/handler0alter.cc2666
-rw-r--r--storage/innobase/handler/i_s.cc1412
-rw-r--r--storage/innobase/handler/i_s.h41
-rw-r--r--storage/innobase/ibuf/ibuf0ibuf.cc1099
-rw-r--r--storage/innobase/include/btr0btr.h358
-rw-r--r--storage/innobase/include/btr0btr.inl46
-rw-r--r--storage/innobase/include/btr0bulk.h4
-rw-r--r--storage/innobase/include/btr0cur.h344
-rw-r--r--storage/innobase/include/btr0cur.inl53
-rw-r--r--storage/innobase/include/btr0defragment.h24
-rw-r--r--storage/innobase/include/btr0pcur.h461
-rw-r--r--storage/innobase/include/btr0pcur.inl355
-rw-r--r--storage/innobase/include/btr0sea.h89
-rw-r--r--storage/innobase/include/btr0sea.inl61
-rw-r--r--storage/innobase/include/btr0types.h103
-rw-r--r--storage/innobase/include/buf0block_hint.h2
-rw-r--r--storage/innobase/include/buf0buf.h1366
-rw-r--r--storage/innobase/include/buf0buf.inl271
-rw-r--r--storage/innobase/include/buf0checksum.h18
-rw-r--r--storage/innobase/include/buf0dblwr.h29
-rw-r--r--storage/innobase/include/buf0flu.h64
-rw-r--r--storage/innobase/include/buf0lru.h17
-rw-r--r--storage/innobase/include/buf0rea.h9
-rw-r--r--storage/innobase/include/buf0types.h140
-rw-r--r--storage/innobase/include/data0data.h6
-rw-r--r--storage/innobase/include/data0type.h31
-rw-r--r--storage/innobase/include/data0type.inl131
-rw-r--r--storage/innobase/include/db0err.h5
-rw-r--r--storage/innobase/include/dict0boot.h99
-rw-r--r--storage/innobase/include/dict0crea.h75
-rw-r--r--storage/innobase/include/dict0defrag_bg.h15
-rw-r--r--storage/innobase/include/dict0dict.h534
-rw-r--r--storage/innobase/include/dict0dict.inl39
-rw-r--r--storage/innobase/include/dict0load.h135
-rw-r--r--storage/innobase/include/dict0mem.h391
-rw-r--r--storage/innobase/include/dict0mem.inl7
-rw-r--r--storage/innobase/include/dict0stats.h115
-rw-r--r--storage/innobase/include/dict0stats.inl16
-rw-r--r--storage/innobase/include/dict0stats_bg.h58
-rw-r--r--storage/innobase/include/dict0types.h33
-rw-r--r--storage/innobase/include/dyn0buf.h56
-rw-r--r--storage/innobase/include/fil0crypt.h115
-rw-r--r--storage/innobase/include/fil0fil.h541
-rw-r--r--storage/innobase/include/fsp0file.h134
-rw-r--r--storage/innobase/include/fsp0fsp.h158
-rw-r--r--storage/innobase/include/fsp0space.h11
-rw-r--r--storage/innobase/include/fsp0types.h4
-rw-r--r--storage/innobase/include/fts0fts.h88
-rw-r--r--storage/innobase/include/fts0priv.h23
-rw-r--r--storage/innobase/include/fts0types.h56
-rw-r--r--storage/innobase/include/fut0lst.h65
-rw-r--r--storage/innobase/include/gis0rtree.h108
-rw-r--r--storage/innobase/include/gis0rtree.inl9
-rw-r--r--storage/innobase/include/gis0type.h14
-rw-r--r--storage/innobase/include/ha_prototypes.h95
-rw-r--r--storage/innobase/include/hash0hash.h105
-rw-r--r--storage/innobase/include/ibuf0ibuf.h16
-rw-r--r--storage/innobase/include/ibuf0ibuf.inl3
-rw-r--r--storage/innobase/include/lock0lock.h1081
-rw-r--r--storage/innobase/include/lock0lock.inl35
-rw-r--r--storage/innobase/include/lock0prdt.h16
-rw-r--r--storage/innobase/include/lock0priv.h203
-rw-r--r--storage/innobase/include/lock0priv.inl104
-rw-r--r--storage/innobase/include/lock0types.h79
-rw-r--r--storage/innobase/include/log0crypt.h16
-rw-r--r--storage/innobase/include/log0log.h20
-rw-r--r--storage/innobase/include/log0log.inl18
-rw-r--r--storage/innobase/include/log0recv.h78
-rw-r--r--storage/innobase/include/mach0data.inl1
-rw-r--r--storage/innobase/include/mem0mem.inl2
-rw-r--r--storage/innobase/include/mtr0log.h83
-rw-r--r--storage/innobase/include/mtr0mtr.h454
-rw-r--r--storage/innobase/include/mtr0types.h43
-rw-r--r--storage/innobase/include/os0file.h149
-rw-r--r--storage/innobase/include/os0file.inl46
-rw-r--r--storage/innobase/include/page0cur.h104
-rw-r--r--storage/innobase/include/page0cur.inl139
-rw-r--r--storage/innobase/include/page0page.h198
-rw-r--r--storage/innobase/include/page0page.inl206
-rw-r--r--storage/innobase/include/page0types.h47
-rw-r--r--storage/innobase/include/page0zip.h29
-rw-r--r--storage/innobase/include/page0zip.inl22
-rw-r--r--storage/innobase/include/pars0grm.h174
-rw-r--r--storage/innobase/include/pars0pars.h25
-rw-r--r--storage/innobase/include/que0que.h125
-rw-r--r--storage/innobase/include/que0que.inl50
-rw-r--r--storage/innobase/include/read0types.h70
-rw-r--r--storage/innobase/include/rem0rec.h31
-rw-r--r--storage/innobase/include/rem0rec.inl72
-rw-r--r--storage/innobase/include/row0ftsort.h7
-rw-r--r--storage/innobase/include/row0ins.h8
-rw-r--r--storage/innobase/include/row0log.h81
-rw-r--r--storage/innobase/include/row0merge.h67
-rw-r--r--storage/innobase/include/row0mysql.h209
-rw-r--r--storage/innobase/include/row0purge.h34
-rw-r--r--storage/innobase/include/row0row.h13
-rw-r--r--storage/innobase/include/row0sel.h87
-rw-r--r--storage/innobase/include/row0upd.h10
-rw-r--r--storage/innobase/include/row0vers.h10
-rw-r--r--storage/innobase/include/rw_lock.h50
-rw-r--r--storage/innobase/include/small_vector.h100
-rw-r--r--storage/innobase/include/srv0mon.h26
-rw-r--r--storage/innobase/include/srv0srv.h175
-rw-r--r--storage/innobase/include/srv0start.h2
-rw-r--r--storage/innobase/include/srw_lock.h554
-rw-r--r--storage/innobase/include/sux_lock.h472
-rw-r--r--storage/innobase/include/transactional_lock_guard.h174
-rw-r--r--storage/innobase/include/trx0i_s.h5
-rw-r--r--storage/innobase/include/trx0purge.h143
-rw-r--r--storage/innobase/include/trx0rec.h167
-rw-r--r--storage/innobase/include/trx0roll.h27
-rw-r--r--storage/innobase/include/trx0rseg.h285
-rw-r--r--storage/innobase/include/trx0sys.h224
-rw-r--r--storage/innobase/include/trx0trx.h703
-rw-r--r--storage/innobase/include/trx0trx.inl120
-rw-r--r--storage/innobase/include/trx0types.h44
-rw-r--r--storage/innobase/include/trx0undo.h166
-rw-r--r--storage/innobase/include/trx0undo.inl39
-rw-r--r--storage/innobase/include/univ.i144
-rw-r--r--storage/innobase/include/ut0counter.h64
-rw-r--r--storage/innobase/include/ut0new.h20
-rw-r--r--storage/innobase/include/ut0pool.h21
-rw-r--r--storage/innobase/include/ut0rnd.h11
-rw-r--r--storage/innobase/include/ut0rnd.inl24
-rw-r--r--storage/innobase/include/ut0ut.h21
-rw-r--r--storage/innobase/include/ut0wqueue.h17
-rw-r--r--storage/innobase/lock/lock0iter.cc63
-rw-r--r--storage/innobase/lock/lock0lock.cc7021
-rw-r--r--storage/innobase/lock/lock0prdt.cc491
-rw-r--r--storage/innobase/log/log0crypt.cc8
-rw-r--r--storage/innobase/log/log0log.cc113
-rw-r--r--storage/innobase/log/log0recv.cc1253
-rw-r--r--storage/innobase/log/log0sync.cc115
-rw-r--r--storage/innobase/log/log0sync.h28
-rw-r--r--storage/innobase/mem/mem0mem.cc4
-rw-r--r--storage/innobase/mtr/mtr0mtr.cc1336
-rw-r--r--storage/innobase/os/os0file.cc703
-rw-r--r--storage/innobase/page/page0cur.cc864
-rw-r--r--storage/innobase/page/page0page.cc733
-rw-r--r--storage/innobase/page/page0zip.cc199
-rwxr-xr-xstorage/innobase/pars/make_bison.sh1
-rw-r--r--storage/innobase/pars/pars0grm.cc2494
-rw-r--r--storage/innobase/pars/pars0grm.y17
-rw-r--r--storage/innobase/pars/pars0opt.cc61
-rw-r--r--storage/innobase/pars/pars0pars.cc56
-rw-r--r--storage/innobase/pars/pars0sym.cc5
-rw-r--r--storage/innobase/que/que0que.cc469
-rw-r--r--storage/innobase/read/read0read.cc21
-rw-r--r--storage/innobase/rem/rem0cmp.cc6
-rw-r--r--storage/innobase/rem/rem0rec.cc2
-rw-r--r--storage/innobase/row/row0ftsort.cc29
-rw-r--r--storage/innobase/row/row0import.cc616
-rw-r--r--storage/innobase/row/row0ins.cc704
-rw-r--r--storage/innobase/row/row0log.cc1022
-rw-r--r--storage/innobase/row/row0merge.cc651
-rw-r--r--storage/innobase/row/row0mysql.cc2442
-rw-r--r--storage/innobase/row/row0purge.cc494
-rw-r--r--storage/innobase/row/row0quiesce.cc17
-rw-r--r--storage/innobase/row/row0row.cc81
-rw-r--r--storage/innobase/row/row0sel.cc1527
-rw-r--r--storage/innobase/row/row0uins.cc323
-rw-r--r--storage/innobase/row/row0umod.cc565
-rw-r--r--storage/innobase/row/row0undo.cc70
-rw-r--r--storage/innobase/row/row0upd.cc433
-rw-r--r--storage/innobase/row/row0vers.cc70
-rw-r--r--storage/innobase/srv/srv0mon.cc238
-rw-r--r--storage/innobase/srv/srv0srv.cc882
-rw-r--r--storage/innobase/srv/srv0start.cc295
-rw-r--r--storage/innobase/sync/srw_lock.cc545
-rw-r--r--storage/innobase/trx/trx0i_s.cc161
-rw-r--r--storage/innobase/trx/trx0purge.cc666
-rw-r--r--storage/innobase/trx/trx0rec.cc534
-rw-r--r--storage/innobase/trx/trx0roll.cc177
-rw-r--r--storage/innobase/trx/trx0rseg.cc606
-rw-r--r--storage/innobase/trx/trx0sys.cc262
-rw-r--r--storage/innobase/trx/trx0trx.cc915
-rw-r--r--storage/innobase/trx/trx0undo.cc825
-rw-r--r--storage/innobase/unittest/CMakeLists.txt16
-rw-r--r--storage/innobase/unittest/innodb_sync-t.cc185
-rw-r--r--storage/innobase/ut/ut0mem.cc3
-rw-r--r--storage/innobase/ut/ut0new.cc2
-rw-r--r--storage/innobase/ut/ut0rbt.cc2
-rw-r--r--storage/innobase/ut/ut0ut.cc60
-rw-r--r--storage/innobase/ut/ut0wqueue.cc19
-rw-r--r--tpool/CMakeLists.txt53
-rw-r--r--tpool/aio_liburing.cc209
-rw-r--r--tpool/aio_linux.cc9
-rw-r--r--tpool/task.cc18
-rw-r--r--tpool/task_group.cc1
-rw-r--r--tpool/tpool.h33
-rw-r--r--tpool/tpool_generic.cc43
-rw-r--r--tpool/tpool_structs.h67
-rw-r--r--tpool/tpool_win.cc3
255 files changed, 43381 insertions, 49938 deletions
diff --git a/extra/CMakeLists.txt b/extra/CMakeLists.txt
index f53e9fdf8d8..5021128ed35 100644
--- a/extra/CMakeLists.txt
+++ b/extra/CMakeLists.txt
@@ -87,7 +87,7 @@ IF(WITH_INNOBASE_STORAGE_ENGINE)
ADD_DEPENDENCIES(innochecksum GenError)
ENDIF()
-MYSQL_ADD_EXECUTABLE(replace replace.c COMPONENT Server)
+MYSQL_ADD_EXECUTABLE(replace replace.c COMPONENT Client)
TARGET_LINK_LIBRARIES(replace mysys)
IF(UNIX)
diff --git a/extra/innochecksum.cc b/extra/innochecksum.cc
index 1a6cbb3bfa1..2081fe28e82 100644
--- a/extra/innochecksum.cc
+++ b/extra/innochecksum.cc
@@ -32,7 +32,7 @@
#include <time.h>
#include <sys/types.h>
#include <sys/stat.h>
-#ifndef __WIN__
+#ifndef _WIN32
# include <unistd.h>
#endif
#include <my_getopt.h>
@@ -74,7 +74,6 @@ static bool do_one_page;
static my_bool do_leaf;
static my_bool per_page_details;
static ulint n_merge;
-extern ulong srv_checksum_algorithm;
static ulint physical_page_size; /* Page size in bytes on disk. */
ulong srv_page_size;
ulong srv_page_size_shift;
@@ -84,8 +83,6 @@ uint32_t cur_page_num;
uint32_t cur_space;
/* Skip the checksum verification. */
static bool no_check;
-/* Enabled for strict checksum verification. */
-bool strict_verify = 0;
/* Enabled for rewrite checksum. */
static bool do_write;
/* Mismatches count allowed (0 by default). */
@@ -111,11 +108,6 @@ const byte *field_ref_zero = field_ref_zero_buf;
struct flock lk;
#endif /* _WIN32 */
-/* Strict check algorithm name. */
-static ulong strict_check;
-/* Rewrite checksum algorithm name. */
-static ulong write_check;
-
/* Innodb page type. */
struct innodb_page_type {
int n_undo_state_active;
@@ -142,24 +134,6 @@ struct innodb_page_type {
int n_fil_page_type_page_compressed_encrypted;
} page_type;
-/* Possible values for "--strict-check" for strictly verify checksum
-and "--write" for rewrite checksum. */
-static const char *innochecksum_algorithms[] = {
- "crc32",
- "crc32",
- "innodb",
- "innodb",
- "none",
- "none",
- NullS
-};
-
-/* Used to define an enumerate type of the "innochecksum algorithm". */
-static TYPELIB innochecksum_algorithms_typelib = {
- array_elements(innochecksum_algorithms)-1,"",
- innochecksum_algorithms, NULL
-};
-
#define SIZE_RANGES_FOR_PAGE 10
#define NUM_RETRIES 3
#define DEFAULT_RETRY_DELAY 1000000
@@ -653,10 +627,9 @@ static bool update_checksum(byte* page, ulint flags)
}
if (iscompressed) {
- /* page is compressed */
- checksum = page_zip_calc_checksum(
- page, physical_page_size,
- static_cast<srv_checksum_algorithm_t>(write_check));
+ /* ROW_FORMAT=COMPRESSED */
+ checksum = page_zip_calc_checksum(page, physical_page_size,
+ false);
mach_write_to_4(page + FIL_PAGE_SPACE_OR_CHKSUM, checksum);
if (is_log_enabled) {
@@ -680,50 +653,17 @@ static bool update_checksum(byte* page, ulint flags)
/* page is uncompressed. */
/* Store the new formula checksum */
- switch ((srv_checksum_algorithm_t) write_check) {
-
- case SRV_CHECKSUM_ALGORITHM_FULL_CRC32:
- case SRV_CHECKSUM_ALGORITHM_STRICT_FULL_CRC32:
- case SRV_CHECKSUM_ALGORITHM_CRC32:
- case SRV_CHECKSUM_ALGORITHM_STRICT_CRC32:
- checksum = buf_calc_page_crc32(page);
- break;
-
- case SRV_CHECKSUM_ALGORITHM_INNODB:
- case SRV_CHECKSUM_ALGORITHM_STRICT_INNODB:
- checksum = (ib_uint32_t)
- buf_calc_page_new_checksum(page);
- break;
-
- case SRV_CHECKSUM_ALGORITHM_NONE:
- case SRV_CHECKSUM_ALGORITHM_STRICT_NONE:
- checksum = BUF_NO_CHECKSUM_MAGIC;
- break;
-
- /* no default so the compiler will emit a warning if new
- enum is added and not handled here */
- }
+ checksum = buf_calc_page_crc32(page);
mach_write_to_4(page + FIL_PAGE_SPACE_OR_CHKSUM, checksum);
if (is_log_enabled) {
- fprintf(log_file, "page::" UINT32PF "; Updated checksum field1"
- " = " UINT32PF "\n", cur_page_num, checksum);
- }
-
- if (write_check == SRV_CHECKSUM_ALGORITHM_STRICT_INNODB
- || write_check == SRV_CHECKSUM_ALGORITHM_INNODB) {
- checksum = (ib_uint32_t)
- buf_calc_page_old_checksum(page);
+ fprintf(log_file, "page::" UINT32PF
+ "; Updated checksum = " UINT32PF "\n",
+ cur_page_num, checksum);
}
mach_write_to_4(page + physical_page_size -
FIL_PAGE_END_LSN_OLD_CHKSUM,checksum);
-
- if (is_log_enabled) {
- fprintf(log_file, "page::" UINT32PF "; Updated checksum "
- "field2 = " UINT32PF "\n", cur_page_num, checksum);
- }
-
}
func_exit:
@@ -1245,17 +1185,13 @@ static struct my_option innochecksum_options[] = {
{"page", 'p', "Check only this page (0 based).",
&do_page, &do_page, 0, GET_UINT, REQUIRED_ARG,
0, 0, FIL_NULL, 0, 1, 0},
- {"strict-check", 'C', "Specify the strict checksum algorithm by the user.",
- &strict_check, &strict_check, &innochecksum_algorithms_typelib,
- GET_ENUM, REQUIRED_ARG, 0, 0, 0, 0, 0, 0},
{"no-check", 'n', "Ignore the checksum verification.",
&no_check, &no_check, 0, GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0},
{"allow-mismatches", 'a', "Maximum checksum mismatch allowed.",
&allow_mismatches, &allow_mismatches, 0,
GET_ULL, REQUIRED_ARG, 0, 0, ULLONG_MAX, 0, 1, 0},
- {"write", 'w', "Rewrite the checksum algorithm by the user.",
- &write_check, &write_check, &innochecksum_algorithms_typelib,
- GET_ENUM, REQUIRED_ARG, 0, 0, 0, 0, 0, 0},
+ {"write", 'w', "Rewrite the checksum.",
+ &do_write, &do_write, 0, GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0},
{"page-type-summary", 'S', "Display a count of each page type "
"in a tablespace.", &page_type_summary, &page_type_summary, 0,
GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0},
@@ -1296,7 +1232,7 @@ static void usage(void)
printf("InnoDB offline file checksum utility.\n");
printf("Usage: %s [-c] [-s <start page>] [-e <end page>] "
"[-p <page>] [-i] [-v] [-a <allow mismatches>] [-n] "
- "[-C <strict-check>] [-w <write>] [-S] [-D <page type dump>] "
+ "[-S] [-D <page type dump>] "
"[-l <log>] [-l] [-m <merge pages>] <filename or [-]>\n", my_progname);
printf("See https://mariadb.com/kb/en/library/innochecksum/"
" for usage hints.\n");
@@ -1333,38 +1269,6 @@ innochecksum_get_one_option(
my_end(0);
exit(EXIT_SUCCESS);
break;
- case 'C':
- strict_verify = true;
- switch ((srv_checksum_algorithm_t) strict_check) {
-
- case SRV_CHECKSUM_ALGORITHM_STRICT_CRC32:
- case SRV_CHECKSUM_ALGORITHM_CRC32:
- srv_checksum_algorithm =
- SRV_CHECKSUM_ALGORITHM_STRICT_CRC32;
- break;
-
- case SRV_CHECKSUM_ALGORITHM_STRICT_INNODB:
- case SRV_CHECKSUM_ALGORITHM_INNODB:
- srv_checksum_algorithm =
- SRV_CHECKSUM_ALGORITHM_STRICT_INNODB;
- break;
-
- case SRV_CHECKSUM_ALGORITHM_STRICT_NONE:
- case SRV_CHECKSUM_ALGORITHM_NONE:
- srv_checksum_algorithm =
- SRV_CHECKSUM_ALGORITHM_STRICT_NONE;
- break;
-
- case SRV_CHECKSUM_ALGORITHM_STRICT_FULL_CRC32:
- case SRV_CHECKSUM_ALGORITHM_FULL_CRC32:
- srv_checksum_algorithm =
- SRV_CHECKSUM_ALGORITHM_STRICT_FULL_CRC32;
- break;
-
- default:
- return(true);
- }
- break;
case 'n':
no_check = true;
break;
@@ -1567,13 +1471,6 @@ int main(
goto my_exit;
}
- if (strict_verify && no_check) {
- fprintf(stderr, "Error: --strict-check option cannot be used "
- "together with --no-check option.\n");
- exit_status = 1;
- goto my_exit;
- }
-
if (no_check && !do_write) {
fprintf(stderr, "Error: --no-check must be associated with "
"--write option.\n");
diff --git a/extra/mariabackup/CMakeLists.txt b/extra/mariabackup/CMakeLists.txt
index 0ebfba54534..a7a35c58ac3 100644
--- a/extra/mariabackup/CMakeLists.txt
+++ b/extra/mariabackup/CMakeLists.txt
@@ -46,12 +46,6 @@ ADD_DEFINITIONS(-UMYSQL_SERVER)
# xtrabackup binary
########################################################################
-IF(WIN32)
- SET(NT_SERVICE_SOURCE ${PROJECT_SOURCE_DIR}/sql/nt_servc.cc)
-ELSE()
- SET(NT_SERVICE_SOURCE)
-ENDIF()
-
ADD_DEFINITIONS(-DPCRE_STATIC=1)
ADD_DEFINITIONS(${SSL_DEFINES})
MYSQL_ADD_EXECUTABLE(mariadb-backup
@@ -76,7 +70,6 @@ MYSQL_ADD_EXECUTABLE(mariadb-backup
encryption_plugin.cc
${PROJECT_BINARY_DIR}/sql/sql_builtin.cc
${PROJECT_SOURCE_DIR}/sql/net_serv.cc
- ${NT_SERVICE_SOURCE}
${PROJECT_SOURCE_DIR}/libmysqld/libmysql.c
COMPONENT backup
)
diff --git a/extra/mariabackup/backup_copy.cc b/extra/mariabackup/backup_copy.cc
index 27c4ba29c91..9845d39d493 100644
--- a/extra/mariabackup/backup_copy.cc
+++ b/extra/mariabackup/backup_copy.cc
@@ -55,12 +55,18 @@ Street, Fifth Floor, Boston, MA 02110-1335 USA
#include "xtrabackup.h"
#include "common.h"
#include "backup_copy.h"
+#include "backup_debug.h"
#include "backup_mysql.h"
#include <btr0btr.h>
#ifdef _WIN32
#include <direct.h> /* rmdir */
#endif
+#ifdef _WIN32
+#include <aclapi.h>
+#endif
+
+
#define ROCKSDB_BACKUP_DIR "#rocksdb"
/* list of files to sync for --rsync mode */
@@ -127,7 +133,6 @@ struct datadir_thread_ctxt_t {
uint n_thread;
uint *count;
pthread_mutex_t* count_mutex;
- os_thread_id_t id;
bool ret;
};
@@ -272,7 +277,6 @@ datadir_iter_next_database(datadir_iter_t *it)
}
snprintf(it->dbpath, it->dbpath_len, "%s/%s",
it->datadir_path, it->dbinfo.name);
- os_normalize_path(it->dbpath);
if (it->dbinfo.type == OS_FILE_TYPE_FILE) {
it->is_file = true;
@@ -560,8 +564,8 @@ datafile_read(datafile_cur_t *cursor)
}
if (os_file_read(IORequestRead,
- cursor->file, cursor->buf, cursor->buf_offset,
- to_read) != DB_SUCCESS) {
+ cursor->file, cursor->buf, cursor->buf_offset,
+ to_read, nullptr) != DB_SUCCESS) {
return(XB_FIL_CUR_ERROR);
}
@@ -944,7 +948,7 @@ backup_file_printf(const char *filename, const char *fmt, ...)
static
bool
-run_data_threads(datadir_iter_t *it, os_thread_func_t func, uint n)
+run_data_threads(datadir_iter_t *it, void (*func)(datadir_thread_ctxt_t *ctxt), uint n)
{
datadir_thread_ctxt_t *data_threads;
uint i, count;
@@ -962,12 +966,12 @@ run_data_threads(datadir_iter_t *it, os_thread_func_t func, uint n)
data_threads[i].n_thread = i + 1;
data_threads[i].count = &count;
data_threads[i].count_mutex = &count_mutex;
- data_threads[i].id = os_thread_create(func, data_threads + i);
+ std::thread(func, data_threads + i).detach();
}
/* Wait for threads to exit */
while (1) {
- os_thread_sleep(100000);
+ std::this_thread::sleep_for(std::chrono::milliseconds(100));
pthread_mutex_lock(&count_mutex);
if (count == 0) {
pthread_mutex_unlock(&count_mutex);
@@ -991,64 +995,6 @@ run_data_threads(datadir_iter_t *it, os_thread_func_t func, uint n)
return(ret);
}
-#ifdef _WIN32
-#include <windows.h>
-#include <accctrl.h>
-#include <aclapi.h>
-/*
- On Windows, fix permission of the file after "copyback"
- We assume that after copyback, mysqld will run as service as NetworkService
- user, thus well give full permission on given file to that user.
-*/
-
-static int fix_win_file_permissions(const char *file)
-{
- struct {
- TOKEN_USER tokenUser;
- BYTE buffer[SECURITY_MAX_SID_SIZE];
- } tokenInfoBuffer;
- HANDLE hFile = CreateFile(file, READ_CONTROL | WRITE_DAC, 0, NULL, OPEN_EXISTING,
- FILE_FLAG_BACKUP_SEMANTICS, NULL);
- if (hFile == INVALID_HANDLE_VALUE)
- return -1;
- ACL* pOldDACL;
- SECURITY_DESCRIPTOR* pSD = NULL;
- EXPLICIT_ACCESS ea = { 0 };
- PSID pSid = NULL;
-
- GetSecurityInfo(hFile, SE_FILE_OBJECT, DACL_SECURITY_INFORMATION, NULL, NULL,
- &pOldDACL, NULL, (void**)&pSD);
- DWORD size = SECURITY_MAX_SID_SIZE;
- pSid = (PSID)tokenInfoBuffer.buffer;
- if (!CreateWellKnownSid(WinNetworkServiceSid, NULL, pSid,
- &size))
- {
- return 1;
- }
- ea.Trustee.TrusteeForm = TRUSTEE_IS_SID;
- ea.Trustee.ptstrName = (LPTSTR)pSid;
-
- ea.grfAccessMode = GRANT_ACCESS;
- ea.grfAccessPermissions = GENERIC_ALL;
- ea.grfInheritance = CONTAINER_INHERIT_ACE | OBJECT_INHERIT_ACE;
- ea.Trustee.TrusteeType = TRUSTEE_IS_UNKNOWN;
- ACL* pNewDACL = 0;
- DWORD err = SetEntriesInAcl(1, &ea, pOldDACL, &pNewDACL);
- if (!err)
- {
- DBUG_ASSERT(pNewDACL);
- SetSecurityInfo(hFile, SE_FILE_OBJECT, DACL_SECURITY_INFORMATION, NULL, NULL,
- pNewDACL, NULL);
- LocalFree((HLOCAL)pNewDACL);
- }
- if (pSD != NULL)
- LocalFree((HLOCAL)pSD);
- CloseHandle(hFile);
- return 0;
-}
-
-#endif
-
/************************************************************************
Copy file for backup/restore.
@@ -1099,10 +1045,6 @@ copy_file(ds_ctxt_t *datasink,
/* close */
msg(thread_n," ...done");
datafile_close(&cursor);
-#ifdef _WIN32
- if (xtrabackup_copy_back || xtrabackup_move_back)
- ut_a(!fix_win_file_permissions(dstfile->path));
-#endif
if (ds_close(dstfile)) {
goto error_close;
}
@@ -1175,10 +1117,6 @@ move_file(ds_ctxt_t *datasink,
errbuf);
return(false);
}
-#ifdef _WIN32
- if (xtrabackup_copy_back || xtrabackup_move_back)
- ut_a(!fix_win_file_permissions(dst_file_path_abs));
-#endif
msg(thread_n," ...done");
return(true);
@@ -1201,13 +1139,12 @@ read_link_file(const char *ibd_filepath, const char *link_filepath)
os_file_read_string(file, filepath, OS_FILE_MAX_PATH);
fclose(file);
- if (strlen(filepath)) {
+ if (size_t len = strlen(filepath)) {
/* Trim whitespace from end of filepath */
- ulint lastch = strlen(filepath) - 1;
+ ulint lastch = len - 1;
while (lastch > 4 && filepath[lastch] <= 0x20) {
filepath[lastch--] = 0x00;
}
- os_normalize_path(filepath);
}
tablespace_locations[ibd_filepath] = filepath;
@@ -1511,6 +1448,13 @@ bool backup_start(CorruptedPages &corrupted_pages)
msg("Waiting for log copy thread to read lsn %llu", (ulonglong)server_lsn_after_lock);
backup_wait_for_lsn(server_lsn_after_lock);
+ DBUG_EXECUTE_FOR_KEY("sleep_after_waiting_for_lsn", {},
+ {
+ ulong milliseconds = strtoul(dbug_val, NULL, 10);
+ msg("sleep_after_waiting_for_lsn");
+ my_sleep(milliseconds*1000UL);
+ });
+
backup_fix_ddl(corrupted_pages);
// There is no need to stop slave thread before coping non-Innodb data when
@@ -1850,6 +1794,19 @@ copy_back()
return(false);
}
}
+
+#ifdef _WIN32
+ /* Initialize security descriptor for the new directories
+ to be the same as for datadir */
+ DWORD res = GetNamedSecurityInfoA(mysql_data_home,
+ SE_FILE_OBJECT, DACL_SECURITY_INFORMATION,
+ NULL, NULL, NULL, NULL,
+ &my_dir_security_attributes.lpSecurityDescriptor);
+ if (res != ERROR_SUCCESS) {
+ msg("Unable to read security descriptor of %s",mysql_data_home);
+ }
+#endif
+
if (srv_undo_dir && *srv_undo_dir
&& !directory_exists(srv_undo_dir, true)) {
return(false);
@@ -1884,7 +1841,6 @@ copy_back()
}
srv_max_n_threads = 1000;
- sync_check_init();
/* copy undo tablespaces */
@@ -1947,9 +1903,8 @@ copy_back()
end(srv_sys_space.end());
iter != end;
++iter) {
- const char *filename = base_name(iter->name());
-
- if (!(ret = copy_or_move_file(filename, iter->name(),
+ const char *filepath = iter->filepath();
+ if (!(ret = copy_or_move_file(base_name(filepath), filepath,
dst_dir, 1))) {
goto cleanup;
}
@@ -1972,7 +1927,6 @@ copy_back()
const char *filename;
char c_tmp;
int i_tmp;
- bool is_ibdata_file;
if (strstr(node.filepath,"/" ROCKSDB_BACKUP_DIR "/")
#ifdef _WIN32
@@ -2032,23 +1986,19 @@ copy_back()
}
/* skip innodb data files */
- is_ibdata_file = false;
for (Tablespace::const_iterator iter(srv_sys_space.begin()),
end(srv_sys_space.end()); iter != end; ++iter) {
- const char *ibfile = base_name(iter->name());
- if (strcmp(ibfile, filename) == 0) {
- is_ibdata_file = true;
- break;
+ if (!strcmp(base_name(iter->filepath()), filename)) {
+ goto next_file;
}
}
- if (is_ibdata_file) {
- continue;
- }
if (!(ret = copy_or_move_file(node.filepath, node.filepath_rel,
mysql_data_home, 1))) {
goto cleanup;
}
+ next_file:
+ continue;
}
/* copy buffer pool dump */
@@ -2075,7 +2025,6 @@ cleanup:
ds_data = NULL;
- sync_check_close();
return(ret);
}
@@ -2123,13 +2072,10 @@ decrypt_decompress_file(const char *filepath, uint thread_n)
return(true);
}
-static
-os_thread_ret_t STDCALL
-decrypt_decompress_thread_func(void *arg)
+static void decrypt_decompress_thread_func(datadir_thread_ctxt_t *ctxt)
{
bool ret = true;
datadir_node_t node;
- datadir_thread_ctxt_t *ctxt = (datadir_thread_ctxt_t *)(arg);
datadir_node_init(&node);
@@ -2159,9 +2105,6 @@ cleanup:
pthread_mutex_unlock(ctxt->count_mutex);
ctxt->ret = ret;
-
- os_thread_exit();
- OS_THREAD_DUMMY_RETURN;
}
bool
@@ -2171,7 +2114,6 @@ decrypt_decompress()
datadir_iter_t *it = NULL;
srv_max_n_threads = 1000;
- sync_check_init();
/* cd to backup directory */
if (my_setwd(xtrabackup_target_dir, MYF(MY_WME)))
@@ -2200,8 +2142,6 @@ decrypt_decompress()
ds_data = NULL;
- sync_check_close();
-
return(ret);
}
@@ -2222,7 +2162,14 @@ static bool backup_files_from_datadir(const char *dir_path)
if (info.type != OS_FILE_TYPE_FILE)
continue;
- const char *pname = strrchr(info.name, OS_PATH_SEPARATOR);
+ const char *pname = strrchr(info.name, '/');
+#ifdef _WIN32
+ if (const char *last = strrchr(info.name, '\\')) {
+ if (!pname || last >pname) {
+ pname = last;
+ }
+ }
+#endif
if (!pname)
pname = info.name;
@@ -2239,7 +2186,7 @@ static bool backup_files_from_datadir(const char *dir_path)
unlink(info.name);
std::string full_path(dir_path);
- full_path.append(1, OS_PATH_SEPARATOR).append(info.name);
+ full_path.append(1, '/').append(info.name);
if (!(ret = copy_file(ds_data, full_path.c_str() , info.name, 1)))
break;
}
diff --git a/extra/mariabackup/backup_debug.h b/extra/mariabackup/backup_debug.h
index cefbc287361..777b4f4adeb 100644
--- a/extra/mariabackup/backup_debug.h
+++ b/extra/mariabackup/backup_debug.h
@@ -1,7 +1,7 @@
#pragma once
#include "my_dbug.h"
#ifndef DBUG_OFF
-extern char *dbug_mariabackup_get_val(const char *event, const char *key);
+char *dbug_mariabackup_get_val(const char *event, fil_space_t::name_type key);
/*
In debug mode, execute SQL statement that was passed via environment.
To use this facility, you need to
@@ -14,19 +14,11 @@ To use this facility, you need to
for the variable)
3. start mariabackup with --dbug=+d,debug_mariabackup_events
*/
-extern void dbug_mariabackup_event(
- const char *event,const char *key);
-#define DBUG_MARIABACKUP_EVENT(A, B) \
- DBUG_EXECUTE_IF("mariabackup_events", \
- dbug_mariabackup_event(A,B););
-#define DBUG_EXECUTE_FOR_KEY(EVENT, KEY, CODE) \
- DBUG_EXECUTE_IF("mariabackup_inject_code", {\
- char *dbug_val = dbug_mariabackup_get_val(EVENT, KEY); \
- if (dbug_val && *dbug_val) CODE \
- })
+#define DBUG_EXECUTE_FOR_KEY(EVENT, KEY, CODE) \
+ DBUG_EXECUTE_IF("mariabackup_inject_code", \
+ { char *dbug_val= dbug_mariabackup_get_val(EVENT, KEY); \
+ if (dbug_val) CODE })
#else
-#define DBUG_MARIABACKUP_EVENT(A,B)
-#define DBUG_MARIABACKUP_EVENT_LOCK(A,B)
#define DBUG_EXECUTE_FOR_KEY(EVENT, KEY, CODE)
#endif
diff --git a/extra/mariabackup/backup_mysql.cc b/extra/mariabackup/backup_mysql.cc
index ef79c8d561e..9ddbb6387b9 100644
--- a/extra/mariabackup/backup_mysql.cc
+++ b/extra/mariabackup/backup_mysql.cc
@@ -76,9 +76,10 @@ bool have_multi_threaded_slave = false;
bool have_gtid_slave = false;
/* Kill long selects */
-os_event_t kill_query_thread_started;
-os_event_t kill_query_thread_stopped;
-os_event_t kill_query_thread_stop;
+static mysql_mutex_t kill_query_thread_mutex;
+static bool kill_query_thread_running, kill_query_thread_stopping;
+static mysql_cond_t kill_query_thread_stopped;
+static mysql_cond_t kill_query_thread_stop;
bool sql_thread_started = false;
char *mysql_slave_position = NULL;
@@ -103,7 +104,7 @@ xb_mysql_connect()
sprintf(mysql_port_str, "%d", opt_port);
if (connection == NULL) {
- msg("Failed to init MySQL struct: %s.",
+ msg("Failed to init MariaDB struct: %s.",
mysql_error(connection));
return(NULL);
}
@@ -126,7 +127,7 @@ xb_mysql_connect()
mysql_options(connection, MYSQL_OPT_PROTOCOL, &opt_protocol);
mysql_options(connection,MYSQL_SET_CHARSET_NAME, "utf8");
- msg("Connecting to server host: %s, user: %s, password: %s, "
+ msg("Connecting to MariaDB server host: %s, user: %s, password: %s, "
"port: %s, socket: %s", opt_host ? opt_host : "localhost",
opt_user ? opt_user : "not set",
opt_password ? "set" : "not set",
@@ -153,7 +154,7 @@ xb_mysql_connect()
opt_password,
"" /*database*/, opt_port,
opt_socket, 0)) {
- msg("Failed to connect to server: %s.", mysql_error(connection));
+ msg("Failed to connect to MariaDB server: %s.", mysql_error(connection));
mysql_close(connection);
return(NULL);
}
@@ -471,7 +472,7 @@ bool get_mysql_vars(MYSQL *connection)
}
if (!directory_exists(datadir_var, false))
{
- msg("Warning: MySQL variable 'datadir' points to "
+ msg("Warning: MariaDB variable 'datadir' points to "
"nonexistent directory '%s'",
datadir_var);
}
@@ -808,7 +809,7 @@ wait_for_no_updates(MYSQL *connection, uint timeout, uint threshold)
if (!have_queries_to_wait_for(connection, threshold)) {
return(true);
}
- os_thread_sleep(1000000);
+ std::this_thread::sleep_for(std::chrono::seconds(1));
}
msg("Unable to obtain lock. Please try again later.");
@@ -816,74 +817,70 @@ wait_for_no_updates(MYSQL *connection, uint timeout, uint threshold)
return(false);
}
-static
-os_thread_ret_t
-DECLARE_THREAD(kill_query_thread)(
-/*===============*/
- void *arg __attribute__((unused)))
+static void kill_query_thread()
{
- MYSQL *mysql;
- time_t start_time;
-
- start_time = time(NULL);
+ mysql_mutex_lock(&kill_query_thread_mutex);
- os_event_set(kill_query_thread_started);
+ msg("Kill query timeout %d seconds.", opt_kill_long_queries_timeout);
- msg("Kill query timeout %d seconds.",
- opt_kill_long_queries_timeout);
-
- while (time(NULL) - start_time <
- (time_t)opt_kill_long_queries_timeout) {
- if (os_event_wait_time(kill_query_thread_stop, 1000) !=
- OS_SYNC_TIME_EXCEEDED) {
- goto stop_thread;
- }
- }
-
- if ((mysql = xb_mysql_connect()) == NULL) {
- msg("Error: kill query thread failed");
- goto stop_thread;
- }
-
- while (true) {
- kill_long_queries(mysql, time(NULL) - start_time);
- if (os_event_wait_time(kill_query_thread_stop, 1000) !=
- OS_SYNC_TIME_EXCEEDED) {
- break;
- }
- }
+ time_t start_time= time(nullptr);
+ timespec abstime;
+ set_timespec(abstime, opt_kill_long_queries_timeout);
- mysql_close(mysql);
+ while (!kill_query_thread_stopping)
+ if (!mysql_cond_timedwait(&kill_query_thread_stop,
+ &kill_query_thread_mutex, &abstime))
+ goto func_exit;
-stop_thread:
- msg("Kill query thread stopped");
+ if (MYSQL *mysql= xb_mysql_connect())
+ {
+ do
+ {
+ kill_long_queries(mysql, time(nullptr) - start_time);
+ set_timespec(abstime, 1);
+ }
+ while (mysql_cond_timedwait(&kill_query_thread_stop,
+ &kill_query_thread_mutex, &abstime) &&
+ !kill_query_thread_stopping);
+ mysql_close(mysql);
+ }
+ else
+ msg("Error: kill query thread failed");
- os_event_set(kill_query_thread_stopped);
+func_exit:
+ msg("Kill query thread stopped");
- os_thread_exit();
- OS_THREAD_DUMMY_RETURN;
+ kill_query_thread_running= false;
+ mysql_cond_signal(&kill_query_thread_stopped);
+ mysql_mutex_unlock(&kill_query_thread_mutex);
}
-static
-void
-start_query_killer()
+static void start_query_killer()
{
- kill_query_thread_stop = os_event_create(0);
- kill_query_thread_started = os_event_create(0);
- kill_query_thread_stopped = os_event_create(0);
-
- os_thread_create(kill_query_thread);
-
- os_event_wait(kill_query_thread_started);
+ ut_ad(!kill_query_thread_running);
+ kill_query_thread_running= true;
+ kill_query_thread_stopping= false;
+ mysql_mutex_init(0, &kill_query_thread_mutex, nullptr);
+ mysql_cond_init(0, &kill_query_thread_stop, nullptr);
+ mysql_cond_init(0, &kill_query_thread_stopped, nullptr);
+ std::thread(kill_query_thread).detach();
}
-static
-void
-stop_query_killer()
+static void stop_query_killer()
{
- os_event_set(kill_query_thread_stop);
- os_event_wait_time(kill_query_thread_stopped, 60000);
+ mysql_mutex_lock(&kill_query_thread_mutex);
+ kill_query_thread_stopping= true;
+ mysql_cond_signal(&kill_query_thread_stop);
+
+ do
+ mysql_cond_wait(&kill_query_thread_stopped, &kill_query_thread_mutex);
+ while (kill_query_thread_running);
+
+ mysql_cond_destroy(&kill_query_thread_stop);
+ mysql_cond_destroy(&kill_query_thread_stopped);
+ mysql_mutex_unlock(&kill_query_thread_mutex);
+ mysql_mutex_destroy(&kill_query_thread_mutex);
}
@@ -934,7 +931,9 @@ bool lock_tables(MYSQL *connection)
}
xb_mysql_query(connection, "BACKUP STAGE START", true);
+ DBUG_MARIABACKUP_EVENT("after_backup_stage_start", {});
xb_mysql_query(connection, "BACKUP STAGE BLOCK_COMMIT", true);
+ DBUG_MARIABACKUP_EVENT("after_backup_stage_block_commit", {});
/* Set the maximum supported session value for
lock_wait_timeout to prevent unnecessary timeouts when the
global value is changed from the default */
@@ -979,8 +978,9 @@ unlock_all(MYSQL *connection)
if (opt_debug_sleep_before_unlock) {
msg("Debug sleep for %u seconds",
opt_debug_sleep_before_unlock);
- os_thread_sleep(opt_debug_sleep_before_unlock * 1000);
- }
+ std::this_thread::sleep_for(
+ std::chrono::milliseconds(opt_debug_sleep_before_unlock));
+ }
msg("Executing BACKUP STAGE END");
xb_mysql_query(connection, "BACKUP STAGE END", false);
@@ -1058,7 +1058,7 @@ wait_for_safe_slave(MYSQL *connection)
"remaining)...", sleep_time, n_attempts);
xb_mysql_query(connection, "START SLAVE SQL_THREAD", false);
- os_thread_sleep(sleep_time * 1000000);
+ std::this_thread::sleep_for(std::chrono::seconds(sleep_time));
xb_mysql_query(connection, "STOP SLAVE SQL_THREAD", false);
open_temp_tables = get_open_temp_tables(connection);
@@ -1151,22 +1151,23 @@ public:
bool print(String *to) const
{
ut_ad(m_value);
- return to->append(m_value);
+ return to->append(m_value, strlen(m_value));
}
bool print_quoted(String *to) const
{
ut_ad(m_value);
- return to->append("'") || to->append(m_value) || to->append("'");
+ return to->append('\'') || to->append(m_value, strlen(m_value)) ||
+ to->append('\'');
}
bool print_set_global(String *to) const
{
ut_ad(m_value);
return
- to->append("SET GLOBAL ") ||
- to->append(m_name) ||
- to->append(" = '") ||
- to->append(m_value) ||
- to->append("';\n");
+ to->append(STRING_WITH_LEN("SET GLOBAL ")) ||
+ to->append(m_name, strlen(m_name)) ||
+ to->append(STRING_WITH_LEN(" = '")) ||
+ to->append(m_value, strlen(m_value)) ||
+ to->append(STRING_WITH_LEN("';\n"));
}
};
@@ -1219,7 +1220,7 @@ public:
static bool start_comment_chunk(String *to)
{
- return to->length() ? to->append("; ") : false;
+ return to->length() ? to->append(STRING_WITH_LEN("; ")) : false;
}
bool print_connection_name_if_set(String *to) const
@@ -1231,24 +1232,28 @@ public:
bool print_comment_master_identity(String *comment) const
{
- if (comment->append("master "))
+ if (comment->append(STRING_WITH_LEN("master ")))
return true;
if (!m_mariadb_connection_name.is_null_or_empty())
return m_mariadb_connection_name.print_quoted(comment);
- return comment->append("''"); // Default not named master
+ return comment->append(STRING_WITH_LEN("''")); // Default not named master
}
bool print_using_master_log_pos(String *sql, String *comment) const
{
return
- sql->append("CHANGE MASTER ") ||
+ sql->append(STRING_WITH_LEN("CHANGE MASTER ")) ||
print_connection_name_if_set(sql) ||
- sql->append("TO MASTER_LOG_FILE=") || m_filename.print_quoted(sql) ||
- sql->append(", MASTER_LOG_POS=") || m_position.print(sql) ||
- sql->append(";\n") ||
+ sql->append(STRING_WITH_LEN("TO MASTER_LOG_FILE=")) ||
+ m_filename.print_quoted(sql) ||
+ sql->append(STRING_WITH_LEN(", MASTER_LOG_POS=")) ||
+ m_position.print(sql) ||
+ sql->append(STRING_WITH_LEN(";\n")) ||
print_comment_master_identity(comment) ||
- comment->append(" filename ") || m_filename.print_quoted(comment) ||
- comment->append(" position ") || m_position.print_quoted(comment);
+ comment->append(STRING_WITH_LEN(" filename ")) ||
+ m_filename.print_quoted(comment) ||
+ comment->append(STRING_WITH_LEN(" position ")) ||
+ m_position.print_quoted(comment);
}
bool print_mysql56(String *sql, String *comment) const
@@ -1259,23 +1264,23 @@ public:
CHANGE MASTER TO MASTER_AUTO_POSITION=1;
*/
return
- sql->append("SET GLOBAL gtid_purged=") ||
+ sql->append(STRING_WITH_LEN("SET GLOBAL gtid_purged=")) ||
m_mysql_gtid_executed.print_quoted(sql) ||
- sql->append(";\n") ||
- sql->append("CHANGE MASTER TO MASTER_AUTO_POSITION=1;\n") ||
+ sql->append(STRING_WITH_LEN(";\n")) ||
+ sql->append(STRING_WITH_LEN("CHANGE MASTER TO MASTER_AUTO_POSITION=1;\n")) ||
print_comment_master_identity(comment) ||
- comment->append(" purge list ") ||
+ comment->append(STRING_WITH_LEN(" purge list ")) ||
m_mysql_gtid_executed.print_quoted(comment);
}
bool print_mariadb10_using_gtid(String *sql, String *comment) const
{
return
- sql->append("CHANGE MASTER ") ||
+ sql->append(STRING_WITH_LEN("CHANGE MASTER ")) ||
print_connection_name_if_set(sql) ||
- sql->append("TO master_use_gtid = slave_pos;\n") ||
+ sql->append(STRING_WITH_LEN("TO master_use_gtid = slave_pos;\n")) ||
print_comment_master_identity(comment) ||
- comment->append(" master_use_gtid = slave_pos");
+ comment->append(STRING_WITH_LEN(" master_use_gtid = slave_pos"));
}
bool print(String *sql, String *comment, const Var &gtid_slave_pos) const
@@ -1314,7 +1319,7 @@ public:
if (status.is_mariadb_using_gtid())
{
if (gtid_slave_pos.print_set_global(sql) ||
- comment->append("gtid_slave_pos ") ||
+ comment->append(STRING_WITH_LEN("gtid_slave_pos ")) ||
gtid_slave_pos.print_quoted(comment))
return true; // Error
break;
diff --git a/extra/mariabackup/changed_page_bitmap.cc b/extra/mariabackup/changed_page_bitmap.cc
index 793d7378b0f..a6cc0e01492 100644
--- a/extra/mariabackup/changed_page_bitmap.cc
+++ b/extra/mariabackup/changed_page_bitmap.cc
@@ -188,18 +188,15 @@ log_online_read_bitmap_page(
{
ulint checksum;
ulint actual_checksum;
- ibool success;
ut_a(bitmap_file->size >= MODIFIED_PAGE_BLOCK_SIZE);
ut_a(bitmap_file->offset
<= bitmap_file->size - MODIFIED_PAGE_BLOCK_SIZE);
ut_a(bitmap_file->offset % MODIFIED_PAGE_BLOCK_SIZE == 0);
- success = os_file_read(IORequestRead,
- bitmap_file->file, page, bitmap_file->offset,
- MODIFIED_PAGE_BLOCK_SIZE) == DB_SUCCESS;
-
- if (UNIV_UNLIKELY(!success)) {
-
+ if (DB_SUCCESS !=
+ os_file_read(IORequestRead, bitmap_file->file, page,
+ bitmap_file->offset, MODIFIED_PAGE_BLOCK_SIZE,
+ nullptr)) {
/* The following call prints an error message */
os_file_get_last_error(TRUE);
msg("InnoDB: Warning: failed reading changed page bitmap "
diff --git a/extra/mariabackup/ds_stdout.cc b/extra/mariabackup/ds_stdout.cc
index 08776e99329..a9639ff7739 100644
--- a/extra/mariabackup/ds_stdout.cc
+++ b/extra/mariabackup/ds_stdout.cc
@@ -75,7 +75,7 @@ stdout_open(ds_ctxt_t *ctxt __attribute__((unused)),
stdout_file = (ds_stdout_file_t *) (file + 1);
-#ifdef __WIN__
+#ifdef _WIN32
setmode(fileno(stdout), _O_BINARY);
#endif
diff --git a/extra/mariabackup/fil_cur.cc b/extra/mariabackup/fil_cur.cc
index 8d7fb3c7492..8820ce40c2b 100644
--- a/extra/mariabackup/fil_cur.cc
+++ b/extra/mariabackup/fil_cur.cc
@@ -65,17 +65,21 @@ xb_get_relative_path(
prev = NULL;
cur = path;
- while ((next = strchr(cur, OS_PATH_SEPARATOR)) != NULL) {
+#ifdef _WIN32
+ while ((next = strchr(cur, '\\')) != NULL) {
+ prev = cur;
+ cur = next + 1;
+ }
+#endif
+ while ((next = strchr(cur, '/')) != NULL) {
prev = cur;
cur = next + 1;
}
if (is_system) {
-
return(cur);
} else {
-
return((prev == NULL) ? cur : prev);
}
@@ -91,23 +95,18 @@ xb_fil_node_close_file(
{
ibool ret;
- mutex_enter(&fil_system.mutex);
+ mysql_mutex_lock(&fil_system.mutex);
ut_ad(node);
ut_a(!node->being_extended);
- if (!node->is_open()) {
-
- mutex_exit(&fil_system.mutex);
-
- return;
+ if (node->is_open()) {
+ ret = os_file_close(node->handle);
+ ut_a(ret);
+ node->handle = OS_FILE_CLOSED;
}
- ret = os_file_close(node->handle);
- ut_a(ret);
-
- node->handle = OS_FILE_CLOSED;
- mutex_exit(&fil_system.mutex);
+ mysql_mutex_unlock(&fil_system.mutex);
}
/************************************************************************
@@ -225,13 +224,13 @@ xb_fil_cur_open(
if (!node->space->crypt_data
&& os_file_read(IORequestRead,
node->handle, cursor->buf, 0,
- cursor->page_size) == DB_SUCCESS) {
- mutex_enter(&fil_system.mutex);
+ cursor->page_size, nullptr) == DB_SUCCESS) {
+ mysql_mutex_lock(&fil_system.mutex);
if (!node->space->crypt_data) {
node->space->crypt_data = fil_space_read_crypt_data(
node->space->zip_size(), cursor->buf);
}
- mutex_exit(&fil_system.mutex);
+ mysql_mutex_unlock(&fil_system.mutex);
}
cursor->space_size = (ulint)(cursor->statinfo.st_size
@@ -366,6 +365,7 @@ xb_fil_cur_result_t xb_fil_cur_read(xb_fil_cur_t* cursor,
ib_int64_t offset;
ib_int64_t to_read;
const ulint page_size = cursor->page_size;
+ bool defer = false;
xb_ad(!cursor->is_system() || page_size == srv_page_size);
cursor->read_filter->get_next_batch(&cursor->read_filter_ctxt,
@@ -418,7 +418,7 @@ read_retry:
cursor->buf_page_no = static_cast<unsigned>(offset / page_size);
if (os_file_read(IORequestRead, cursor->file, cursor->buf, offset,
- (ulint) to_read) != DB_SUCCESS) {
+ (ulint) to_read, nullptr) != DB_SUCCESS) {
if (!srv_is_undo_tablespace(cursor->space_id)) {
ret = XB_FIL_CUR_ERROR;
goto func_exit;
@@ -440,13 +440,15 @@ read_retry:
space->release();
goto reinit_buf;
}
+
+ defer = UT_LIST_GET_FIRST(space->chain)->deferred;
/* check pages for corruption and re-read if necessary. i.e. in case of
partially written pages */
for (page = cursor->buf, i = 0; i < npages;
page += page_size, i++) {
unsigned page_no = cursor->buf_page_no + i;
- if (page_is_corrupted(page, page_no, cursor, space)){
+ if (!defer && page_is_corrupted(page, page_no, cursor, space)) {
retry_count--;
if (retry_count == 0) {
@@ -473,11 +475,13 @@ read_retry:
msg(cursor->thread_n, "Database page corruption detected at page "
UINT32PF ", retrying...",
page_no);
- os_thread_sleep(100000);
+ std::this_thread::sleep_for(
+ std::chrono::milliseconds(100));
goto read_retry;
}
}
- DBUG_EXECUTE_FOR_KEY("add_corrupted_page_for", cursor->node->space->name,
+ DBUG_EXECUTE_FOR_KEY("add_corrupted_page_for",
+ cursor->node->space->name(),
{
unsigned corrupted_page_no =
static_cast<unsigned>(strtoul(dbug_val, NULL, 10));
diff --git a/extra/mariabackup/xbstream_read.cc b/extra/mariabackup/xbstream_read.cc
index 84bb279aba0..b54a98157ea 100644
--- a/extra/mariabackup/xbstream_read.cc
+++ b/extra/mariabackup/xbstream_read.cc
@@ -43,7 +43,7 @@ xb_stream_read_new(void)
stream = (xb_rstream_t *) my_malloc(PSI_NOT_INSTRUMENTED, sizeof(xb_rstream_t), MYF(MY_FAE));
-#ifdef __WIN__
+#ifdef _WIN32
setmode(fileno(stdin), _O_BINARY);
#endif
diff --git a/extra/mariabackup/xbstream_write.cc b/extra/mariabackup/xbstream_write.cc
index 2c9ffde6c42..5801e867aac 100644
--- a/extra/mariabackup/xbstream_write.cc
+++ b/extra/mariabackup/xbstream_write.cc
@@ -110,7 +110,7 @@ xb_stream_write_open(xb_wstream_t *stream, const char *path,
file->chunk_ptr = file->chunk;
file->chunk_free = XB_STREAM_MIN_CHUNK_SIZE;
if (onwrite) {
-#ifdef __WIN__
+#ifdef _WIN32
setmode(fileno(stdout), _O_BINARY);
#endif
file->userdata = userdata;
diff --git a/extra/mariabackup/xtrabackup.cc b/extra/mariabackup/xtrabackup.cc
index 7d45337bb18..9de1ef853b9 100644
--- a/extra/mariabackup/xtrabackup.cc
+++ b/extra/mariabackup/xtrabackup.cc
@@ -72,7 +72,6 @@ Street, Fifth Floor, Boston, MA 02110-1335 USA
#include <btr0sea.h>
-#include <dict0priv.h>
#include <lock0lock.h>
#include <log0recv.h>
#include <log0crypt.h>
@@ -141,8 +140,8 @@ longlong xtrabackup_use_memory;
uint opt_protocol;
long xtrabackup_throttle; /* 0:unlimited */
static lint io_ticket;
-static os_event_t wait_throttle;
-static os_event_t log_copying_stop;
+static mysql_cond_t wait_throttle;
+static mysql_cond_t log_copying_stop;
char *xtrabackup_incremental;
lsn_t incremental_lsn;
@@ -182,18 +181,17 @@ static hash_table_t databases_exclude_hash;
static hash_table_t inc_dir_tables_hash;
-struct xb_filter_entry_struct{
+struct xb_filter_entry_t{
char* name;
ibool has_tables;
- hash_node_t name_hash;
+ xb_filter_entry_t *name_hash;
};
-typedef struct xb_filter_entry_struct xb_filter_entry_t;
lsn_t checkpoint_lsn_start;
lsn_t checkpoint_no_start;
static lsn_t log_copy_scanned_lsn;
+/** whether log_copying_thread() is active; protected by log_sys.mutex */
static bool log_copying_running;
-static bool io_watching_thread_running;
int xtrabackup_parallel;
@@ -357,6 +355,9 @@ char orig_argv1[FN_REFLEN];
pthread_mutex_t backup_mutex;
pthread_cond_t scanned_lsn_cond;
+/** Store the deferred tablespace name during --backup */
+static std::set<std::string> defer_space_names;
+
typedef std::map<space_id_t,std::string> space_id_to_name_t;
struct ddl_tracker_t {
@@ -366,6 +367,45 @@ struct ddl_tracker_t {
std::set<space_id_t> drops;
/* For DDL operation found in redo log, */
space_id_to_name_t id_to_name;
+ /** Deferred tablespaces with their ID and name which was
+ found in redo log of DDL operations */
+ space_id_to_name_t deferred_tables;
+
+ /** Insert the deferred tablespace id with the name */
+ void insert_defer_id(space_id_t space_id, std::string name)
+ {
+ auto it= defer_space_names.find(name);
+ if (it != defer_space_names.end())
+ {
+ deferred_tables[space_id]= name;
+ defer_space_names.erase(it);
+ }
+ }
+
+ /** Rename the deferred tablespace with new name */
+ void rename_defer(space_id_t space_id, std::string old_name,
+ std::string new_name)
+ {
+ if (deferred_tables.find(space_id) != deferred_tables.end())
+ deferred_tables[space_id] = new_name;
+ auto defer_end= defer_space_names.end();
+ auto defer= defer_space_names.find(old_name);
+ if (defer == defer_end)
+ defer= defer_space_names.find(new_name);
+
+ if (defer != defer_end)
+ {
+ deferred_tables[space_id]= new_name;
+ defer_space_names.erase(defer);
+ }
+ }
+
+ /** Delete the deferred tablespace */
+ void delete_defer(space_id_t space_id, std::string name)
+ {
+ deferred_tables.erase(space_id);
+ defer_space_names.erase(name);
+ }
};
static ddl_tracker_t ddl_tracker;
@@ -374,8 +414,13 @@ static ddl_tracker_t ddl_tracker;
by recv_sys.mutex */
static std::set<uint32_t> undo_trunc_ids;
+/** Stores the space ids of page0 INIT_PAGE redo records. It is
+used to indicate whether the given deferred tablespace can
+be reconstructed. */
+static std::set<space_id_t> first_page_init_ids;
+
// Convert non-null terminated filename to space name
-std::string filename_to_spacename(const byte *filename, size_t len);
+static std::string filename_to_spacename(const void *filename, size_t len);
CorruptedPages::CorruptedPages() { ut_a(!pthread_mutex_init(&m_mutex, NULL)); }
@@ -387,11 +432,9 @@ void CorruptedPages::add_page_no_lock(const char *space_name, ulint space_id,
{
space_info_t &space_info = m_spaces[space_id];
if (space_info.space_name.empty())
- space_info.space_name=
- convert_space_name
- ? filename_to_spacename(reinterpret_cast<const byte *>(space_name),
- strlen(space_name))
- : space_name;
+ space_info.space_name= convert_space_name
+ ? filename_to_spacename(space_name, strlen(space_name))
+ : space_name;
(void)space_info.pages.insert(page_no);
}
@@ -519,7 +562,8 @@ bool CorruptedPages::empty() const
}
static void xb_load_single_table_tablespace(const std::string &space_name,
- bool set_size);
+ bool set_size,
+ ulint defer_space_id=0);
static void xb_data_files_close();
static fil_space_t* fil_space_get_by_name(const char* name);
@@ -547,8 +591,8 @@ void CorruptedPages::zero_out_free_pages()
space_it->second.pages.begin();
page_it != space_it->second.pages.end(); ++page_it)
{
- bool is_free= fseg_page_is_free(space, *page_it);
- if (!is_free) {
+ if (fseg_page_is_allocated(space, *page_it))
+ {
space_info_t &space_info = non_free_pages[space_id];
space_info.pages.insert(*page_it);
if (space_info.space_name.empty())
@@ -568,7 +612,7 @@ void CorruptedPages::zero_out_free_pages()
die("Can't zero out corrupted page " UINT32PF " of tablespace %s",
*page_it, space_name.c_str());
msg("Corrupted page " UINT32PF
- " of tablespace %s was successfuly fixed.",
+ " of tablespace %s was successfully fixed.",
*page_it, space_name.c_str());
}
}
@@ -596,41 +640,26 @@ xtrabackup_add_datasink(ds_ctxt_t *ds)
typedef void (*process_single_tablespace_func_t)(const char *dirname,
const char *filname,
bool is_remote,
- bool skip_node_page0);
+ bool skip_node_page0,
+ ulint defer_space_id);
static dberr_t enumerate_ibd_files(process_single_tablespace_func_t callback);
/* ======== Datafiles iterator ======== */
struct datafiles_iter_t {
- fil_space_t *space;
- fil_node_t *node;
- ibool started;
- pthread_mutex_t mutex;
+ space_list_t::iterator space = fil_system.space_list.end();
+ fil_node_t *node = nullptr;
+ bool started = false;
+ std::mutex mutex;
};
/* ======== Datafiles iterator ======== */
static
-datafiles_iter_t *
-datafiles_iter_new()
-{
- datafiles_iter_t *it;
-
- it = static_cast<datafiles_iter_t *>(malloc(sizeof(datafiles_iter_t)));
- pthread_mutex_init(&it->mutex, NULL);
-
- it->space = NULL;
- it->node = NULL;
- it->started = FALSE;
-
- return it;
-}
-
-static
fil_node_t *
datafiles_iter_next(datafiles_iter_t *it)
{
fil_node_t *new_node;
- pthread_mutex_lock(&it->mutex);
+ std::lock_guard<std::mutex> _(it->mutex);
if (it->node == NULL) {
if (it->started)
@@ -642,34 +671,25 @@ datafiles_iter_next(datafiles_iter_t *it)
goto end;
}
- it->space = (it->space == NULL) ?
- UT_LIST_GET_FIRST(fil_system.space_list) :
- UT_LIST_GET_NEXT(space_list, it->space);
+ it->space = (it->space == fil_system.space_list.end()) ?
+ fil_system.space_list.begin() :
+ std::next(it->space);
- while (it->space != NULL &&
+ while (it->space != fil_system.space_list.end() &&
(it->space->purpose != FIL_TYPE_TABLESPACE ||
UT_LIST_GET_LEN(it->space->chain) == 0))
- it->space = UT_LIST_GET_NEXT(space_list, it->space);
- if (it->space == NULL)
+ ++it->space;
+ if (it->space == fil_system.space_list.end())
goto end;
it->node = UT_LIST_GET_FIRST(it->space->chain);
end:
new_node = it->node;
- pthread_mutex_unlock(&it->mutex);
return new_node;
}
-static
-void
-datafiles_iter_free(datafiles_iter_t *it)
-{
- pthread_mutex_destroy(&it->mutex);
- free(it);
-}
-
#ifndef DBUG_OFF
struct dbug_thread_param_t
{
@@ -677,17 +697,14 @@ struct dbug_thread_param_t
const char *query;
int expect_err;
int expect_errno;
- os_event_t done_event;
};
/* Thread procedure used in dbug_start_query_thread. */
-extern "C"
-os_thread_ret_t
-DECLARE_THREAD(dbug_execute_in_new_connection)(void *arg)
+static void *dbug_execute_in_new_connection(void *arg)
{
mysql_thread_init();
- dbug_thread_param_t *par= (dbug_thread_param_t *)arg;
+ dbug_thread_param_t *par= static_cast<dbug_thread_param_t*>(arg);
int err = mysql_query(par->con, par->query);
int err_no = mysql_errno(par->con);
if(par->expect_err != err)
@@ -704,13 +721,12 @@ DECLARE_THREAD(dbug_execute_in_new_connection)(void *arg)
}
mysql_close(par->con);
mysql_thread_end();
- os_event_t done = par->done_event;
delete par;
- os_event_set(done);
- os_thread_exit();
- return os_thread_ret_t(0);
+ return nullptr;
}
+static pthread_t dbug_alter_thread;
+
/*
Execute query from a new connection, in own thread.
@@ -722,7 +738,7 @@ Execute query from a new connection, in own thread.
@param expected_errno - if not 0, and query finished with error,
expected mysql_errno()
*/
-static os_event_t dbug_start_query_thread(
+static void dbug_start_query_thread(
const char *query,
const char *wait_state,
int expected_err,
@@ -733,12 +749,13 @@ static os_event_t dbug_start_query_thread(
par->query = query;
par->expect_err = expected_err;
par->expect_errno = expected_errno;
- par->done_event = os_event_create(0);
par->con = xb_mysql_connect();
- os_thread_create(dbug_execute_in_new_connection, par);
+
+ mysql_thread_create(0, &dbug_alter_thread, nullptr,
+ dbug_execute_in_new_connection, par);
if (!wait_state)
- return par->done_event;
+ return;
char q[256];
snprintf(q, sizeof(q),
@@ -760,32 +777,30 @@ static os_event_t dbug_start_query_thread(
end:
msg("query '%s' on connection %lu reached state '%s'", query,
mysql_thread_id(par->con), wait_state);
- return par->done_event;
}
-
-os_event_t dbug_alter_thread_done;
#endif
void mdl_lock_all()
{
- mdl_lock_init();
- datafiles_iter_t *it = datafiles_iter_new();
- if (!it)
- return;
-
- while (fil_node_t *node = datafiles_iter_next(it)){
- if (fil_is_user_tablespace_id(node->space->id)
- && check_if_skip_table(node->space->name))
- continue;
+ mdl_lock_init();
+ datafiles_iter_t it;
- mdl_lock_table(node->space->id);
- }
- datafiles_iter_free(it);
+ while (fil_node_t *node= datafiles_iter_next(&it))
+ {
+ const auto id= node->space->id;
+ if (const char *name= (fil_is_user_tablespace_id(id) &&
+ node->space->chain.start)
+ ? node->space->chain.start->name : nullptr)
+ if (check_if_skip_table(filename_to_spacename(name,
+ strlen(name)).c_str()))
+ continue;
+ mdl_lock_table(id);
+ }
}
// Convert non-null terminated filename to space name
-std::string filename_to_spacename(const byte *filename, size_t len)
+static std::string filename_to_spacename(const void *filename, size_t len)
{
// null- terminate filename
char *f = (char *)malloc(len + 1);
@@ -811,33 +826,56 @@ std::string filename_to_spacename(const byte *filename, size_t len)
/** Report an operation to create, delete, or rename a file during backup.
@param[in] space_id tablespace identifier
-@param[in] create whether the file is being created
+@param[in] type redo log file operation type
@param[in] name file name (not NUL-terminated)
@param[in] len length of name, in bytes
@param[in] new_name new file name (NULL if not rename)
@param[in] new_len length of new_name, in bytes (0 if NULL) */
-static void backup_file_op(ulint space_id, bool create,
+static void backup_file_op(ulint space_id, int type,
const byte* name, ulint len,
const byte* new_name, ulint new_len)
{
- ut_ad(!create || !new_name);
ut_ad(name);
ut_ad(len);
ut_ad(!new_name == !new_len);
pthread_mutex_lock(&backup_mutex);
- if (create) {
- ddl_tracker.id_to_name[space_id] = filename_to_spacename(name, len);
+ switch(type) {
+ case FILE_CREATE:
+ {
+ std::string space_name = filename_to_spacename(name, len);
+ ddl_tracker.id_to_name[space_id] = space_name;
+ ddl_tracker.delete_defer(space_id, space_name);
msg("DDL tracking : create %zu \"%.*s\"", space_id, int(len), name);
}
- else if (new_name) {
- ddl_tracker.id_to_name[space_id] = filename_to_spacename(new_name, new_len);
+ break;
+ case FILE_MODIFY:
+ ddl_tracker.insert_defer_id(
+ space_id, filename_to_spacename(name, len));
+ break;
+ case FILE_RENAME:
+ {
+ std::string new_space_name = filename_to_spacename(
+ new_name, new_len);
+ std::string old_space_name = filename_to_spacename(
+ name, len);
+ ddl_tracker.id_to_name[space_id] = new_space_name;
+ ddl_tracker.rename_defer(space_id, old_space_name,
+ new_space_name);
msg("DDL tracking : rename %zu \"%.*s\",\"%.*s\"",
space_id, int(len), name, int(new_len), new_name);
- } else {
+ }
+ break;
+ case FILE_DELETE:
ddl_tracker.drops.insert(space_id);
+ ddl_tracker.delete_defer(
+ space_id, filename_to_spacename(name, len));
msg("DDL tracking : delete %zu \"%.*s\"", space_id, int(len), name);
+ break;
+ default:
+ ut_ad(0);
+ break;
}
pthread_mutex_unlock(&backup_mutex);
}
@@ -852,29 +890,37 @@ static void backup_file_op(ulint space_id, bool create,
We will abort backup in this case.
*/
-static void backup_file_op_fail(ulint space_id, bool create,
+static void backup_file_op_fail(ulint space_id, int type,
const byte* name, ulint len,
const byte* new_name, ulint new_len)
{
- bool fail;
- if (create) {
- msg("DDL tracking : create %zu \"%.*s\"",
- space_id, int(len), name);
- std::string spacename = filename_to_spacename(name, len);
- fail = !check_if_skip_table(spacename.c_str());
- }
- else if (new_name) {
+ bool fail = false;
+ switch(type) {
+ case FILE_CREATE:
+ msg("DDL tracking : create %zu \"%.*s\"", space_id, int(len), name);
+ fail = !check_if_skip_table(
+ filename_to_spacename(name, len).c_str());
+ break;
+ case FILE_MODIFY:
+ break;
+ case FILE_RENAME:
msg("DDL tracking : rename %zu \"%.*s\",\"%.*s\"",
space_id, int(len), name, int(new_len), new_name);
- std::string spacename = filename_to_spacename(name, len);
- std::string new_spacename = filename_to_spacename(new_name, new_len);
- fail = !check_if_skip_table(spacename.c_str()) || !check_if_skip_table(new_spacename.c_str());
- }
- else {
- std::string spacename = filename_to_spacename(name, len);
- fail = !check_if_skip_table(spacename.c_str());
+ fail = !check_if_skip_table(
+ filename_to_spacename(name, len).c_str())
+ || !check_if_skip_table(
+ filename_to_spacename(new_name, new_len).c_str());
+ break;
+ case FILE_DELETE:
+ fail = !check_if_skip_table(
+ filename_to_spacename(name, len).c_str());
msg("DDL tracking : delete %zu \"%.*s\"", space_id, int(len), name);
+ break;
+ default:
+ ut_ad(0);
+ break;
}
+
if (fail) {
ut_a(opt_no_lock);
die("DDL operation detected in the late phase of backup."
@@ -887,6 +933,13 @@ static void backup_undo_trunc(uint32_t space_id)
undo_trunc_ids.insert(space_id);
}
+/* Function to store the space id of page0 INIT_PAGE
+@param space_id space id which has page0 init page */
+static void backup_first_page_op(ulint space_id)
+{
+ first_page_init_ids.insert(space_id);
+}
+
/*
Retrieve default data directory, to be used with --copy-back.
@@ -925,7 +978,6 @@ typedef struct {
uint num;
uint *count;
pthread_mutex_t* count_mutex;
- os_thread_id_t id;
CorruptedPages *corrupted_pages;
} data_thread_ctxt_t;
@@ -1440,9 +1492,10 @@ uint xb_client_options_count = array_elements(xb_client_options);
static const char *dbug_option;
#endif
-namespace deprecated {
-extern ulong srv_n_log_files;
-}
+#ifdef HAVE_URING
+extern const char *io_uring_may_be_unsafe;
+bool innodb_use_native_aio_default();
+#endif
struct my_option xb_server_options[] =
{
@@ -1450,7 +1503,7 @@ struct my_option xb_server_options[] =
(G_PTR*) &mysql_data_home, 0, GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0},
{"tmpdir", 't',
"Path for temporary files. Several paths may be specified, separated by a "
-#if defined(__WIN__) || defined(OS2) || defined(__NETWARE__)
+#if defined(_WIN32)
"semicolon (;)"
#else
"colon (:)"
@@ -1538,7 +1591,7 @@ struct my_option xb_server_options[] =
"With which method to flush data.",
&srv_file_flush_method, &srv_file_flush_method,
&innodb_flush_method_typelib, GET_ENUM, REQUIRED_ARG,
- IF_WIN(SRV_ALL_O_DIRECT_FSYNC, SRV_FSYNC), 0, 0, 0, 0, 0},
+ IF_WIN(SRV_ALL_O_DIRECT_FSYNC, SRV_O_DIRECT), 0, 0, 0, 0, 0},
{"innodb_log_buffer_size", OPT_INNODB_LOG_BUFFER_SIZE,
"The size of the buffer which InnoDB uses to write log to the log files on disk.",
@@ -1550,10 +1603,6 @@ struct my_option xb_server_options[] =
GET_ULL, REQUIRED_ARG, 48 << 20, 1 << 20,
std::numeric_limits<ulonglong>::max(), 0,
UNIV_PAGE_SIZE_MAX, 0},
- {"innodb_log_files_in_group", OPT_INNODB_LOG_FILES_IN_GROUP,
- "Ignored for mysqld option compatibility",
- &deprecated::srv_n_log_files, &deprecated::srv_n_log_files,
- 0, GET_LONG, REQUIRED_ARG, 1, 1, 100, 0, 1, 0},
{"innodb_log_group_home_dir", OPT_INNODB_LOG_GROUP_HOME_DIR,
"Path to InnoDB log files.", &srv_log_group_home_dir,
&srv_log_group_home_dir, 0, GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0},
@@ -1564,7 +1613,12 @@ struct my_option xb_server_options[] =
"Use native AIO if supported on this platform.",
(G_PTR*) &srv_use_native_aio,
(G_PTR*) &srv_use_native_aio, 0, GET_BOOL, NO_ARG,
- TRUE, 0, 0, 0, 0, 0},
+#ifdef HAVE_URING
+ innodb_use_native_aio_default(),
+#else
+ TRUE,
+#endif
+ 0, 0, 0, 0, 0},
{"innodb_page_size", OPT_INNODB_PAGE_SIZE,
"The universal page size of the database.",
(G_PTR*) &innobase_page_size, (G_PTR*) &innobase_page_size, 0,
@@ -1585,7 +1639,7 @@ struct my_option xb_server_options[] =
{"innodb_checksum_algorithm", OPT_INNODB_CHECKSUM_ALGORITHM,
"The algorithm InnoDB uses for page checksumming. [CRC32, STRICT_CRC32, "
- "INNODB, STRICT_INNODB, NONE, STRICT_NONE]", &srv_checksum_algorithm,
+ "FULL_CRC32, STRICT_FULL_CRC32]", &srv_checksum_algorithm,
&srv_checksum_algorithm, &innodb_checksum_algorithm_typelib, GET_ENUM,
REQUIRED_ARG, SRV_CHECKSUM_ALGORITHM_CRC32, 0, 0, 0, 0, 0},
@@ -1667,7 +1721,8 @@ uint xb_server_options_count = array_elements(xb_server_options);
static std::set<std::string> tables_for_export;
static void append_export_table(const char *dbname, const char *tablename,
- bool is_remote, bool skip_node_page0)
+ bool is_remote, bool skip_node_page0,
+ ulint defer_space_id)
{
if(dbname && tablename && !is_remote)
{
@@ -2078,7 +2133,6 @@ static bool innodb_init_param()
innobase_data_file_path);
srv_sys_space.set_space_id(TRX_SYS_SPACE);
- srv_sys_space.set_name("innodb_system");
srv_sys_space.set_path(srv_data_home);
switch (srv_checksum_algorithm) {
case SRV_CHECKSUM_ALGORITHM_FULL_CRC32:
@@ -2110,8 +2164,6 @@ static bool innodb_init_param()
msg("innodb_log_group_home_dir = %s",
srv_log_group_home_dir);
- os_normalize_path(srv_log_group_home_dir);
-
if (strchr(srv_log_group_home_dir, ';')) {
msg("syntax error in innodb_log_group_home_dir, ");
goto error;
@@ -2148,6 +2200,15 @@ static bool innodb_init_param()
if (srv_use_native_aio) {
msg("InnoDB: Using Linux native AIO");
}
+#elif defined(HAVE_URING)
+ if (!srv_use_native_aio) {
+ } else if (io_uring_may_be_unsafe) {
+ msg("InnoDB: Using liburing on this kernel %s may cause hangs;"
+ " see https://jira.mariadb.org/browse/MDEV-26674",
+ io_uring_may_be_unsafe);
+ } else {
+ msg("InnoDB: Using liburing");
+ }
#else
/* Currently native AIO is supported only on windows and linux
and that also when the support is compiled in. In all other
@@ -2450,13 +2511,15 @@ xb_write_delta_metadata(const char *filename, const xb_delta_info_t *info)
}
/* ================= backup ================= */
-void
-xtrabackup_io_throttling(void)
+void xtrabackup_io_throttling()
{
- if (xtrabackup_backup && xtrabackup_throttle && (io_ticket--) < 0) {
- os_event_reset(wait_throttle);
- os_event_wait(wait_throttle);
- }
+ if (!xtrabackup_backup || !xtrabackup_throttle)
+ return;
+
+ mysql_mutex_lock(&log_sys.mutex);
+ if (io_ticket-- < 0)
+ mysql_cond_wait(&wait_throttle, &log_sys.mutex);
+ mysql_mutex_unlock(&log_sys.mutex);
}
static
@@ -2486,7 +2549,8 @@ find_filter_in_hashtable(
)
{
xb_filter_entry_t* found = NULL;
- HASH_SEARCH(name_hash, table, ut_fold_string(name),
+ const ulint fold = my_crc32c(0, name, strlen(name));
+ HASH_SEARCH(name_hash, table, fold,
xb_filter_entry_t*,
found, (void) 0,
!strcmp(found->name, name));
@@ -2580,7 +2644,15 @@ check_if_skip_database_by_path(
return(FALSE);
}
- const char* db_name = strrchr(path, OS_PATH_SEPARATOR);
+ const char* db_name = strrchr(path, '/');
+#ifdef _WIN32
+ if (const char* last = strrchr(path, '\\')) {
+ if (!db_name || last > db_name) {
+ db_name = last;
+ }
+ }
+#endif
+
if (db_name == NULL) {
db_name = path;
} else {
@@ -2609,7 +2681,16 @@ check_if_skip_table(
dbname = NULL;
tbname = name;
- while ((ptr = strchr(tbname, '/')) != NULL) {
+ for (;;) {
+ ptr= strchr(tbname, '/');
+#ifdef _WIN32
+ if (!ptr) {
+ ptr= strchr(tbname,'\\');
+ }
+#endif
+ if (!ptr) {
+ break;
+ }
dbname = tbname;
tbname = ptr + 1;
}
@@ -2739,21 +2820,11 @@ static my_bool xtrabackup_copy_datafile(fil_node_t *node, uint thread_n,
xb_read_filt_t *read_filter;
my_bool rc = FALSE;
- /* Get the name and the path for the tablespace. node->name always
- contains the path (which may be absolute for remote tablespaces in
- 5.6+). space->name contains the tablespace name in the form
- "./database/table.ibd" (in 5.5-) or "database/table" (in 5.6+). For a
- multi-node shared tablespace, space->name contains the name of the first
- node, but that's irrelevant, since we only need node_name to match them
- against filters, and the shared tablespace is always copied regardless
- of the filters value. */
-
- const char* const node_name = node->space->name;
- const char* const node_path = node->name;
-
if (fil_is_user_tablespace_id(node->space->id)
- && check_if_skip_table(node_name)) {
- msg(thread_n, "Skipping %s.", node_name);
+ && check_if_skip_table(filename_to_spacename(node->name,
+ strlen(node->name)).
+ c_str())) {
+ msg(thread_n, "Skipping %s.", node->name);
return(FALSE);
}
@@ -2765,9 +2836,9 @@ static my_bool xtrabackup_copy_datafile(fil_node_t *node, uint thread_n,
pthread_mutex_unlock(&backup_mutex);
if (was_dropped) {
if (node->is_open()) {
- mutex_enter(&fil_system.mutex);
+ mysql_mutex_lock(&fil_system.mutex);
node->close();
- mutex_exit(&fil_system.mutex);
+ mysql_mutex_unlock(&fil_system.mutex);
}
goto skip;
}
@@ -2808,9 +2879,10 @@ static my_bool xtrabackup_copy_datafile(fil_node_t *node, uint thread_n,
action = xb_get_copy_action();
if (xtrabackup_stream) {
- msg(thread_n, "%s %s", action, node_path);
+ msg(thread_n, "%s %s", action, node->name);
} else {
- msg(thread_n, "%s %s to %s", action, node_path, dstfile->path);
+ msg(thread_n, "%s %s to %s", action, node->name,
+ dstfile->path);
}
/* The main copy loop */
@@ -2840,11 +2912,15 @@ static my_bool xtrabackup_copy_datafile(fil_node_t *node, uint thread_n,
if (write_filter.finalize
&& !write_filter.finalize(&write_filt_ctxt, dstfile)) {
goto error;
- }
+ } else {
+ const fil_space_t::name_type name = node->space->name();
- pthread_mutex_lock(&backup_mutex);
- ddl_tracker.tables_in_backup[node->space->id] = node_name;
- pthread_mutex_unlock(&backup_mutex);
+ pthread_mutex_lock(&backup_mutex);
+ ddl_tracker.tables_in_backup.emplace(node->space->id,
+ std::string(name.data(),
+ name.size()));
+ pthread_mutex_unlock(&backup_mutex);
+ }
/* close */
msg(thread_n," ...done");
@@ -2876,7 +2952,7 @@ skip:
if (write_filter.deinit) {
write_filter.deinit(&write_filt_ctxt);
}
- msg(thread_n,"Warning: We assume the table was dropped during xtrabackup execution and ignore the tablespace %s", node_name);
+ msg(thread_n,"Warning: We assume the table was dropped during xtrabackup execution and ignore the tablespace %s", node->name);
return(FALSE);
}
@@ -2964,6 +3040,8 @@ static lsn_t xtrabackup_copy_log(lsn_t start_lsn, lsn_t end_lsn, bool last)
@return whether the operation failed */
static bool xtrabackup_copy_logfile(bool last = false)
{
+ mysql_mutex_assert_owner(&log_sys.mutex);
+
ut_a(dst_log_file != NULL);
ut_ad(recv_sys.is_initialised());
@@ -2979,9 +3057,10 @@ static bool xtrabackup_copy_logfile(bool last = false)
do {
end_lsn = start_lsn + RECV_SCAN_SIZE;
- xtrabackup_io_throttling();
+ if (xtrabackup_throttle && (io_ticket--) < 0) {
+ mysql_cond_wait(&wait_throttle, &log_sys.mutex);
+ }
- mysql_mutex_lock(&log_sys.mutex);
lsn_t lsn= start_lsn;
for (int retries= 0; retries < 100; retries++) {
if (log_sys.log.read_log_seg(&lsn, end_lsn)
@@ -2993,22 +3072,20 @@ static bool xtrabackup_copy_logfile(bool last = false)
}
if (lsn == start_lsn) {
- overwritten_block= !recv_sys.found_corrupt_log
+ overwritten_block= !recv_sys.is_corrupt_log()
&& log_block_calc_checksum_crc32(log_sys.buf) ==
log_block_get_checksum(log_sys.buf)
&& log_block_get_hdr_no(log_sys.buf) >
log_block_convert_lsn_to_no(start_lsn);
start_lsn = 0;
} else {
- mutex_enter(&recv_sys.mutex);
+ mysql_mutex_lock(&recv_sys.mutex);
start_lsn = xtrabackup_copy_log(start_lsn, lsn, last);
- mutex_exit(&recv_sys.mutex);
+ mysql_mutex_unlock(&recv_sys.mutex);
}
- mysql_mutex_unlock(&log_sys.mutex);
-
if (!start_lsn) {
- const char *reason = recv_sys.found_corrupt_log
+ const char *reason = recv_sys.is_corrupt_log()
? "corrupt log."
: (overwritten_block
? "redo log block is overwritten, please increase redo log size with innodb_log_file_size parameter."
@@ -3048,76 +3125,70 @@ void backup_wait_for_lsn(lsn_t lsn) {
extern lsn_t server_lsn_after_lock;
-static os_thread_ret_t DECLARE_THREAD(log_copying_thread)(void*)
+static void log_copying_thread()
{
- /*
- Initialize mysys thread-specific memory so we can
- use mysys functions in this thread.
- */
- my_thread_init();
-
- for (;;) {
- os_event_reset(log_copying_stop);
- os_event_wait_time_low(log_copying_stop,
- xtrabackup_log_copy_interval * 1000U,
- 0);
- if (xtrabackup_copy_logfile()) {
- break;
- }
-
- mysql_mutex_lock(&log_sys.mutex);
- bool completed = metadata_to_lsn
- && metadata_to_lsn <= log_copy_scanned_lsn;
- mysql_mutex_unlock(&log_sys.mutex);
- if (completed) {
- break;
- }
- }
-
- log_copying_running = false;
- my_thread_end();
- os_thread_exit();
-
- return(0);
+ my_thread_init();
+ mysql_mutex_lock(&log_sys.mutex);
+ while (!xtrabackup_copy_logfile() &&
+ (!metadata_to_lsn || metadata_to_lsn > log_copy_scanned_lsn))
+ {
+ timespec abstime;
+ set_timespec_nsec(abstime, 1000000ULL * xtrabackup_log_copy_interval);
+ mysql_cond_timedwait(&log_copying_stop, &log_sys.mutex, &abstime);
+ }
+ log_copying_running= false;
+ mysql_mutex_unlock(&log_sys.mutex);
+ my_thread_end();
}
+/** whether io_watching_thread() is active; protected by log_sys.mutex */
+static bool have_io_watching_thread;
+
/* io throttle watching (rough) */
-static os_thread_ret_t DECLARE_THREAD(io_watching_thread)(void*)
+static void io_watching_thread()
{
- /* currently, for --backup only */
- ut_a(xtrabackup_backup);
-
- while (log_copying_running && !metadata_to_lsn) {
- os_thread_sleep(1000000); /*1 sec*/
- io_ticket = xtrabackup_throttle;
- os_event_set(wait_throttle);
- }
-
- /* stop io throttle */
- xtrabackup_throttle = 0;
- os_event_set(wait_throttle);
+ my_thread_init();
+ /* currently, for --backup only */
+ ut_ad(xtrabackup_backup);
- io_watching_thread_running = false;
+ mysql_mutex_lock(&log_sys.mutex);
+ ut_ad(have_io_watching_thread);
- os_thread_exit();
+ while (log_copying_running && !metadata_to_lsn)
+ {
+ timespec abstime;
+ set_timespec(abstime, 1);
+ mysql_cond_timedwait(&log_copying_stop, &log_sys.mutex, &abstime);
+ io_ticket= xtrabackup_throttle;
+ mysql_cond_broadcast(&wait_throttle);
+ }
- return(0);
+ /* stop io throttle */
+ xtrabackup_throttle= 0;
+ have_io_watching_thread= false;
+ mysql_cond_broadcast(&wait_throttle);
+ mysql_mutex_unlock(&log_sys.mutex);
+ my_thread_end();
}
#ifndef DBUG_OFF
-char *dbug_mariabackup_get_val(const char *event, const char *key)
+char *dbug_mariabackup_get_val(const char *event,
+ const fil_space_t::name_type key)
{
- char envvar[FN_REFLEN];
- if (key) {
- snprintf(envvar, sizeof(envvar), "%s_%s", event, key);
- char *slash = strchr(envvar, '/');
- if (slash)
- *slash = '_';
- } else {
- strncpy(envvar, event, sizeof envvar - 1);
- envvar[sizeof envvar - 1] = '\0';
- }
- return getenv(envvar);
+ char envvar[FN_REFLEN];
+ strncpy(envvar, event, sizeof envvar - 1);
+ envvar[(sizeof envvar) - 1] = '\0';
+
+ if (key.size() && key.size() + strlen(envvar) < (sizeof envvar) - 2)
+ {
+ strcat(envvar, "_");
+ strncat(envvar, key.data(), key.size());
+ if (char *slash= strchr(envvar, '/'))
+ *slash= '_';
+ }
+
+ char *val = getenv(envvar);
+ return val && *val ? val : nullptr;
}
/*
@@ -3132,7 +3203,8 @@ To use this facility, you need to
for the variable)
3. start mariabackup with --dbug=+d,debug_mariabackup_events
*/
-void dbug_mariabackup_event(const char *event,const char *key)
+void dbug_mariabackup_event(const char *event,
+ const fil_space_t::name_type key)
{
char *sql = dbug_mariabackup_get_val(event, key);
if (sql && *sql) {
@@ -3142,15 +3214,9 @@ void dbug_mariabackup_event(const char *event,const char *key)
}
#endif // DBUG_OFF
-/**************************************************************************
-Datafiles copying thread.*/
-static
-os_thread_ret_t
-DECLARE_THREAD(data_copy_thread_func)(
-/*==================*/
- void *arg) /* thread context */
+/** Datafiles copying thread.*/
+static void data_copy_thread_func(data_thread_ctxt_t *ctxt) /* thread context */
{
- data_thread_ctxt_t *ctxt = (data_thread_ctxt_t *) arg;
uint num = ctxt->num;
fil_node_t* node;
ut_ad(ctxt->corrupted_pages);
@@ -3162,8 +3228,9 @@ DECLARE_THREAD(data_copy_thread_func)(
my_thread_init();
while ((node = datafiles_iter_next(ctxt->it)) != NULL) {
- DBUG_MARIABACKUP_EVENT("before_copy", node->space->name);
- DBUG_EXECUTE_FOR_KEY("wait_innodb_redo_before_copy", node->space->name,
+ DBUG_MARIABACKUP_EVENT("before_copy", node->space->name());
+ DBUG_EXECUTE_FOR_KEY("wait_innodb_redo_before_copy",
+ node->space->name(),
backup_wait_for_lsn(get_current_lsn(mysql_connection)););
/* copy the datafile */
if (xtrabackup_copy_datafile(node, num, NULL,
@@ -3171,8 +3238,7 @@ DECLARE_THREAD(data_copy_thread_func)(
*ctxt->corrupted_pages))
die("failed to copy datafile.");
- DBUG_MARIABACKUP_EVENT("after_copy", node->space->name);
-
+ DBUG_MARIABACKUP_EVENT("after_copy", node->space->name());
}
pthread_mutex_lock(ctxt->count_mutex);
@@ -3180,8 +3246,6 @@ DECLARE_THREAD(data_copy_thread_func)(
pthread_mutex_unlock(ctxt->count_mutex);
my_thread_end();
- os_thread_exit();
- OS_THREAD_DUMMY_RETURN;
}
/************************************************************************
@@ -3288,23 +3352,6 @@ xb_fil_io_init()
fil_system.space_id_reuse_warned = true;
}
-static
-Datafile*
-xb_new_datafile(const char *name, bool is_remote)
-{
- if (is_remote) {
- RemoteDatafile *remote_file = new RemoteDatafile();
- remote_file->set_name(name);
- return(remote_file);
- } else {
- Datafile *file = new Datafile();
- file->set_name(name);
- file->make_filepath(".", name, IBD);
- return(file);
- }
-}
-
-
/** Load tablespace.
@param[in] dirname directory name of the tablespace to open
@@ -3314,15 +3361,19 @@ xb_new_datafile(const char *name, bool is_remote)
node page0 will be read, and it's size and free pages limit
will be set from page 0, what is neccessary for checking and fixing corrupted
pages.
+@param[in]	defer_space_id	use this space id to create the space object
+when the tablespace is deferred
*/
static void xb_load_single_table_tablespace(const char *dirname,
const char *filname,
bool is_remote,
- bool skip_node_page0)
+ bool skip_node_page0,
+ ulint defer_space_id)
{
ut_ad(srv_operation == SRV_OPERATION_BACKUP
|| srv_operation == SRV_OPERATION_RESTORE_DELTA
- || srv_operation == SRV_OPERATION_RESTORE);
+ || srv_operation == SRV_OPERATION_RESTORE
+ || srv_operation == SRV_OPERATION_BACKUP_NO_DEFER);
/* Ignore .isl files on XtraBackup recovery. All tablespaces must be
local. */
if (is_remote && srv_operation == SRV_OPERATION_RESTORE_DELTA) {
@@ -3341,6 +3392,7 @@ static void xb_load_single_table_tablespace(const char *dirname,
lsn_t flush_lsn;
dberr_t err;
fil_space_t *space;
+ bool defer = false;
name = static_cast<char*>(ut_malloc_nokey(pathlen));
@@ -3352,36 +3404,71 @@ static void xb_load_single_table_tablespace(const char *dirname,
name[pathlen - 5] = 0;
}
- Datafile *file = xb_new_datafile(name, is_remote);
+ const fil_space_t::name_type n{name, pathlen - 5};
+ Datafile *file;
+
+ if (is_remote) {
+ RemoteDatafile* rf = new RemoteDatafile();
+ if (!rf->open_link_file(n)) {
+ die("Can't open datafile %s", name);
+ }
+ file = rf;
+ } else {
+ file = new Datafile();
+ file->make_filepath(".", n, IBD);
+ }
if (file->open_read_only(true) != DB_SUCCESS) {
die("Can't open datafile %s", name);
}
for (int i = 0; i < 10; i++) {
+ file->m_defer = false;
err = file->validate_first_page(&flush_lsn);
- if (err != DB_CORRUPTION) {
+
+ if (file->m_defer) {
+ if (defer_space_id) {
+ defer = true;
+ file->set_space_id(defer_space_id);
+ file->set_flags(FSP_FLAGS_PAGE_SSIZE());
+ err = DB_SUCCESS;
+ break;
+ }
+ } else if (err != DB_CORRUPTION) {
break;
}
my_sleep(1000);
}
+ if (!defer && file->m_defer) {
+ const char *file_path = file->filepath();
+ defer_space_names.insert(
+ filename_to_spacename(
+ file_path, strlen(file_path)));
+ delete file;
+ ut_free(name);
+ return;
+ }
+
bool is_empty_file = file->exists() && file->is_empty_file();
if (err == DB_SUCCESS && file->space_id() != SRV_TMP_SPACE_ID) {
space = fil_space_t::create(
- name, file->space_id(), file->flags(),
- FIL_TYPE_TABLESPACE, NULL/* TODO: crypt_data */,
+ file->space_id(), file->flags(),
+ FIL_TYPE_TABLESPACE, nullptr/* TODO: crypt_data */,
FIL_ENCRYPTION_DEFAULT,
file->handle() != OS_FILE_CLOSED);
ut_a(space != NULL);
- space->add(file->filepath(),
- skip_node_page0 ? file->detach() : pfs_os_file_t(), 0, false, false);
- mutex_enter(&fil_system.mutex);
+ fil_node_t* node= space->add(
+ file->filepath(),
+ skip_node_page0 ? file->detach() : pfs_os_file_t(),
+ 0, false, false);
+ node->deferred= defer;
+ mysql_mutex_lock(&fil_system.mutex);
space->read_page0();
- mutex_exit(&fil_system.mutex);
+ mysql_mutex_unlock(&fil_system.mutex);
if (srv_operation == SRV_OPERATION_RESTORE_DELTA
|| xb_close_files) {
@@ -3399,7 +3486,8 @@ static void xb_load_single_table_tablespace(const char *dirname,
}
static void xb_load_single_table_tablespace(const std::string &space_name,
- bool skip_node_page0)
+ bool skip_node_page0,
+ ulint defer_space_id)
{
std::string name(space_name);
bool is_remote= access((name + ".ibd").c_str(), R_OK) != 0;
@@ -3410,14 +3498,13 @@ static void xb_load_single_table_tablespace(const std::string &space_name,
buf[sizeof buf - 1]= '\0';
const char *dbname= buf;
char *p= strchr(buf, '/');
- if (p == 0)
+ if (!p)
die("Unexpected tablespace %s filename %s", space_name.c_str(),
name.c_str());
- ut_a(p);
*p= 0;
const char *tablename= p + 1;
xb_load_single_table_tablespace(dbname, tablename, is_remote,
- skip_node_page0);
+ skip_node_page0, defer_space_id);
}
#ifdef _WIN32
@@ -3675,12 +3762,11 @@ static dberr_t enumerate_ibd_files(process_single_tablespace_func_t callback)
/* General tablespaces are always at the first level of the
data home dir */
- if (dbinfo.type == OS_FILE_TYPE_FILE) {
- bool is_isl = ends_with(dbinfo.name, ".isl");
- bool is_ibd = !is_isl && ends_with(dbinfo.name,".ibd");
-
- if (is_isl || is_ibd) {
- (*callback)(NULL, dbinfo.name, is_isl, false);
+ if (dbinfo.type != OS_FILE_TYPE_FILE) {
+ const bool is_isl = ends_with(dbinfo.name, ".isl");
+ if (is_isl || ends_with(dbinfo.name,".ibd")) {
+ (*callback)(nullptr, dbinfo.name, is_isl,
+ false, 0);
}
}
@@ -3706,7 +3792,6 @@ static dberr_t enumerate_ibd_files(process_single_tablespace_func_t callback)
}
snprintf(dbpath, dbpath_len,
"%s/%s", fil_path_to_mysql_datadir, dbinfo.name);
- os_normalize_path(dbpath);
if (check_if_skip_database_by_path(dbpath)) {
fprintf(stderr, "Skipping db: %s\n", dbpath);
@@ -3734,7 +3819,7 @@ static dberr_t enumerate_ibd_files(process_single_tablespace_func_t callback)
if (strlen(fileinfo.name) > 4) {
bool is_isl= false;
if (ends_with(fileinfo.name, ".ibd") || ((is_isl = ends_with(fileinfo.name, ".isl"))))
- (*callback)(dbinfo.name, fileinfo.name, is_isl, false);
+ (*callback)(dbinfo.name, fileinfo.name, is_isl, false, 0);
}
}
@@ -3763,7 +3848,7 @@ next_datadir_item:
if (os_file_closedir_failed(dir)) {
fprintf(stderr,
- "InnoDB: Error: could not close MySQL datadir\n");
+ "InnoDB: Error: could not close MariaDB datadir\n");
return(DB_ERROR);
}
@@ -3796,7 +3881,7 @@ static dberr_t xb_assign_undo_space_start()
byte* page = static_cast<byte*>
(aligned_malloc(srv_page_size, srv_page_size));
- if (os_file_read(IORequestRead, file, page, 0, srv_page_size)
+ if (os_file_read(IORequestRead, file, page, 0, srv_page_size, nullptr)
!= DB_SUCCESS) {
msg("Reading first page failed.\n");
error = DB_ERROR;
@@ -3808,7 +3893,7 @@ static dberr_t xb_assign_undo_space_start()
retry:
if (os_file_read(IORequestRead, file, page,
TRX_SYS_PAGE_NO << srv_page_size_shift,
- srv_page_size) != DB_SUCCESS) {
+ srv_page_size, nullptr) != DB_SUCCESS) {
msg("Reading TRX_SYS page failed.");
error = DB_ERROR;
goto func_exit;
@@ -3817,7 +3902,8 @@ retry:
/* TRX_SYS page can't be compressed or encrypted. */
if (buf_page_is_corrupted(false, page, fsp_flags)) {
if (n_retries--) {
- os_thread_sleep(1000);
+ std::this_thread::sleep_for(
+ std::chrono::milliseconds(1));
goto retry;
} else {
msg("mariabackup: TRX_SYS page corrupted.\n");
@@ -3930,7 +4016,7 @@ xb_load_tablespaces()
xb_close_undo_tablespaces();
}
- DBUG_MARIABACKUP_EVENT("after_load_tablespaces", 0);
+ DBUG_MARIABACKUP_EVENT("after_load_tablespaces", {});
return(DB_SUCCESS);
}
@@ -3972,22 +4058,16 @@ new hash table */
static
xb_filter_entry_t*
xb_add_filter(
-/*========================*/
const char* name, /*!< in: name of table/database */
hash_table_t* hash) /*!< in/out: hash to insert into */
{
- xb_filter_entry_t* entry;
-
- entry = xb_new_filter_entry(name);
+ xb_filter_entry_t* entry = xb_new_filter_entry(name);
if (UNIV_UNLIKELY(!hash->array)) {
hash->create(1000);
}
- HASH_INSERT(xb_filter_entry_t,
- name_hash, hash,
- ut_fold_string(entry->name),
- entry);
-
+ const ulint fold = my_crc32c(0, entry->name, strlen(entry->name));
+ HASH_INSERT(xb_filter_entry_t, name_hash, hash, fold, entry);
return entry;
}
@@ -4041,8 +4121,9 @@ xb_register_filter_entry(
dbname[p - name] = 0;
if (databases_hash && databases_hash->array) {
+ const ulint fold = my_crc32c(0, dbname, p - name);
HASH_SEARCH(name_hash, databases_hash,
- ut_fold_string(dbname),
+ fold,
xb_filter_entry_t*,
db_entry, (void) 0,
!strcmp(db_entry->name, dbname));
@@ -4251,9 +4332,10 @@ xb_filter_hash_free(hash_table_t* hash)
table = static_cast<xb_filter_entry_t *>
(HASH_GET_NEXT(name_hash, prev_table));
-
+ const ulint fold = my_crc32c(0, prev_table->name,
+ strlen(prev_table->name));
HASH_DELETE(xb_filter_entry_t, name_hash, hash,
- ut_fold_string(prev_table->name), prev_table);
+ fold, prev_table);
free(prev_table);
}
}
@@ -4351,40 +4433,40 @@ end:
static void stop_backup_threads()
{
- if (log_copying_stop && log_copying_running) {
- os_event_set(log_copying_stop);
- fputs("mariabackup: Stopping log copying thread", stderr);
- fflush(stderr);
- while (log_copying_running) {
- putc('.', stderr);
- fflush(stderr);
- os_thread_sleep(200000); /*0.2 sec*/
- }
- putc('\n', stderr);
- os_event_destroy(log_copying_stop);
- }
+ mysql_cond_broadcast(&log_copying_stop);
- if (wait_throttle) {
- /* wait for io_watching_thread completion */
- while (io_watching_thread_running) {
- os_thread_sleep(1000000);
- }
- os_event_destroy(wait_throttle);
- }
+ if (log_copying_running || have_io_watching_thread)
+ {
+ mysql_mutex_unlock(&log_sys.mutex);
+ fputs("mariabackup: Stopping log copying thread", stderr);
+ fflush(stderr);
+ mysql_mutex_lock(&log_sys.mutex);
+ while (log_copying_running || have_io_watching_thread)
+ {
+ mysql_cond_broadcast(&log_copying_stop);
+ mysql_mutex_unlock(&log_sys.mutex);
+ putc('.', stderr);
+ fflush(stderr);
+ std::this_thread::sleep_for(std::chrono::milliseconds(200));
+ mysql_mutex_lock(&log_sys.mutex);
+ }
+ putc('\n', stderr);
+ }
+
+ mysql_cond_destroy(&log_copying_stop);
}
/** Implement the core of --backup
@return whether the operation succeeded */
static bool xtrabackup_backup_low()
{
+ mysql_mutex_lock(&log_sys.mutex);
ut_ad(!metadata_to_lsn);
/* read the latest checkpoint lsn */
{
ulint max_cp_field;
- mysql_mutex_lock(&log_sys.mutex);
-
if (recv_find_max_checkpoint(&max_cp_field) == DB_SUCCESS
&& log_sys.log.format != 0) {
if (max_cp_field == LOG_CHECKPOINT_1) {
@@ -4400,16 +4482,17 @@ static bool xtrabackup_backup_low()
} else {
msg("Error: recv_find_max_checkpoint() failed.");
}
- mysql_mutex_unlock(&log_sys.mutex);
- }
- stop_backup_threads();
+ stop_backup_threads();
+ }
if (metadata_to_lsn && xtrabackup_copy_logfile(true)) {
+ mysql_mutex_unlock(&log_sys.mutex);
ds_close(dst_log_file);
dst_log_file = NULL;
return false;
}
+ mysql_mutex_unlock(&log_sys.mutex);
if (ds_close(dst_log_file) || !metadata_to_lsn) {
dst_log_file = NULL;
@@ -4512,15 +4595,22 @@ static bool xtrabackup_backup_func()
srv_operation = SRV_OPERATION_BACKUP;
log_file_op = backup_file_op;
undo_space_trunc = backup_undo_trunc;
+ first_page_init = backup_first_page_op;
metadata_to_lsn = 0;
/* initialize components */
if(innodb_init_param()) {
fail:
- metadata_to_lsn = log_copying_running;
- stop_backup_threads();
+ if (log_copying_running) {
+ mysql_mutex_lock(&log_sys.mutex);
+ metadata_to_lsn = 1;
+ stop_backup_threads();
+ mysql_mutex_unlock(&log_sys.mutex);
+ }
+
log_file_op = NULL;
undo_space_trunc = NULL;
+ first_page_init = NULL;
if (dst_log_file) {
ds_close(dst_log_file);
dst_log_file = NULL;
@@ -4548,7 +4638,6 @@ fail:
computers */
}
srv_thread_pool_init();
- sync_check_init();
/* Reset the system variables in the recovery module. */
trx_pool_init();
recv_sys.create();
@@ -4677,13 +4766,15 @@ reread_log_header:
aligned_free(log_hdr_buf);
log_copying_running = true;
+
+ mysql_cond_init(0, &log_copying_stop, nullptr);
+
/* start io throttle */
- if(xtrabackup_throttle) {
+ if (xtrabackup_throttle) {
io_ticket = xtrabackup_throttle;
- wait_throttle = os_event_create(0);
- io_watching_thread_running = true;
-
- os_thread_create(io_watching_thread);
+ have_io_watching_thread = true;
+ mysql_cond_init(0, &wait_throttle, nullptr);
+ std::thread(io_watching_thread).detach();
}
/* Populate fil_system with tablespaces to copy */
@@ -4700,13 +4791,18 @@ fail_before_log_copying_thread_start:
log_copy_scanned_lsn = checkpoint_lsn_start;
recv_sys.recovered_lsn = log_copy_scanned_lsn;
- if (xtrabackup_copy_logfile())
+ mysql_mutex_lock(&log_sys.mutex);
+
+ const bool log_copy_failed = xtrabackup_copy_logfile();
+
+ mysql_mutex_unlock(&log_sys.mutex);
+
+ if (log_copy_failed)
goto fail_before_log_copying_thread_start;
- DBUG_MARIABACKUP_EVENT("before_innodb_log_copy_thread_started",0);
+ DBUG_MARIABACKUP_EVENT("before_innodb_log_copy_thread_started", {});
- log_copying_stop = os_event_create(0);
- os_thread_create(log_copying_thread);
+ std::thread(log_copying_thread).detach();
/* FLUSH CHANGED_PAGE_BITMAPS call */
if (!flush_changed_page_bitmaps()) {
@@ -4724,16 +4820,11 @@ fail_before_log_copying_thread_start:
mdl_lock_all();
DBUG_EXECUTE_IF("check_mdl_lock_works",
- dbug_alter_thread_done =
dbug_start_query_thread("ALTER TABLE test.t ADD COLUMN mdl_lock_column int",
"Waiting for table metadata lock", 0, 0););
}
- datafiles_iter_t *it = datafiles_iter_new();
- if (it == NULL) {
- msg("mariabackup: Error: datafiles_iter_new() failed.");
- goto fail;
- }
+ datafiles_iter_t it;
/* Create data copying threads */
data_threads = (data_thread_ctxt_t *)
@@ -4742,18 +4833,17 @@ fail_before_log_copying_thread_start:
pthread_mutex_init(&count_mutex, NULL);
for (i = 0; i < (uint) xtrabackup_parallel; i++) {
- data_threads[i].it = it;
+ data_threads[i].it = &it;
data_threads[i].num = i+1;
data_threads[i].count = &count;
data_threads[i].count_mutex = &count_mutex;
data_threads[i].corrupted_pages = &corrupted_pages;
- data_threads[i].id = os_thread_create(data_copy_thread_func,
- data_threads + i);
+ std::thread(data_copy_thread_func, data_threads + i).detach();
}
/* Wait for threads to exit */
while (1) {
- os_thread_sleep(1000000);
+ std::this_thread::sleep_for(std::chrono::seconds(1));
pthread_mutex_lock(&count_mutex);
bool stop = count == 0;
pthread_mutex_unlock(&count_mutex);
@@ -4764,7 +4854,6 @@ fail_before_log_copying_thread_start:
pthread_mutex_destroy(&count_mutex);
free(data_threads);
- datafiles_iter_free(it);
}
bool ok = backup_start(corrupted_pages);
@@ -4775,9 +4864,7 @@ fail_before_log_copying_thread_start:
backup_release();
DBUG_EXECUTE_IF("check_mdl_lock_works",
- os_event_wait(dbug_alter_thread_done);
- os_event_destroy(dbug_alter_thread_done);
- );
+ pthread_join(dbug_alter_thread, nullptr););
if (ok) {
backup_finish();
@@ -4813,6 +4900,7 @@ fail_before_log_copying_thread_start:
innodb_shutdown();
log_file_op = NULL;
undo_space_trunc = NULL;
+ first_page_init = NULL;
pthread_mutex_destroy(&backup_mutex);
pthread_cond_destroy(&scanned_lsn_cond);
if (!corrupted_pages.empty()) {
@@ -4844,16 +4932,16 @@ FTWRL. This ensures consistent backup in presence of DDL.
*/
void backup_fix_ddl(CorruptedPages &corrupted_pages)
{
- std::set<std::string> new_tables;
std::set<std::string> dropped_tables;
std::map<std::string, std::string> renamed_tables;
+ space_id_to_name_t new_tables;
/* Disable further DDL on backed up tables (only needed for --no-lock).*/
pthread_mutex_lock(&backup_mutex);
log_file_op = backup_file_op_fail;
pthread_mutex_unlock(&backup_mutex);
- DBUG_MARIABACKUP_EVENT("backup_fix_ddl",0);
+ DBUG_MARIABACKUP_EVENT("backup_fix_ddl", {});
for (space_id_to_name_t::iterator iter = ddl_tracker.tables_in_backup.begin();
iter != ddl_tracker.tables_in_backup.end();
@@ -4894,9 +4982,11 @@ void backup_fix_ddl(CorruptedPages &corrupted_pages)
continue;
}
- if (ddl_tracker.drops.find(id) == ddl_tracker.drops.end()) {
+ if (ddl_tracker.drops.find(id) == ddl_tracker.drops.end()
+ && ddl_tracker.deferred_tables.find(id)
+ == ddl_tracker.deferred_tables.end()) {
dropped_tables.erase(name);
- new_tables.insert(name);
+ new_tables[id] = name;
if (opt_log_innodb_page_corruption)
corrupted_pages.drop_space(id);
}
@@ -4921,10 +5011,8 @@ void backup_fix_ddl(CorruptedPages &corrupted_pages)
// Load and copy new tables.
// Close all datanodes first, reload only new tables.
std::vector<fil_node_t *> all_nodes;
- datafiles_iter_t *it = datafiles_iter_new();
- if (!it)
- return;
- while (fil_node_t *node = datafiles_iter_next(it)) {
+ datafiles_iter_t it;
+ while (fil_node_t *node = datafiles_iter_next(&it)) {
all_nodes.push_back(node);
}
for (size_t i = 0; i < all_nodes.size(); i++) {
@@ -4932,38 +5020,69 @@ void backup_fix_ddl(CorruptedPages &corrupted_pages)
if (n->space->id == 0)
continue;
if (n->is_open()) {
- mutex_enter(&fil_system.mutex);
+ mysql_mutex_lock(&fil_system.mutex);
n->close();
- mutex_exit(&fil_system.mutex);
+ mysql_mutex_unlock(&fil_system.mutex);
}
fil_space_free(n->space->id, false);
}
- datafiles_iter_free(it);
DBUG_EXECUTE_IF("check_mdl_lock_works", DBUG_ASSERT(new_tables.size() == 0););
- for (std::set<std::string>::iterator iter = new_tables.begin();
- iter != new_tables.end(); iter++) {
- const char *space_name = iter->c_str();
- if (check_if_skip_table(space_name))
+
+ srv_operation = SRV_OPERATION_BACKUP_NO_DEFER;
+
+ /* Mariabackup detected the FILE_MODIFY or FILE_RENAME
+ for the deferred tablespace. So it needs to read the
+ tablespace again if innodb doesn't have page0 initialization
+ redo log for it */
+ for (space_id_to_name_t::iterator iter =
+ ddl_tracker.deferred_tables.begin();
+ iter != ddl_tracker.deferred_tables.end();
+ iter++) {
+ if (check_if_skip_table(iter->second.c_str())) {
continue;
- xb_load_single_table_tablespace(*iter, false);
+ }
+
+ if (first_page_init_ids.find(iter->first)
+ != first_page_init_ids.end()) {
+ new_tables[iter->first] = iter->second.c_str();
+ continue;
+ }
+
+ xb_load_single_table_tablespace(iter->second, false);
}
- it = datafiles_iter_new();
- if (!it)
- return;
+ /* Mariabackup doesn't detect any FILE_OP for the deferred
+	tablespace. There is a possibility that page0 could've
+	been corrupted persistently on the disk */
+ for (auto space_name: defer_space_names) {
+ if (!check_if_skip_table(space_name.c_str())) {
+ xb_load_single_table_tablespace(
+ space_name, false);
+ }
+ }
+
+ srv_operation = SRV_OPERATION_BACKUP;
+
+ for (const auto &t : new_tables) {
+ if (!check_if_skip_table(t.second.c_str())) {
+ xb_load_single_table_tablespace(t.second, false,
+ t.first);
+ }
+ }
- while (fil_node_t *node = datafiles_iter_next(it)) {
- fil_space_t * space = node->space;
- if (!fil_is_user_tablespace_id(space->id))
+ datafiles_iter_t it2;
+
+ while (fil_node_t *node = datafiles_iter_next(&it2)) {
+ if (!fil_is_user_tablespace_id(node->space->id))
continue;
- std::string dest_name(node->space->name);
+ std::string dest_name= filename_to_spacename(
+ node->name, strlen(node->name));
dest_name.append(".new");
+
xtrabackup_copy_datafile(node, 0, dest_name.c_str(), wf_write_through,
corrupted_pages);
}
-
- datafiles_iter_free(it);
}
/* ================= prepare ================= */
@@ -5026,64 +5145,20 @@ xb_space_create_file(
return ret;
}
- /* Align the memory for file i/o if we might have O_DIRECT set */
- byte* page = static_cast<byte*>(aligned_malloc(2 * srv_page_size,
- srv_page_size));
-
- memset(page, '\0', srv_page_size);
-
- fsp_header_init_fields(page, space_id, flags);
- mach_write_to_4(page + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, space_id);
-
- const ulint zip_size = fil_space_t::zip_size(flags);
-
- if (!zip_size) {
- buf_flush_init_for_writing(
- NULL, page, NULL,
- fil_space_t::full_crc32(flags));
-
- ret = os_file_write(IORequestWrite, path, *file, page, 0,
- srv_page_size);
- } else {
- page_zip_des_t page_zip;
- page_zip_set_size(&page_zip, zip_size);
- page_zip.data = page + srv_page_size;
- fprintf(stderr, "zip_size = " ULINTPF "\n", zip_size);
-
-#ifdef UNIV_DEBUG
- page_zip.m_start = 0;
-#endif /* UNIV_DEBUG */
- page_zip.m_end = 0;
- page_zip.m_nonempty = 0;
- page_zip.n_blobs = 0;
-
- buf_flush_init_for_writing(NULL, page, &page_zip, false);
-
- ret = os_file_write(IORequestWrite, path, *file,
- page_zip.data, 0, zip_size);
- }
-
- aligned_free(page);
-
- if (ret != DB_SUCCESS) {
- msg("mariabackup: could not write the first page to %s",
- path);
- os_file_close(*file);
- os_file_delete(0, path);
- return ret;
- }
-
return TRUE;
}
static fil_space_t* fil_space_get_by_name(const char* name)
{
- ut_ad(mutex_own(&fil_system.mutex));
- for (fil_space_t* space = UT_LIST_GET_FIRST(fil_system.space_list);
- space != NULL;
- space = UT_LIST_GET_NEXT(space_list, space))
- if (!strcmp(space->name, name)) return space;
- return NULL;
+ mysql_mutex_assert_owner(&fil_system.mutex);
+ for (fil_space_t &space : fil_system.space_list)
+ if (space.chain.start)
+ if (const char *str= strstr(space.chain.start->name, name))
+ if (!strcmp(str + strlen(name), ".ibd") &&
+ (str == space.chain.start->name ||
+ IF_WIN(str[-1] == '\\' ||,) str[-1] == '/'))
+ return &space;
+ return nullptr;
}
/***********************************************************************
@@ -5118,20 +5193,15 @@ xb_delta_open_matching_space(
if (dbname) {
snprintf(dest_dir, FN_REFLEN, "%s/%s",
xtrabackup_target_dir, dbname);
- os_normalize_path(dest_dir);
-
snprintf(dest_space_name, FN_REFLEN, "%s/%s", dbname, name);
} else {
snprintf(dest_dir, FN_REFLEN, "%s", xtrabackup_target_dir);
- os_normalize_path(dest_dir);
-
snprintf(dest_space_name, FN_REFLEN, "%s", name);
}
snprintf(real_name, real_name_len,
"%s/%s",
xtrabackup_target_dir, dest_space_name);
- os_normalize_path(real_name);
/* Truncate ".ibd" */
dest_space_name[strlen(dest_space_name) - 4] = '\0';
@@ -5174,19 +5244,21 @@ exit:
return file;
}
+ const size_t len = strlen(dest_space_name);
/* remember space name for further reference */
table = static_cast<xb_filter_entry_t *>
(malloc(sizeof(xb_filter_entry_t) +
- strlen(dest_space_name) + 1));
+ len + 1));
table->name = ((char*)table) + sizeof(xb_filter_entry_t);
- strcpy(table->name, dest_space_name);
+ memcpy(table->name, dest_space_name, len + 1);
+ const ulint fold = my_crc32c(0, dest_space_name, len);
HASH_INSERT(xb_filter_entry_t, name_hash, &inc_dir_tables_hash,
- ut_fold_string(table->name), table);
+ fold, table);
- mutex_enter(&fil_system.mutex);
+ mysql_mutex_lock(&fil_system.mutex);
fil_space = fil_space_get_by_name(dest_space_name);
- mutex_exit(&fil_system.mutex);
+ mysql_mutex_unlock(&fil_system.mutex);
if (fil_space != NULL) {
if (fil_space->id == info.space_id
@@ -5201,12 +5273,11 @@ exit:
dbname, fil_space->id);
msg("mariabackup: Renaming %s to %s.ibd",
- fil_space->name, tmpname);
+ fil_space->chain.start->name, tmpname);
- if (fil_space->rename(tmpname, NULL, false)
- != DB_SUCCESS) {
+ if (fil_space->rename(tmpname, false) != DB_SUCCESS) {
msg("mariabackup: Cannot rename %s to %s",
- fil_space->name, tmpname);
+ fil_space->chain.start->name, tmpname);
goto exit;
}
}
@@ -5217,21 +5288,20 @@ exit:
die("Can't handle DDL operation on tablespace "
"%s\n", dest_space_name);
}
- mutex_enter(&fil_system.mutex);
+ mysql_mutex_lock(&fil_system.mutex);
fil_space = fil_space_get_by_id(info.space_id);
- mutex_exit(&fil_system.mutex);
+ mysql_mutex_unlock(&fil_system.mutex);
if (fil_space != NULL) {
char tmpname[FN_REFLEN];
- strncpy(tmpname, dest_space_name, FN_REFLEN);
+ snprintf(tmpname, sizeof tmpname, "%s.ibd", dest_space_name);
msg("mariabackup: Renaming %s to %s",
- fil_space->name, dest_space_name);
+ fil_space->chain.start->name, tmpname);
- if (fil_space->rename(tmpname, NULL, false) != DB_SUCCESS)
- {
+ if (fil_space->rename(tmpname, false) != DB_SUCCESS) {
msg("mariabackup: Cannot rename %s to %s",
- fil_space->name, dest_space_name);
+ fil_space->chain.start->name, tmpname);
goto exit;
}
@@ -5254,7 +5324,7 @@ exit:
ut_ad(fil_space_t::zip_size(flags) == info.zip_size);
ut_ad(fil_space_t::physical_size(flags) == info.page_size);
- if (fil_space_t::create(dest_space_name, info.space_id, flags,
+ if (fil_space_t::create(info.space_id, flags,
FIL_TYPE_TABLESPACE, 0, FIL_ENCRYPTION_DEFAULT,
true)) {
*success = xb_space_create_file(real_name, info.space_id,
@@ -5320,10 +5390,6 @@ xtrabackup_apply_delta(
goto error;
}
- os_normalize_path(dst_path);
- os_normalize_path(src_path);
- os_normalize_path(meta_path);
-
if (!xb_read_delta_metadata(meta_path, &info)) {
goto error;
}
@@ -5374,7 +5440,8 @@ xtrabackup_apply_delta(
offset = ((incremental_buffers * (page_size / 4))
<< page_size_shift);
if (os_file_read(IORequestRead, src_file,
- incremental_buffer, offset, page_size)
+ incremental_buffer, offset, page_size,
+ nullptr)
!= DB_SUCCESS) {
goto error;
}
@@ -5407,7 +5474,7 @@ xtrabackup_apply_delta(
/* read whole of the cluster */
if (os_file_read(IORequestRead, src_file,
incremental_buffer,
- offset, page_in_buffer * page_size)
+ offset, page_in_buffer * page_size, nullptr)
!= DB_SUCCESS) {
goto error;
}
@@ -5602,8 +5669,10 @@ static ibool prepare_handle_new_files(const char *data_home_dir,
(malloc(sizeof(xb_filter_entry_t) + table_name.size() + 1));
table->name = ((char*)table) + sizeof(xb_filter_entry_t);
strcpy(table->name, table_name.c_str());
+ const ulint fold = my_crc32c(0, table->name,
+ table_name.size());
HASH_INSERT(xb_filter_entry_t, name_hash, &inc_dir_tables_hash,
- ut_fold_string(table->name), table);
+ fold, table);
}
return TRUE;
@@ -5626,9 +5695,11 @@ rm_if_not_found(
snprintf(name, FN_REFLEN, "%s/%s", db_name, file_name);
/* Truncate ".ibd" */
- name[strlen(name) - 4] = '\0';
+ const size_t len = strlen(name) - 4;
+ name[len] = '\0';
+ const ulint fold = my_crc32c(0, name, len);
- HASH_SEARCH(name_hash, &inc_dir_tables_hash, ut_fold_string(name),
+ HASH_SEARCH(name_hash, &inc_dir_tables_hash, fold,
xb_filter_entry_t*,
table, (void) 0,
!strcmp(table->name, name));
@@ -5725,8 +5796,6 @@ next_file_item_1:
OS_FILE_MAX_PATH/2-1,
dbinfo.name);
- os_normalize_path(dbpath);
-
dbdir = os_file_opendir(dbpath);
if (dbdir != IF_WIN(INVALID_HANDLE_VALUE, nullptr)) {
@@ -5846,8 +5915,7 @@ static void rename_table_in_prepare(const std::string &datadir, const std::strin
if (file_exists(dest)) {
ren2= std::string(datadir) + "/" + to + ".ren";
if (!file_exists(ren2)) {
- msg("ERROR : File %s was not found, but expected during rename processing\n", ren2.c_str());
- ut_a(0);
+ die("ERROR : File %s was not found, but expected during rename processing\n", ren2.c_str());
}
tmp = to + "#";
rename_table_in_prepare(datadir, to, tmp);
@@ -5994,7 +6062,6 @@ static bool xtrabackup_prepare_func(char** argv)
goto error_cleanup;
}
- sync_check_init();
recv_sys.create();
log_sys.create();
recv_sys.recovery_on = true;
@@ -6023,7 +6090,6 @@ static bool xtrabackup_prepare_func(char** argv)
fil_system.close();
innodb_free_param();
log_sys.close();
- sync_check_close();
if (!ok) goto error_cleanup;
}
@@ -6841,7 +6907,7 @@ static int main_low(char** argv)
/* get default temporary directory */
if (!opt_mysql_tmpdir || !opt_mysql_tmpdir[0]) {
opt_mysql_tmpdir = getenv("TMPDIR");
-#if defined(__WIN__)
+#if defined(_WIN32)
if (!opt_mysql_tmpdir) {
opt_mysql_tmpdir = getenv("TEMP");
}
diff --git a/extra/mariabackup/xtrabackup.h b/extra/mariabackup/xtrabackup.h
index 394ea9ed87c..ff7adf49252 100644
--- a/extra/mariabackup/xtrabackup.h
+++ b/extra/mariabackup/xtrabackup.h
@@ -286,4 +286,16 @@ fil_file_readdir_next_file(
os_file_dir_t dir, /*!< in: directory stream */
os_file_stat_t* info); /*!< in/out: buffer where the
info is returned */
+
+#ifndef DBUG_OFF
+#include <fil0fil.h>
+extern void dbug_mariabackup_event(const char *event,
+ const fil_space_t::name_type key);
+
+#define DBUG_MARIABACKUP_EVENT(A, B) \
+ DBUG_EXECUTE_IF("mariabackup_events", dbug_mariabackup_event(A, B);)
+#else
+#define DBUG_MARIABACKUP_EVENT(A, B) /* empty */
+#endif // DBUG_OFF
+
#endif /* XB_XTRABACKUP_H */
diff --git a/extra/my_print_defaults.c b/extra/my_print_defaults.c
index 83eb0bacf08..b7f52382721 100644
--- a/extra/my_print_defaults.c
+++ b/extra/my_print_defaults.c
@@ -106,7 +106,7 @@ static void usage()
static my_bool
get_one_option(const struct my_option *opt __attribute__((unused)),
const char *argument __attribute__((unused)),
- const char *filename)
+ const char *filename __attribute__((unused)))
{
switch (opt->id) {
case 'c':
diff --git a/extra/resolve_stack_dump.c b/extra/resolve_stack_dump.c
index cb4d6ed33e5..fe2f297fd33 100644
--- a/extra/resolve_stack_dump.c
+++ b/extra/resolve_stack_dump.c
@@ -177,7 +177,7 @@ static void open_files()
/* if name not given, assume stdin*/
if (!sym_fname)
- die("Please run nm --numeric-sort on mysqld binary that produced stack \
+ die("Please run nm --numeric-sort on mariadbd binary that produced stack \
trace dump and specify the path to it with -s or --symbols-file");
if (!(fp_sym = my_fopen(sym_fname, O_RDONLY, MYF(MY_WME))))
die("Could not open %s", sym_fname);
diff --git a/extra/wolfssl/CMakeLists.txt b/extra/wolfssl/CMakeLists.txt
index 0aee5865867..390a618ac91 100644
--- a/extra/wolfssl/CMakeLists.txt
+++ b/extra/wolfssl/CMakeLists.txt
@@ -1,4 +1,4 @@
-IF(MSVC)
+IF(MSVC_INTEL)
PROJECT(wolfssl C ASM_MASM)
ELSEIF(CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|amd64")
PROJECT(wolfssl C ASM)
@@ -7,7 +7,7 @@ ELSE()
ENDIF()
IF(CMAKE_SIZEOF_VOID_P MATCHES 8)
-IF(MSVC)
+IF(MSVC_INTEL)
SET(WOLFSSL_INTELASM ON)
SET(WOLFSSL_X86_64_BUILD 1)
SET(HAVE_INTEL_RDSEED 1)
diff --git a/include/aligned.h b/include/aligned.h
new file mode 100644
index 00000000000..81bd5d3f6f7
--- /dev/null
+++ b/include/aligned.h
@@ -0,0 +1,38 @@
+/*
+ Copyright (c) 2022, MariaDB Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */
+
+#if defined __linux__
+# include <malloc.h>
+#endif
+
+inline void *aligned_malloc(size_t size, size_t alignment)
+{
+#ifdef _WIN32
+ return _aligned_malloc(size, alignment);
+#elif defined __linux__
+ return memalign(alignment, size);
+#else
+ void *result;
+ if (posix_memalign(&result, alignment, size))
+ result= NULL;
+ return result;
+#endif
+}
+
+inline void aligned_free(void *ptr)
+{
+ IF_WIN(_aligned_free,free)(ptr);
+}
diff --git a/include/my_atomic_wrapper.h b/include/my_atomic_wrapper.h
index 80debdc6d59..7b35b14d3b7 100644
--- a/include/my_atomic_wrapper.h
+++ b/include/my_atomic_wrapper.h
@@ -48,6 +48,7 @@ public:
operator Type() const { return m.load(); }
Type operator=(const Type i) { store(i); return i; }
Type operator=(const Atomic_relaxed<Type> &rhs) { return *this= Type{rhs}; }
+ Type operator+=(const Type i) { return fetch_add(i); }
Type fetch_add(const Type i, std::memory_order o= std::memory_order_relaxed)
{ return m.fetch_add(i, o); }
Type fetch_sub(const Type i, std::memory_order o= std::memory_order_relaxed)
diff --git a/storage/innobase/CMakeLists.txt b/storage/innobase/CMakeLists.txt
index b41b4389d21..1651557840f 100644
--- a/storage/innobase/CMakeLists.txt
+++ b/storage/innobase/CMakeLists.txt
@@ -54,16 +54,7 @@ ENDIF()
# OS tests
IF(UNIX)
IF(CMAKE_SYSTEM_NAME STREQUAL "Linux")
-
ADD_DEFINITIONS("-D_GNU_SOURCE=1")
-
- CHECK_INCLUDE_FILES (libaio.h HAVE_LIBAIO_H)
- CHECK_LIBRARY_EXISTS(aio io_queue_init "" HAVE_LIBAIO)
-
- IF(HAVE_LIBAIO_H AND HAVE_LIBAIO)
- ADD_DEFINITIONS(-DLINUX_NATIVE_AIO=1)
- LINK_LIBRARIES(aio)
- ENDIF()
IF(HAVE_LIBNUMA)
LINK_LIBRARIES(numa)
ENDIF()
@@ -94,11 +85,6 @@ IF(WITH_INNODB_EXTRA_DEBUG)
ENDIF()
ADD_FEATURE_INFO(INNODB_EXTRA_DEBUG WITH_INNODB_EXTRA_DEBUG "Extra InnoDB debug checks")
-CHECK_FUNCTION_EXISTS(nanosleep HAVE_NANOSLEEP)
-IF(HAVE_NANOSLEEP)
- ADD_DEFINITIONS(-DHAVE_NANOSLEEP=1)
-ENDIF()
-
IF(HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE)
ADD_DEFINITIONS(-DHAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE=1)
ENDIF()
@@ -120,17 +106,6 @@ ENDIF(NOT MSVC)
CHECK_FUNCTION_EXISTS(vasprintf HAVE_VASPRINTF)
-SET(MUTEXTYPE "event" CACHE STRING "Mutex type: event, sys or futex")
-
-IF(MUTEXTYPE MATCHES "event")
- ADD_DEFINITIONS(-DMUTEX_EVENT)
-ELSEIF(MUTEXTYPE MATCHES "futex" AND CMAKE_SYSTEM_NAME STREQUAL "Linux")
- ADD_DEFINITIONS(-DMUTEX_FUTEX)
-ELSE()
- ADD_DEFINITIONS(-DMUTEX_SYS)
-ENDIF()
-
-
# Include directories under innobase
INCLUDE_DIRECTORIES(${CMAKE_SOURCE_DIR}/storage/innobase/include
${CMAKE_SOURCE_DIR}/storage/innobase/handler)
@@ -187,6 +162,7 @@ SET(INNOBASE_SOURCES
dict/dict0stats.cc
dict/dict0stats_bg.cc
dict/dict0defrag_bg.cc
+ dict/drop.cc
eval/eval0eval.cc
eval/eval0proc.cc
fil/fil0fil.cc
@@ -233,7 +209,6 @@ SET(INNOBASE_SOURCES
include/buf0dblwr.h
include/buf0dump.h
include/buf0flu.h
- include/buf0flu.inl
include/buf0lru.h
include/buf0rea.h
include/buf0types.h
@@ -244,7 +219,6 @@ SET(INNOBASE_SOURCES
include/data0types.h
include/db0err.h
include/dict0boot.h
- include/dict0boot.inl
include/dict0crea.h
include/dict0crea.inl
include/dict0defrag_bg.h
@@ -255,8 +229,6 @@ SET(INNOBASE_SOURCES
include/dict0mem.inl
include/dict0pagecompress.h
include/dict0pagecompress.inl
- include/dict0priv.h
- include/dict0priv.inl
include/dict0stats.h
include/dict0stats.inl
include/dict0stats_bg.h
@@ -270,7 +242,6 @@ SET(INNOBASE_SOURCES
include/fil0crypt.h
include/fil0crypt.inl
include/fil0fil.h
- include/fil0fil.inl
include/fil0pagecompress.h
include/fsp0file.h
include/fsp0fsp.h
@@ -290,7 +261,6 @@ SET(INNOBASE_SOURCES
include/fts0types.h
include/fts0types.inl
include/fts0vlc.h
- include/fut0fut.h
include/fut0lst.h
include/gis0geo.h
include/gis0rtree.h
@@ -303,7 +273,6 @@ SET(INNOBASE_SOURCES
include/ha0storage.inl
include/handler0alter.h
include/hash0hash.h
- include/ib0mutex.h
include/ibuf0ibuf.h
include/ibuf0ibuf.inl
include/lock0iter.h
@@ -324,12 +293,9 @@ SET(INNOBASE_SOURCES
include/mem0mem.inl
include/mtr0log.h
include/mtr0mtr.h
- include/mtr0mtr.inl
include/mtr0types.h
- include/os0event.h
include/os0file.h
include/os0file.inl
- include/os0thread.h
include/page0cur.h
include/page0cur.inl
include/page0page.h
@@ -357,7 +323,6 @@ SET(INNOBASE_SOURCES
include/row0import.h
include/row0ins.h
include/row0log.h
- include/row0log.inl
include/row0merge.h
include/row0mysql.h
include/row0purge.h
@@ -365,7 +330,6 @@ SET(INNOBASE_SOURCES
include/row0row.h
include/row0row.inl
include/row0sel.h
- include/row0sel.inl
include/row0types.h
include/row0uins.h
include/row0umod.h
@@ -373,25 +337,20 @@ SET(INNOBASE_SOURCES
include/row0upd.h
include/row0upd.inl
include/row0vers.h
+ include/rw_lock.h
+ include/small_vector.h
include/srv0mon.h
include/srv0mon.inl
include/srv0srv.h
include/srv0start.h
- include/sync0arr.h
- include/sync0arr.inl
- include/sync0debug.h
- include/sync0policy.h
- include/sync0rw.h
- include/sync0rw.inl
- include/sync0sync.h
- include/sync0types.h
+ include/srw_lock.h
+ include/sux_lock.h
+ include/transactional_lock_guard.h
include/trx0i_s.h
include/trx0purge.h
include/trx0rec.h
- include/trx0rec.inl
include/trx0roll.h
include/trx0rseg.h
- include/trx0rseg.inl
include/trx0sys.h
include/trx0trx.h
include/trx0trx.inl
@@ -409,7 +368,6 @@ SET(INNOBASE_SOURCES
include/ut0lst.h
include/ut0mem.h
include/ut0mem.inl
- include/ut0mutex.h
include/ut0new.h
include/ut0pool.h
include/ut0rbt.h
@@ -425,7 +383,6 @@ SET(INNOBASE_SOURCES
lock/lock0iter.cc
lock/lock0prdt.cc
lock/lock0lock.cc
- lock/lock0wait.cc
log/log0log.cc
log/log0recv.cc
log/log0crypt.cc
@@ -433,8 +390,6 @@ SET(INNOBASE_SOURCES
mem/mem0mem.cc
mtr/mtr0mtr.cc
os/os0file.cc
- os/os0event.cc
- os/os0thread.cc
page/page0cur.cc
page/page0page.cc
page/page0zip.cc
@@ -466,10 +421,7 @@ SET(INNOBASE_SOURCES
srv/srv0mon.cc
srv/srv0srv.cc
srv/srv0start.cc
- sync/sync0arr.cc
- sync/sync0rw.cc
- sync/sync0debug.cc
- sync/sync0sync.cc
+ sync/srw_lock.cc
trx/trx0i_s.cc
trx/trx0purge.cc
trx/trx0rec.cc
@@ -488,21 +440,33 @@ SET(INNOBASE_SOURCES
ut/ut0vec.cc
ut/ut0wqueue.cc)
+OPTION(WITH_PMEM "Support redo log in persistent memory" OFF)
+FIND_PACKAGE(PMEM)
+IF(PMEM_FOUND)
+ INCLUDE_DIRECTORIES(${PMEM_INCLUDES})
+ ADD_COMPILE_FLAGS(log/log0log.cc COMPILE_FLAGS "-DHAVE_PMEM")
+ SET(PMEM_LIBRARY ${PMEM_LIBRARIES})
+ELSE()
+ IF(WITH_PMEM)
+ MESSAGE(FATAL_ERROR "WITH_PMEM=ON cannot be satisfied")
+ ENDIF()
+ENDIF()
+
MYSQL_ADD_PLUGIN(innobase ${INNOBASE_SOURCES} STORAGE_ENGINE
MODULE_OUTPUT_NAME ha_innodb
DEFAULT RECOMPILE_FOR_EMBEDDED
LINK_LIBRARIES
${ZLIB_LIBRARY}
+ ${PMEM_LIBRARY}
${NUMA_LIBRARY}
${LIBSYSTEMD}
- ${LINKER_SCRIPT}
- ${LIBPMEM})
+ ${LINKER_SCRIPT})
IF(NOT TARGET innobase)
RETURN()
ENDIF()
-ADD_DEFINITIONS(${SSL_DEFINES})
+ADD_DEFINITIONS(${SSL_DEFINES} ${TPOOL_DEFINES})
# A GCC bug causes crash when compiling these files on ARM64 with -O1+
# Compile them with -O0 as a workaround.
@@ -522,6 +486,20 @@ IF(CMAKE_COMPILER_IS_GNUCXX AND CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64"
COMPILE_FLAGS "-O0"
)
ENDIF()
+
+# Older gcc versions insist on the -mhtm flag for including the
+# htmxlintrin.h header. This is also true for new gcc versions
+# like 11.2.0 in Debian Sid
+# On s390x, because the header defines the high-level intrinsics
+# as not-inline, the header file can only be included by one
+# source file that has -mhtm enabled.
+IF(CMAKE_SYSTEM_PROCESSOR MATCHES "ppc64|powerpc64|s390x"
+ OR CMAKE_SYSTEM_NAME MATCHES "AIX")
+ ADD_COMPILE_FLAGS(
+ sync/srw_lock.cc
+ COMPILE_FLAGS "-mhtm"
+ )
+ENDIF()
IF(MSVC)
IF(CMAKE_SIZEOF_VOID_P EQUAL 8)
ADD_COMPILE_FLAGS(
diff --git a/storage/innobase/btr/btr0btr.cc b/storage/innobase/btr/btr0btr.cc
index 6fd9686304c..0b74d3b4311 100644
--- a/storage/innobase/btr/btr0btr.cc
+++ b/storage/innobase/btr/btr0btr.cc
@@ -2,7 +2,7 @@
Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved.
Copyright (c) 2012, Facebook Inc.
-Copyright (c) 2014, 2022, MariaDB Corporation.
+Copyright (c) 2014, 2023, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -43,8 +43,7 @@ Created 6/2/1994 Heikki Tuuri
#include "gis0geo.h"
#include "dict0boot.h"
#include "row0sel.h" /* row_search_max_autoinc() */
-
-Atomic_counter<uint32_t> btr_validate_index_running;
+#include "log.h"
/**************************************************************//**
Checks if the page in the cursor can be merged with given page.
@@ -59,17 +58,6 @@ btr_can_merge_with_page(
buf_block_t** merge_block, /*!< out: the merge block */
mtr_t* mtr); /*!< in: mini-transaction */
-/** Report that an index page is corrupted.
-@param[in] buffer block
-@param[in] index tree */
-void btr_corruption_report(const buf_block_t* block, const dict_index_t* index)
-{
- ib::fatal()
- << "Flag mismatch in page " << block->page.id()
- << " index " << index->name
- << " of table " << index->table->name;
-}
-
/*
Latching strategy of the InnoDB B-tree
--------------------------------------
@@ -188,25 +176,77 @@ make them consecutive on disk if possible. From the other file segment
we allocate pages for the non-leaf levels of the tree.
*/
-#ifdef UNIV_BTR_DEBUG
-/**************************************************************//**
-Checks a file segment header within a B-tree root page.
-@return TRUE if valid */
-static
-ibool
-btr_root_fseg_validate(
-/*===================*/
- const fseg_header_t* seg_header, /*!< in: segment header */
- ulint space) /*!< in: tablespace identifier */
+/** Check a file segment header within a B-tree root page.
+@param offset file segment header offset
+@param block B-tree root page
+@param space tablespace
+@return whether the segment header is valid */
+static bool btr_root_fseg_validate(ulint offset,
+ const buf_block_t &block,
+ const fil_space_t &space)
{
- ulint offset = mach_read_from_2(seg_header + FSEG_HDR_OFFSET);
+ ut_ad(block.page.id().space() == space.id);
+ const uint16_t hdr= mach_read_from_2(offset + FSEG_HDR_OFFSET +
+ block.page.frame);
+ if (FIL_PAGE_DATA <= hdr && hdr <= srv_page_size - FIL_PAGE_DATA_END &&
+ mach_read_from_4(block.page.frame + offset + FSEG_HDR_SPACE) == space.id)
+ return true;
+ sql_print_error("InnoDB: Index root page " UINT32PF " in %s is corrupted "
+ "at " ULINTPF,
+ block.page.id().page_no(),
+ UT_LIST_GET_FIRST(space.chain)->name);
+ return false;
+}
- ut_a(mach_read_from_4(seg_header + FSEG_HDR_SPACE) == space);
- ut_a(offset >= FIL_PAGE_DATA);
- ut_a(offset <= srv_page_size - FIL_PAGE_DATA_END);
- return(TRUE);
+/** Report a decryption failure. */
+ATTRIBUTE_COLD void btr_decryption_failed(const dict_index_t &index)
+{
+ ib_push_warning(static_cast<void*>(nullptr), DB_DECRYPTION_FAILED,
+ "Table %s is encrypted but encryption service or"
+ " used key_id is not available. "
+ " Can't continue reading table.",
+ index.table->name.m_name);
+ index.table->file_unreadable= true;
+}
+
+/** Get an index page and declare its latching order level.
+@param[in] index index tree
+@param[in] page page number
+@param[in] mode latch mode
+@param[in] merge whether change buffer merge should be attempted
+@param[in,out] mtr mini-transaction
+@param[out] err error code
+@return block */
+buf_block_t *btr_block_get(const dict_index_t &index,
+ uint32_t page, ulint mode, bool merge,
+ mtr_t *mtr, dberr_t *err)
+{
+ dberr_t local_err;
+ if (!err)
+ err= &local_err;
+ buf_block_t *block=
+ buf_page_get_gen(page_id_t{index.table->space->id, page},
+ index.table->space->zip_size(), mode, nullptr, BUF_GET,
+ mtr, err, merge && !index.is_clust());
+ ut_ad(!block == (*err != DB_SUCCESS));
+
+ if (UNIV_LIKELY(block != nullptr))
+ {
+ if (!!page_is_comp(block->page.frame) != index.table->not_redundant() ||
+ btr_page_get_index_id(block->page.frame) != index.id ||
+ !fil_page_index_page_check(block->page.frame) ||
+ index.is_spatial() !=
+ (fil_page_get_type(block->page.frame) == FIL_PAGE_RTREE))
+ {
+ *err= DB_PAGE_CORRUPTED;
+ block= nullptr;
+ }
+ }
+ else if (*err == DB_DECRYPTION_FAILED)
+ btr_decryption_failed(index);
+
+ return block;
}
-#endif /* UNIV_BTR_DEBUG */
/**************************************************************//**
Gets the root node of a tree and x- or s-latches it.
@@ -214,96 +254,74 @@ Gets the root node of a tree and x- or s-latches it.
buf_block_t*
btr_root_block_get(
/*===============*/
- const dict_index_t* index, /*!< in: index tree */
+ dict_index_t* index, /*!< in: index tree */
rw_lock_type_t mode, /*!< in: either RW_S_LATCH
or RW_X_LATCH */
- mtr_t* mtr) /*!< in: mtr */
+ mtr_t* mtr, /*!< in: mtr */
+ dberr_t* err) /*!< out: error code */
{
- if (!index->table || !index->table->space) {
- return NULL;
- }
-
- buf_block_t* block = btr_block_get(*index, index->page, mode, false,
- mtr);
-
- if (!block) {
- index->table->file_unreadable = true;
-
- ib_push_warning(
- static_cast<THD*>(NULL), DB_DECRYPTION_FAILED,
- "Table %s in file %s is encrypted but encryption service or"
- " used key_id is not available. "
- " Can't continue reading table.",
- index->table->name.m_name,
- UT_LIST_GET_FIRST(index->table->space->chain)->name);
-
- return NULL;
- }
-
- btr_assert_not_corrupted(block, index);
+ if (!index->table || !index->table->space)
+ {
+ *err= DB_TABLESPACE_NOT_FOUND;
+ return nullptr;
+ }
-#ifdef UNIV_BTR_DEBUG
- if (!dict_index_is_ibuf(index)) {
- const page_t* root = buf_block_get_frame(block);
+ buf_block_t *block;
+#ifndef BTR_CUR_ADAPT
+ static constexpr buf_block_t *guess= nullptr;
+#else
+ buf_block_t *&guess= btr_search_get_info(index)->root_guess;
+ guess=
+#endif
+ block=
+ buf_page_get_gen(page_id_t{index->table->space->id, index->page},
+ index->table->space->zip_size(), mode, guess, BUF_GET,
+ mtr, err, false);
+ ut_ad(!block == (*err != DB_SUCCESS));
- ut_a(btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_LEAF
- + root, index->table->space_id));
- ut_a(btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_TOP
- + root, index->table->space_id));
- }
-#endif /* UNIV_BTR_DEBUG */
+ if (UNIV_LIKELY(block != nullptr))
+ {
+ if (!!page_is_comp(block->page.frame) != index->table->not_redundant() ||
+ btr_page_get_index_id(block->page.frame) != index->id ||
+ !fil_page_index_page_check(block->page.frame) ||
+ index->is_spatial() !=
+ (fil_page_get_type(block->page.frame) == FIL_PAGE_RTREE))
+ {
+ *err= DB_PAGE_CORRUPTED;
+ block= nullptr;
+ }
+ else if (index->is_ibuf());
+ else if (!btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_LEAF,
+ *block, *index->table->space) ||
+ !btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_TOP,
+ *block, *index->table->space))
+ {
+ *err= DB_CORRUPTION;
+ block= nullptr;
+ }
+ }
+ else if (*err == DB_DECRYPTION_FAILED)
+ btr_decryption_failed(*index);
- return(block);
+ return block;
}
/**************************************************************//**
Gets the root node of a tree and sx-latches it for segment access.
@return root page, sx-latched */
+static
page_t*
btr_root_get(
/*=========*/
- const dict_index_t* index, /*!< in: index tree */
- mtr_t* mtr) /*!< in: mtr */
-{
- /* Intended to be used for segment list access.
- SX lock doesn't block reading user data by other threads.
- And block the segment list access by others.*/
- buf_block_t* root = btr_root_block_get(index, RW_SX_LATCH, mtr);
- return(root ? buf_block_get_frame(root) : NULL);
-}
-
-/**************************************************************//**
-Gets the height of the B-tree (the level of the root, when the leaf
-level is assumed to be 0). The caller must hold an S or X latch on
-the index.
-@return tree height (level of the root) */
-ulint
-btr_height_get(
-/*===========*/
- const dict_index_t* index, /*!< in: index tree */
- mtr_t* mtr) /*!< in/out: mini-transaction */
+ dict_index_t* index, /*!< in: index tree */
+ mtr_t* mtr, /*!< in: mtr */
+ dberr_t* err) /*!< out: error code */
{
- ulint height=0;
- buf_block_t* root_block;
-
- ut_ad(srv_read_only_mode
- || mtr->memo_contains_flagged(&index->lock, MTR_MEMO_S_LOCK
- | MTR_MEMO_X_LOCK
- | MTR_MEMO_SX_LOCK));
-
- /* S latches the page */
- root_block = btr_root_block_get(index, RW_S_LATCH, mtr);
-
- if (root_block) {
- height = btr_page_get_level(buf_block_get_frame(root_block));
-
- /* Release the S latch on the root page. */
- mtr->memo_release(root_block, MTR_MEMO_PAGE_S_FIX);
-
- ut_d(sync_check_unlock(&root_block->lock));
- }
-
- return(height);
+ /* Intended to be used for accessing file segment lists.
+ Concurrent read of other data is allowed. */
+ if (buf_block_t *root= btr_root_block_get(index, RW_SX_LATCH, mtr, err))
+ return root->page.frame;
+ return nullptr;
}
/**************************************************************//**
@@ -361,15 +379,12 @@ btr_root_adjust_on_import(
buf_block_t* block = buf_page_get_gen(
page_id_t(table->space->id, index->page),
table->space->zip_size(), RW_X_LATCH, NULL, BUF_GET,
- __FILE__, __LINE__,
&mtr, &err);
if (!block) {
ut_ad(err != DB_SUCCESS);
goto func_exit;
}
- buf_block_dbg_add_level(block, SYNC_TREE_NODE);
-
page = buf_block_get_frame(block);
page_zip = buf_block_get_page_zip(block);
@@ -393,11 +408,11 @@ btr_root_adjust_on_import(
tf &= ~FSP_FLAGS_MEM_MASK;
if (fil_space_t::is_flags_equal(tf, sf)
|| fil_space_t::is_flags_equal(sf, tf)) {
- mutex_enter(&fil_system.mutex);
+ mysql_mutex_lock(&fil_system.mutex);
table->space->flags = (table->space->flags
& ~FSP_FLAGS_MEM_MASK)
| (tf & FSP_FLAGS_MEM_MASK);
- mutex_exit(&fil_system.mutex);
+ mysql_mutex_unlock(&fil_system.mutex);
err = DB_SUCCESS;
} else {
err = DB_CORRUPTION;
@@ -439,7 +454,7 @@ btr_page_create(
{
ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX));
byte *index_id= my_assume_aligned<2>(PAGE_HEADER + PAGE_INDEX_ID +
- block->frame);
+ block->page.frame);
if (UNIV_LIKELY_NULL(page_zip))
{
@@ -453,20 +468,67 @@ btr_page_create(
{
static_assert(((FIL_PAGE_INDEX & 0xff00) | byte(FIL_PAGE_RTREE)) ==
FIL_PAGE_RTREE, "compatibility");
- mtr->write<1>(*block, FIL_PAGE_TYPE + 1 + block->frame,
+ mtr->write<1>(*block, FIL_PAGE_TYPE + 1 + block->page.frame,
byte(FIL_PAGE_RTREE));
- if (mach_read_from_8(block->frame + FIL_RTREE_SPLIT_SEQ_NUM))
+ if (mach_read_from_8(block->page.frame + FIL_RTREE_SPLIT_SEQ_NUM))
mtr->memset(block, FIL_RTREE_SPLIT_SEQ_NUM, 8, 0);
}
/* Set the level of the new index page */
mtr->write<2,mtr_t::MAYBE_NOP>(*block,
my_assume_aligned<2>(PAGE_HEADER +
PAGE_LEVEL +
- block->frame), level);
+ block->page.frame),
+ level);
mtr->write<8,mtr_t::MAYBE_NOP>(*block, index_id, index->id);
}
}
+buf_block_t *
+mtr_t::get_already_latched(const page_id_t id, mtr_memo_type_t type) const
+{
+ ut_ad(is_active());
+ ut_ad(type == MTR_MEMO_PAGE_X_FIX || type == MTR_MEMO_PAGE_SX_FIX ||
+ type == MTR_MEMO_PAGE_S_FIX);
+ for (ulint i= 0; i < m_memo.size(); i++)
+ {
+ const mtr_memo_slot_t &slot= m_memo[i];
+ const auto slot_type= mtr_memo_type_t(slot.type & ~MTR_MEMO_MODIFY);
+ if (slot_type == MTR_MEMO_PAGE_X_FIX || slot_type == type)
+ {
+ buf_block_t *block= static_cast<buf_block_t*>(slot.object);
+ if (block->page.id() == id)
+ return block;
+ }
+ }
+ return nullptr;
+}
+
+/** Fetch an index root page that was already latched in the
+mini-transaction. */
+static buf_block_t *btr_get_latched_root(const dict_index_t &index, mtr_t *mtr)
+{
+ return mtr->get_already_latched(page_id_t{index.table->space_id, index.page},
+ MTR_MEMO_PAGE_SX_FIX);
+}
+
+/** Fetch an index page that should have been already latched in the
+mini-transaction. */
+static buf_block_t *
+btr_block_reget(mtr_t *mtr, const dict_index_t &index,
+ const page_id_t id, rw_lock_type_t rw_latch,
+ dberr_t *err)
+{
+ if (buf_block_t *block=
+ mtr->get_already_latched(id, mtr_memo_type_t(rw_latch)))
+ {
+ *err= DB_SUCCESS;
+ return block;
+ }
+
+ ut_ad(mtr->memo_contains_flagged(&index.lock, MTR_MEMO_X_LOCK));
+ return btr_block_get(index, id.page_no(), rw_latch, true, mtr, err);
+}
+
/**************************************************************//**
Allocates a new file page to be used in an ibuf tree. Takes the page from
the free list of the tree, which must contain pages!
@@ -476,30 +538,25 @@ buf_block_t*
btr_page_alloc_for_ibuf(
/*====================*/
dict_index_t* index, /*!< in: index tree */
- mtr_t* mtr) /*!< in: mtr */
+ mtr_t* mtr, /*!< in: mtr */
+ dberr_t* err) /*!< out: error code */
{
- buf_block_t* new_block;
-
- buf_block_t* root = btr_root_block_get(index, RW_SX_LATCH, mtr);
-
- fil_addr_t node_addr = flst_get_first(PAGE_HEADER
- + PAGE_BTR_IBUF_FREE_LIST
- + root->frame);
- ut_a(node_addr.page != FIL_NULL);
-
- new_block = buf_page_get(
- page_id_t(index->table->space_id, node_addr.page),
- index->table->space->zip_size(),
- RW_X_LATCH, mtr);
-
- buf_block_dbg_add_level(new_block, SYNC_IBUF_TREE_NODE_NEW);
-
- flst_remove(root, PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST,
- new_block, PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST_NODE,
- mtr);
- ut_d(flst_validate(root, PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST, mtr));
-
- return(new_block);
+ buf_block_t *root= btr_get_latched_root(*index, mtr);
+ if (UNIV_UNLIKELY(!root))
+ return root;
+ buf_block_t *new_block=
+ buf_page_get_gen(page_id_t(IBUF_SPACE_ID,
+ mach_read_from_4(PAGE_HEADER +
+ PAGE_BTR_IBUF_FREE_LIST +
+ FLST_FIRST + FIL_ADDR_PAGE +
+ root->page.frame)),
+ 0, RW_X_LATCH, nullptr, BUF_GET, mtr, err);
+ if (new_block)
+ *err= flst_remove(root, PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST, new_block,
+ PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST_NODE, mtr);
+ ut_d(if (*err == DB_SUCCESS)
+ flst_validate(root, PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST, mtr));
+ return new_block;
}
/**************************************************************//**
@@ -518,26 +575,36 @@ btr_page_alloc_low(
in the tree */
mtr_t* mtr, /*!< in/out: mini-transaction
for the allocation */
- mtr_t* init_mtr) /*!< in/out: mtr or another
+ mtr_t* init_mtr, /*!< in/out: mtr or another
mini-transaction in which the
page should be initialized. */
+ dberr_t* err) /*!< out: error code */
{
- page_t* root = btr_root_get(index, mtr);
+ const auto savepoint= mtr->get_savepoint();
+ buf_block_t *root= btr_root_block_get(index, RW_NO_LATCH, mtr, err);
+ if (UNIV_UNLIKELY(!root))
+ return root;
- fseg_header_t* seg_header = (level
- ? PAGE_HEADER + PAGE_BTR_SEG_TOP
- : PAGE_HEADER + PAGE_BTR_SEG_LEAF)
- + root;
-
- /* Parameter TRUE below states that the caller has made the
- reservation for free extents, and thus we know that a page can
- be allocated: */
-
- buf_block_t* block = fseg_alloc_free_page_general(
- seg_header, hint_page_no, file_direction,
- true, mtr, init_mtr);
+ if (mtr->have_u_or_x_latch(*root))
+ {
+#ifdef BTR_CUR_HASH_ADAPT
+ ut_ad(!root->index || !root->index->freed());
+#endif
+ mtr->rollback_to_savepoint(savepoint);
+ }
+ else
+ {
+ mtr->lock_register(savepoint, MTR_MEMO_PAGE_SX_FIX);
+ root->page.lock.u_lock();
+#ifdef BTR_CUR_HASH_ADAPT
+ btr_search_drop_page_hash_index(root, true);
+#endif
+ }
- return block;
+ fseg_header_t *seg_header= root->page.frame +
+ (level ? PAGE_HEADER + PAGE_BTR_SEG_TOP : PAGE_HEADER + PAGE_BTR_SEG_LEAF);
+ return fseg_alloc_free_page_general(seg_header, hint_page_no, file_direction,
+ true, mtr, init_mtr, err);
}
/**************************************************************//**
@@ -555,187 +622,109 @@ btr_page_alloc(
in the tree */
mtr_t* mtr, /*!< in/out: mini-transaction
for the allocation */
- mtr_t* init_mtr) /*!< in/out: mini-transaction
+ mtr_t* init_mtr, /*!< in/out: mini-transaction
for x-latching and initializing
the page */
+ dberr_t* err) /*!< out: error code */
{
- buf_block_t* new_block;
-
- if (dict_index_is_ibuf(index)) {
-
- return(btr_page_alloc_for_ibuf(index, mtr));
- }
-
- new_block = btr_page_alloc_low(
- index, hint_page_no, file_direction, level, mtr, init_mtr);
-
- if (new_block) {
- buf_block_dbg_add_level(new_block, SYNC_TREE_NODE_NEW);
- }
-
- return(new_block);
-}
-
-/**************************************************************//**
-Gets the number of pages in a B-tree.
-@return number of pages, or ULINT_UNDEFINED if the index is unavailable */
-ulint
-btr_get_size(
-/*=========*/
- const dict_index_t* index, /*!< in: index */
- ulint flag, /*!< in: BTR_N_LEAF_PAGES or BTR_TOTAL_SIZE */
- mtr_t* mtr) /*!< in/out: mini-transaction where index
- is s-latched */
-{
- ulint n=0;
-
- ut_ad(srv_read_only_mode
- || mtr->memo_contains(index->lock, MTR_MEMO_SX_LOCK));
- ut_ad(flag == BTR_N_LEAF_PAGES || flag == BTR_TOTAL_SIZE);
-
- if (index->page == FIL_NULL
- || dict_index_is_online_ddl(index)
- || !index->is_committed()
- || !index->table->space) {
- return(ULINT_UNDEFINED);
- }
-
- buf_block_t* root = btr_root_block_get(index, RW_SX_LATCH, mtr);
- if (!root) {
- return ULINT_UNDEFINED;
- }
- mtr_x_lock_space(index->table->space, mtr);
- if (flag == BTR_N_LEAF_PAGES) {
- fseg_n_reserved_pages(*root, PAGE_HEADER + PAGE_BTR_SEG_LEAF
- + root->frame, &n, mtr);
- } else {
- ulint dummy;
- n = fseg_n_reserved_pages(*root, PAGE_HEADER + PAGE_BTR_SEG_TOP
- + root->frame, &dummy, mtr);
- n += fseg_n_reserved_pages(*root,
- PAGE_HEADER + PAGE_BTR_SEG_LEAF
- + root->frame, &dummy, mtr);
- }
-
- return(n);
-}
-
-/**************************************************************//**
-Gets the number of reserved and used pages in a B-tree.
-@return number of pages reserved, or ULINT_UNDEFINED if the index
-is unavailable */
-UNIV_INTERN
-ulint
-btr_get_size_and_reserved(
-/*======================*/
- dict_index_t* index, /*!< in: index */
- ulint flag, /*!< in: BTR_N_LEAF_PAGES or BTR_TOTAL_SIZE */
- ulint* used, /*!< out: number of pages used (<= reserved) */
- mtr_t* mtr) /*!< in/out: mini-transaction where index
- is s-latched */
-{
- ulint dummy;
-
- ut_ad(mtr->memo_contains(index->lock, MTR_MEMO_SX_LOCK));
- ut_a(flag == BTR_N_LEAF_PAGES || flag == BTR_TOTAL_SIZE);
-
- if (index->page == FIL_NULL
- || dict_index_is_online_ddl(index)
- || !index->is_committed()
- || !index->table->space) {
- return(ULINT_UNDEFINED);
- }
-
- buf_block_t* root = btr_root_block_get(index, RW_SX_LATCH, mtr);
- *used = 0;
- if (!root) {
- return ULINT_UNDEFINED;
- }
-
- mtr_x_lock_space(index->table->space, mtr);
-
- ulint n = fseg_n_reserved_pages(*root, PAGE_HEADER + PAGE_BTR_SEG_LEAF
- + root->frame, used, mtr);
- if (flag == BTR_TOTAL_SIZE) {
- n += fseg_n_reserved_pages(*root,
- PAGE_HEADER + PAGE_BTR_SEG_TOP
- + root->frame, &dummy, mtr);
- *used += dummy;
- }
-
- return(n);
+ ut_ad(level < BTR_MAX_NODE_LEVEL);
+ return index->is_ibuf()
+ ? btr_page_alloc_for_ibuf(index, mtr, err)
+ : btr_page_alloc_low(index, hint_page_no, file_direction, level,
+ mtr, init_mtr, err);
}
/**************************************************************//**
Frees a page used in an ibuf tree. Puts the page to the free list of the
ibuf tree. */
static
-void
+dberr_t
btr_page_free_for_ibuf(
/*===================*/
dict_index_t* index, /*!< in: index tree */
buf_block_t* block, /*!< in: block to be freed, x-latched */
mtr_t* mtr) /*!< in: mtr */
{
- ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX));
-
- buf_block_t* root = btr_root_block_get(index, RW_SX_LATCH, mtr);
-
- flst_add_first(root, PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST,
- block, PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST_NODE, mtr);
-
- ut_d(flst_validate(root, PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST, mtr));
+ ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX));
+ buf_block_t *root= btr_get_latched_root(*index, mtr);
+ dberr_t err=
+ flst_add_first(root, PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST,
+ block, PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST_NODE, mtr);
+ ut_d(if (err == DB_SUCCESS)
+ flst_validate(root, PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST, mtr));
+ return err;
}
/** Free an index page.
@param[in,out] index index tree
@param[in,out] block block to be freed
@param[in,out] mtr mini-transaction
-@param[in] blob whether this is freeing a BLOB page */
-void btr_page_free(dict_index_t* index, buf_block_t* block, mtr_t* mtr,
- bool blob)
+@param[in] blob whether this is freeing a BLOB page
+@param[in] latched whether index->table->space->x_lock() was called
+@return error code */
+dberr_t btr_page_free(dict_index_t* index, buf_block_t* block, mtr_t* mtr,
+ bool blob, bool space_latched)
{
- ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX));
+ ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX));
#if defined BTR_CUR_HASH_ADAPT && defined UNIV_DEBUG
- if (block->index
- && !btr_search_check_marked_free_index(block)) {
- ut_ad(!blob);
- ut_ad(page_is_leaf(block->frame));
- }
+ if (btr_search_check_marked_free_index(block))
+ {
+ ut_ad(!blob);
+ ut_ad(page_is_leaf(block->page.frame));
+ }
#endif
- const page_id_t id(block->page.id());
- ut_ad(index->table->space_id == id.space());
- /* The root page is freed by btr_free_root(). */
- ut_ad(id.page_no() != index->page);
- ut_ad(mtr->is_named_space(index->table->space));
+ const uint32_t page{block->page.id().page_no()};
+ ut_ad(index->table->space_id == block->page.id().space());
+ /* The root page is freed by btr_free_root(). */
+ ut_ad(page != index->page);
+ ut_ad(mtr->is_named_space(index->table->space));
- /* The page gets invalid for optimistic searches: increment the frame
- modify clock */
+ /* The page gets invalid for optimistic searches: increment the frame
+ modify clock */
+ buf_block_modify_clock_inc(block);
- buf_block_modify_clock_inc(block);
+ /* TODO: Discard any operations for block from mtr->m_log.
+ The page will be freed, so previous changes to it by this
+ mini-transaction should not matter. */
- if (dict_index_is_ibuf(index)) {
- btr_page_free_for_ibuf(index, block, mtr);
- return;
- }
+ if (index->is_ibuf())
+ return btr_page_free_for_ibuf(index, block, mtr);
- /* TODO: Discard any operations for block from mtr->log.
- The page will be freed, so previous changes to it by this
- mini-transaction should not matter. */
- page_t* root = btr_root_get(index, mtr);
- fseg_header_t* seg_header = &root[blob || page_is_leaf(block->frame)
- ? PAGE_HEADER + PAGE_BTR_SEG_LEAF
- : PAGE_HEADER + PAGE_BTR_SEG_TOP];
- fil_space_t* space= index->table->space;
- const uint32_t page= id.page_no();
+ fil_space_t *space= index->table->space;
+ dberr_t err;
- fseg_free_page(seg_header, space, page, mtr);
- buf_page_free(space, page, mtr, __FILE__, __LINE__);
+ const auto savepoint= mtr->get_savepoint();
+ if (buf_block_t *root= btr_root_block_get(index, RW_NO_LATCH, mtr, &err))
+ {
+ if (mtr->have_u_or_x_latch(*root))
+ {
+#ifdef BTR_CUR_HASH_ADAPT
+ ut_ad(!root->index || !root->index->freed());
+#endif
+ mtr->rollback_to_savepoint(savepoint);
+ }
+ else
+ {
+ mtr->lock_register(savepoint, MTR_MEMO_PAGE_SX_FIX);
+ root->page.lock.u_lock();
+#ifdef BTR_CUR_HASH_ADAPT
+ btr_search_drop_page_hash_index(root, true);
+#endif
+ }
+ err= fseg_free_page(&root->page.frame[blob ||
+ page_is_leaf(block->page.frame)
+ ? PAGE_HEADER + PAGE_BTR_SEG_LEAF
+ : PAGE_HEADER + PAGE_BTR_SEG_TOP],
+ space, page, mtr, space_latched);
+ }
+ if (err == DB_SUCCESS)
+ buf_page_free(space, page, mtr);
- /* The page was marked free in the allocation bitmap, but it
- should remain exclusively latched until mtr_t::commit() or until it
- is explicitly freed from the mini-transaction. */
- ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX));
+ /* The page was marked free in the allocation bitmap, but it
+ should remain exclusively latched until mtr_t::commit() or until it
+ is explicitly freed from the mini-transaction. */
+ ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX));
+ return err;
}
/** Set the child page number in a node pointer record.
@@ -763,6 +752,7 @@ inline void btr_node_ptr_set_child_page_no(buf_block_t *block,
mtr->write<4>(*block, rec + offs - REC_NODE_PTR_SIZE, page_no);
}
+MY_ATTRIBUTE((nonnull(1,2,3,4),warn_unused_result))
/************************************************************//**
Returns the child page of a node pointer and sx-latches it.
@return child page, sx-latched */
@@ -773,7 +763,8 @@ btr_node_ptr_get_child(
const rec_t* node_ptr,/*!< in: node pointer */
dict_index_t* index, /*!< in: index */
const rec_offs* offsets,/*!< in: array returned by rec_get_offsets() */
- mtr_t* mtr) /*!< in: mtr */
+ mtr_t* mtr, /*!< in: mtr */
+ dberr_t* err = nullptr) /*!< out: error code */
{
ut_ad(rec_offs_validate(node_ptr, index, offsets));
ut_ad(index->table->space_id
@@ -782,115 +773,115 @@ btr_node_ptr_get_child(
return btr_block_get(
*index, btr_node_ptr_get_child_page_no(node_ptr, offsets),
RW_SX_LATCH, btr_page_get_level(page_align(node_ptr)) == 1,
- mtr);
+ mtr, err);
}
+MY_ATTRIBUTE((nonnull(2,3,4), warn_unused_result))
/************************************************************//**
Returns the upper level node pointer to a page. It is assumed that mtr holds
an sx-latch on the tree.
@return rec_get_offsets() of the node pointer record */
static
rec_offs*
-btr_page_get_father_node_ptr_func(
-/*==============================*/
+btr_page_get_father_node_ptr_for_validate(
rec_offs* offsets,/*!< in: work area for the return value */
mem_heap_t* heap, /*!< in: memory heap to use */
btr_cur_t* cursor, /*!< in: cursor pointing to user record,
out: cursor on node pointer record,
its page x-latched */
- ulint latch_mode,/*!< in: BTR_CONT_MODIFY_TREE
- or BTR_CONT_SEARCH_TREE */
- const char* file, /*!< in: file name */
- unsigned line, /*!< in: line where called */
mtr_t* mtr) /*!< in: mtr */
{
- dtuple_t* tuple;
- rec_t* user_rec;
- rec_t* node_ptr;
- ulint level;
- ulint page_no;
- dict_index_t* index;
-
- ut_ad(latch_mode == BTR_CONT_MODIFY_TREE
- || latch_mode == BTR_CONT_SEARCH_TREE);
-
- page_no = btr_cur_get_block(cursor)->page.id().page_no();
- index = btr_cur_get_index(cursor);
+ const uint32_t page_no = btr_cur_get_block(cursor)->page.id().page_no();
+ dict_index_t* index = btr_cur_get_index(cursor);
ut_ad(!dict_index_is_spatial(index));
-
- ut_ad(srv_read_only_mode
- || mtr->memo_contains_flagged(&index->lock, MTR_MEMO_X_LOCK
- | MTR_MEMO_SX_LOCK));
-
+ ut_ad(mtr->memo_contains(index->lock, MTR_MEMO_X_LOCK));
ut_ad(dict_index_get_page(index) != page_no);
- level = btr_page_get_level(btr_cur_get_page(cursor));
+ const auto level = btr_page_get_level(btr_cur_get_page(cursor));
- user_rec = btr_cur_get_rec(cursor);
+ const rec_t* user_rec = btr_cur_get_rec(cursor);
ut_a(page_rec_is_user_rec(user_rec));
- tuple = dict_index_build_node_ptr(index, user_rec, 0, heap, level);
- dberr_t err = DB_SUCCESS;
-
- err = btr_cur_search_to_nth_level(
- index, level + 1, tuple,
- PAGE_CUR_LE, latch_mode, cursor,
- file, line, mtr);
-
- if (err != DB_SUCCESS) {
- ib::warn() << " Error code: " << err
- << " btr_page_get_father_node_ptr_func "
- << " level: " << level + 1
- << " called from file: "
- << file << " line: " << line
- << " table: " << index->table->name
- << " index: " << index->name();
+ if (btr_cur_search_to_nth_level(level + 1,
+ dict_index_build_node_ptr(index,
+ user_rec, 0,
+ heap, level),
+ RW_S_LATCH,
+ cursor, mtr) != DB_SUCCESS) {
+ return nullptr;
}
- node_ptr = btr_cur_get_rec(cursor);
+ const rec_t* node_ptr = btr_cur_get_rec(cursor);
offsets = rec_get_offsets(node_ptr, index, offsets, 0,
ULINT_UNDEFINED, &heap);
if (btr_node_ptr_get_child_page_no(node_ptr, offsets) != page_no) {
- rec_t* print_rec;
-
- ib::error()
- << "Corruption of an index tree: table "
- << index->table->name
- << " index " << index->name
- << ", father ptr page no "
- << btr_node_ptr_get_child_page_no(node_ptr, offsets)
- << ", child page no " << page_no;
-
- print_rec = page_rec_get_next(
- page_get_infimum_rec(page_align(user_rec)));
- offsets = rec_get_offsets(print_rec, index, offsets,
- page_rec_is_leaf(user_rec)
- ? index->n_core_fields : 0,
- ULINT_UNDEFINED, &heap);
- page_rec_print(print_rec, offsets);
- offsets = rec_get_offsets(node_ptr, index, offsets, 0,
- ULINT_UNDEFINED, &heap);
- page_rec_print(node_ptr, offsets);
-
- ib::fatal()
- << "You should dump + drop + reimport the table to"
- << " fix the corruption. If the crash happens at"
- << " database startup. " << FORCE_RECOVERY_MSG
- << " Then dump + drop + reimport.";
+ offsets = nullptr;
}
return(offsets);
}
-#define btr_page_get_father_node_ptr(of,heap,cur,mtr) \
- btr_page_get_father_node_ptr_func( \
- of,heap,cur,BTR_CONT_MODIFY_TREE,__FILE__,__LINE__,mtr)
+MY_ATTRIBUTE((nonnull(2,3,4), warn_unused_result))
+/** Return the node pointer to a page.
+@param offsets work area for the return value
+@param heap memory heap
+@param cursor in: child page; out: node pointer to it
+@param mtr mini-transaction
+@return rec_get_offsets() of the node pointer record
+@retval nullptr if the parent page had not been latched in mtr */
+static rec_offs *btr_page_get_parent(rec_offs *offsets, mem_heap_t *heap,
+ btr_cur_t *cursor, mtr_t *mtr)
+{
+ const uint32_t page_no= cursor->block()->page.id().page_no();
+ const dict_index_t *index= cursor->index();
+ ut_ad(!index->is_spatial());
+ ut_ad(index->page != page_no);
+
+ uint32_t p= index->page;
+ auto level= btr_page_get_level(cursor->block()->page.frame);
+ const dtuple_t *tuple=
+ dict_index_build_node_ptr(index, btr_cur_get_rec(cursor), 0, heap, level);
+ level++;
+
+ ulint i;
+ for (i= 0; i < mtr->get_savepoint(); i++)
+ if (buf_block_t *block= mtr->block_at_savepoint(i))
+ if (block->page.id().page_no() == p)
+ {
+ ut_ad(block->page.lock.have_u_or_x() ||
+ (!block->page.lock.have_s() && index->lock.have_x()));
+ ulint up_match= 0, low_match= 0;
+ cursor->page_cur.block= block;
+ if (page_cur_search_with_match(tuple, PAGE_CUR_LE, &up_match,
+ &low_match, &cursor->page_cur,
+ nullptr))
+ return nullptr;
+ offsets= rec_get_offsets(cursor->page_cur.rec, index, offsets, 0,
+ ULINT_UNDEFINED, &heap);
+ p= btr_node_ptr_get_child_page_no(cursor->page_cur.rec, offsets);
+ if (p != page_no)
+ {
+ if (btr_page_get_level(block->page.frame) == level)
+ return nullptr;
+ i= 0; // MDEV-29835 FIXME: require all pages to be latched in order!
+ continue;
+ }
+ ut_ad(block->page.lock.have_u_or_x());
+ if (block->page.lock.have_u_not_x())
+ {
+ /* btr_cur_t::search_leaf(BTR_MODIFY_TREE) only U-latches the
+ root page initially. */
+ ut_ad(block->page.id().page_no() == index->page);
+ block->page.lock.u_x_upgrade();
+ mtr->page_lock_upgrade(*block);
+ }
+ return offsets;
+ }
-#define btr_page_get_father_node_ptr_for_validate(of,heap,cur,mtr) \
- btr_page_get_father_node_ptr_func( \
- of,heap,cur,BTR_CONT_SEARCH_TREE,__FILE__,__LINE__,mtr)
+ return nullptr;
+}
/************************************************************//**
Returns the upper level node pointer to a page. It is assumed that mtr holds
@@ -902,36 +893,33 @@ btr_page_get_father_block(
/*======================*/
rec_offs* offsets,/*!< in: work area for the return value */
mem_heap_t* heap, /*!< in: memory heap to use */
- dict_index_t* index, /*!< in: b-tree index */
- buf_block_t* block, /*!< in: child page in the index */
mtr_t* mtr, /*!< in: mtr */
btr_cur_t* cursor) /*!< out: cursor on node pointer record,
its page x-latched */
{
- rec_t* rec
- = page_rec_get_next(page_get_infimum_rec(buf_block_get_frame(
- block)));
- btr_cur_position(index, rec, block, cursor);
- return(btr_page_get_father_node_ptr(offsets, heap, cursor, mtr));
+ rec_t *rec=
+ page_rec_get_next(page_get_infimum_rec(cursor->block()->page.frame));
+ if (UNIV_UNLIKELY(!rec))
+ return nullptr;
+ cursor->page_cur.rec= rec;
+ return btr_page_get_parent(offsets, heap, cursor, mtr);
}
/** Seek to the parent page of a B-tree page.
-@param[in,out] index b-tree
-@param[in] block child page
@param[in,out] mtr mini-transaction
-@param[out] cursor cursor pointing to the x-latched parent page */
-void btr_page_get_father(dict_index_t* index, buf_block_t* block, mtr_t* mtr,
- btr_cur_t* cursor)
+@param[in,out] cursor cursor pointing to the x-latched parent page
+@return whether the cursor was successfully positioned */
+bool btr_page_get_father(mtr_t* mtr, btr_cur_t* cursor)
{
- mem_heap_t* heap;
- rec_t* rec
- = page_rec_get_next(page_get_infimum_rec(buf_block_get_frame(
- block)));
- btr_cur_position(index, rec, block, cursor);
-
- heap = mem_heap_create(100);
- btr_page_get_father_node_ptr(NULL, heap, cursor, mtr);
- mem_heap_free(heap);
+ rec_t *rec=
+ page_rec_get_next(page_get_infimum_rec(cursor->block()->page.frame));
+ if (UNIV_UNLIKELY(!rec))
+ return false;
+ cursor->page_cur.rec= rec;
+ mem_heap_t *heap= mem_heap_create(100);
+ const bool got= btr_page_get_parent(nullptr, heap, cursor, mtr);
+ mem_heap_free(heap);
+ return got;
}
#ifdef UNIV_DEBUG
@@ -941,27 +929,28 @@ constexpr index_id_t BTR_FREED_INDEX_ID = 0;
/** Free a B-tree root page. btr_free_but_not_root() must already
have been called.
-In a persistent tablespace, the caller must invoke fsp_init_file_page()
-before mtr.commit().
-@param[in,out] block index root page
-@param[in,out] mtr mini-transaction */
-static void btr_free_root(buf_block_t *block, mtr_t *mtr)
+@param block index root page
+@param space tablespace
+@param mtr mini-transaction */
+static void btr_free_root(buf_block_t *block, const fil_space_t &space,
+ mtr_t *mtr)
{
ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX |
MTR_MEMO_PAGE_SX_FIX));
- ut_ad(mtr->is_named_space(block->page.id().space()));
+ ut_ad(mtr->is_named_space(&space));
btr_search_drop_page_hash_index(block, false);
-#ifdef UNIV_BTR_DEBUG
- ut_a(btr_root_fseg_validate(PAGE_HEADER + PAGE_BTR_SEG_TOP + block->frame,
- block->page.id().space()));
-#endif /* UNIV_BTR_DEBUG */
-
- /* Free the entire segment in small steps. */
- while (!fseg_free_step(PAGE_HEADER + PAGE_BTR_SEG_TOP + block->frame, mtr));
+ if (btr_root_fseg_validate(PAGE_HEADER + PAGE_BTR_SEG_TOP, *block, space))
+ {
+ /* Free the entire segment in small steps. */
+ ut_d(mtr->freeing_tree());
+ while (!fseg_free_step(PAGE_HEADER + PAGE_BTR_SEG_TOP +
+ block->page.frame, mtr));
+ }
}
+MY_ATTRIBUTE((warn_unused_result))
/** Prepare to free a B-tree.
@param[in] page_id page id
@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
@@ -969,35 +958,64 @@ static void btr_free_root(buf_block_t *block, mtr_t *mtr)
@param[in,out] mtr mini-transaction
@return root block, to invoke btr_free_but_not_root() and btr_free_root()
@retval NULL if the page is no longer a matching B-tree page */
-static MY_ATTRIBUTE((warn_unused_result))
-buf_block_t*
-btr_free_root_check(
- const page_id_t page_id,
- ulint zip_size,
- index_id_t index_id,
- mtr_t* mtr)
+static
+buf_block_t *btr_free_root_check(const page_id_t page_id, ulint zip_size,
+ index_id_t index_id, mtr_t *mtr)
{
- ut_ad(page_id.space() != SRV_TMP_SPACE_ID);
- ut_ad(index_id != BTR_FREED_INDEX_ID);
-
- buf_block_t* block = buf_page_get(
- page_id, zip_size, RW_X_LATCH, mtr);
+ ut_ad(page_id.space() != SRV_TMP_SPACE_ID);
+ ut_ad(index_id != BTR_FREED_INDEX_ID);
+
+ buf_block_t *block= buf_page_get_gen(page_id, zip_size, RW_X_LATCH,
+ nullptr, BUF_GET_POSSIBLY_FREED, mtr);
+
+ if (!block);
+ else if (fil_page_index_page_check(block->page.frame) &&
+ index_id == btr_page_get_index_id(block->page.frame))
+ /* This should be a root page. It should not be possible to
+ reassign the same index_id for some other index in the
+ tablespace. */
+ ut_ad(!page_has_siblings(block->page.frame));
+ else
+ block= nullptr;
- if (block) {
- buf_block_dbg_add_level(block, SYNC_TREE_NODE);
+ return block;
+}
- if (fil_page_index_page_check(block->frame)
- && index_id == btr_page_get_index_id(block->frame)) {
- /* This should be a root page.
- It should not be possible to reassign the same
- index_id for some other index in the tablespace. */
- ut_ad(!page_has_siblings(block->frame));
- } else {
- block = NULL;
- }
- }
+/** Initialize the root page of the b-tree
+@param[in,out] block root block
+@param[in] index_id index id
+@param[in] index index of root page
+@param[in,out] mtr mini-transaction */
+static void btr_root_page_init(buf_block_t *block, index_id_t index_id,
+ dict_index_t *index, mtr_t *mtr)
+{
+ constexpr uint16_t field= PAGE_HEADER + PAGE_INDEX_ID;
+ byte *page_index_id= my_assume_aligned<2>(field + block->page.frame);
- return(block);
+ /* Create a new index page on the allocated segment page */
+ if (UNIV_LIKELY_NULL(block->page.zip.data))
+ {
+ mach_write_to_8(page_index_id, index_id);
+ ut_ad(!page_has_siblings(block->page.zip.data));
+ page_create_zip(block, index, 0, 0, mtr);
+ }
+ else
+ {
+ page_create(block, mtr, index && index->table->not_redundant());
+ if (index && index->is_spatial())
+ {
+ static_assert(((FIL_PAGE_INDEX & 0xff00) | byte(FIL_PAGE_RTREE)) ==
+ FIL_PAGE_RTREE, "compatibility");
+ mtr->write<1>(*block, FIL_PAGE_TYPE + 1 + block->page.frame,
+ byte(FIL_PAGE_RTREE));
+ if (mach_read_from_8(block->page.frame + FIL_RTREE_SPLIT_SEQ_NUM))
+ mtr->memset(block, FIL_RTREE_SPLIT_SEQ_NUM, 8, 0);
+ }
+ /* Set the level of the new index page */
+ mtr->write<2,mtr_t::MAYBE_NOP>(
+ *block, PAGE_HEADER + PAGE_LEVEL + block->page.frame, 0U);
+ mtr->write<8,mtr_t::MAYBE_NOP>(*block, page_index_id, index_id);
+ }
}
/** Create the root node for a new index tree.
@@ -1006,6 +1024,7 @@ btr_free_root_check(
@param[in,out] space tablespace where created
@param[in] index index, or NULL to create a system table
@param[in,out] mtr mini-transaction
+@param[out] err error code
@return page number of the created root
@retval FIL_NULL if did not succeed */
uint32_t
@@ -1014,12 +1033,14 @@ btr_create(
fil_space_t* space,
index_id_t index_id,
dict_index_t* index,
- mtr_t* mtr)
+ mtr_t* mtr,
+ dberr_t* err)
{
buf_block_t* block;
ut_ad(mtr->is_named_space(space));
ut_ad(index_id != BTR_FREED_INDEX_ID);
+ ut_ad(index || space == fil_system.sys_space);
/* Create the two new segments (one, in the case of an ibuf tree) for
the index tree; the segment headers are put on the allocated root page
@@ -1029,25 +1050,22 @@ btr_create(
if (UNIV_UNLIKELY(type & DICT_IBUF)) {
/* Allocate first the ibuf header page */
buf_block_t* ibuf_hdr_block = fseg_create(
- space, IBUF_HEADER + IBUF_TREE_SEG_HEADER, mtr);
+ space, IBUF_HEADER + IBUF_TREE_SEG_HEADER, mtr, err);
if (ibuf_hdr_block == NULL) {
return(FIL_NULL);
}
- buf_block_dbg_add_level(
- ibuf_hdr_block, SYNC_IBUF_TREE_NODE_NEW);
-
ut_ad(ibuf_hdr_block->page.id().page_no()
== IBUF_HEADER_PAGE_NO);
/* Allocate then the next page to the segment: it will be the
tree root page */
- block = fseg_alloc_free_page(
+ block = fseg_alloc_free_page_general(
buf_block_get_frame(ibuf_hdr_block)
+ IBUF_HEADER + IBUF_TREE_SEG_HEADER,
IBUF_TREE_ROOT_PAGE_NO,
- FSP_UP, mtr);
+ FSP_UP, false, mtr, mtr, err);
if (block == NULL) {
return(FIL_NULL);
@@ -1055,64 +1073,27 @@ btr_create(
ut_ad(block->page.id() == page_id_t(0,IBUF_TREE_ROOT_PAGE_NO));
- buf_block_dbg_add_level(block, SYNC_IBUF_TREE_NODE_NEW);
-
flst_init(block, PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST, mtr);
} else {
block = fseg_create(space, PAGE_HEADER + PAGE_BTR_SEG_TOP,
- mtr);
+ mtr, err);
if (block == NULL) {
return(FIL_NULL);
}
- buf_block_dbg_add_level(block, SYNC_TREE_NODE_NEW);
-
if (!fseg_create(space, PAGE_HEADER + PAGE_BTR_SEG_LEAF, mtr,
- false, block)) {
+ err, false, block)) {
/* Not enough space for new segment, free root
segment before return. */
- btr_free_root(block, mtr);
+ btr_free_root(block, *space, mtr);
return(FIL_NULL);
}
-
- /* The fseg create acquires a second latch on the page,
- therefore we must declare it: */
- buf_block_dbg_add_level(block, SYNC_TREE_NODE_NEW);
}
- ut_ad(!page_has_siblings(block->frame));
+ ut_ad(!page_has_siblings(block->page.frame));
- constexpr uint16_t field = PAGE_HEADER + PAGE_INDEX_ID;
-
- byte* page_index_id = my_assume_aligned<2>(field + block->frame);
-
- /* Create a new index page on the allocated segment page */
- if (UNIV_LIKELY_NULL(block->page.zip.data)) {
- mach_write_to_8(page_index_id, index_id);
- ut_ad(!page_has_siblings(block->page.zip.data));
- page_create_zip(block, index, 0, 0, mtr);
- } else {
- page_create(block, mtr,
- index && index->table->not_redundant());
- if (index && index->is_spatial()) {
- static_assert(((FIL_PAGE_INDEX & 0xff00)
- | byte(FIL_PAGE_RTREE))
- == FIL_PAGE_RTREE, "compatibility");
- mtr->write<1>(*block, FIL_PAGE_TYPE + 1 + block->frame,
- byte(FIL_PAGE_RTREE));
- if (mach_read_from_8(block->frame
- + FIL_RTREE_SPLIT_SEQ_NUM)) {
- mtr->memset(block, FIL_RTREE_SPLIT_SEQ_NUM,
- 8, 0);
- }
- }
- /* Set the level of the new index page */
- mtr->write<2,mtr_t::MAYBE_NOP>(*block, PAGE_HEADER + PAGE_LEVEL
- + block->frame, 0U);
- mtr->write<8,mtr_t::MAYBE_NOP>(*block, page_index_id,
- index_id);
- }
+ btr_root_page_init(block, index_id, index, mtr);
/* We reset the free bits for the page in a separate
mini-transaction to allow creation of several trees in the
@@ -1130,7 +1111,7 @@ btr_create(
allowed size fit on the root page: this fact is needed to ensure
correctness of split algorithms */
- ut_ad(page_get_max_insert_size(block->frame, 2)
+ ut_ad(page_get_max_insert_size(block->page.frame, 2)
> 2 * BTR_PAGE_MAX_REC_SIZE);
return(block->page.id().page_no());
@@ -1144,36 +1125,39 @@ static
void
btr_free_but_not_root(
buf_block_t* block,
- mtr_log_t log_mode)
+ mtr_log_t log_mode
+#ifdef BTR_CUR_HASH_ADAPT
+ ,bool ahi=false
+#endif
+ )
{
mtr_t mtr;
- ut_ad(fil_page_index_page_check(block->frame));
- ut_ad(!page_has_siblings(block->frame));
+ ut_ad(fil_page_index_page_check(block->page.frame));
+ ut_ad(!page_has_siblings(block->page.frame));
leaf_loop:
mtr_start(&mtr);
+ ut_d(mtr.freeing_tree());
mtr_set_log_mode(&mtr, log_mode);
- mtr.set_named_space_id(block->page.id().space());
+ fil_space_t *space = mtr.set_named_space_id(block->page.id().space());
- page_t* root = block->frame;
-
- if (!root) {
+ if (!btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_LEAF,
+ *block, *space)
+ || !btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_TOP,
+ *block, *space)) {
mtr_commit(&mtr);
return;
}
-#ifdef UNIV_BTR_DEBUG
- ut_a(btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_LEAF
- + root, block->page.id().space()));
- ut_a(btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_TOP
- + root, block->page.id().space()));
-#endif /* UNIV_BTR_DEBUG */
-
/* NOTE: page hash indexes are dropped when a page is freed inside
fsp0fsp. */
- bool finished = fseg_free_step(root + PAGE_HEADER + PAGE_BTR_SEG_LEAF,
- &mtr);
+ bool finished = fseg_free_step(PAGE_HEADER + PAGE_BTR_SEG_LEAF
+ + block->page.frame, &mtr
+#ifdef BTR_CUR_HASH_ADAPT
+ , ahi
+#endif /* BTR_CUR_HASH_ADAPT */
+ );
mtr_commit(&mtr);
if (!finished) {
@@ -1183,17 +1167,16 @@ leaf_loop:
top_loop:
mtr_start(&mtr);
mtr_set_log_mode(&mtr, log_mode);
- mtr.set_named_space_id(block->page.id().space());
-
- root = block->frame;
-
-#ifdef UNIV_BTR_DEBUG
- ut_a(btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_TOP
- + root, block->page.id().space()));
-#endif /* UNIV_BTR_DEBUG */
-
- finished = fseg_free_step_not_header(
- root + PAGE_HEADER + PAGE_BTR_SEG_TOP, &mtr);
+ space = mtr.set_named_space_id(block->page.id().space());
+
+ finished = !btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_TOP,
+ *block, *space)
+ || fseg_free_step_not_header(PAGE_HEADER + PAGE_BTR_SEG_TOP
+ + block->page.frame, &mtr
+#ifdef BTR_CUR_HASH_ADAPT
+ ,ahi
+#endif /* BTR_CUR_HASH_ADAPT */
+ );
mtr_commit(&mtr);
if (!finished) {
@@ -1201,45 +1184,90 @@ top_loop:
}
}
-/** Free a persistent index tree if it exists.
-@param[in] page_id root page id
-@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
-@param[in] index_id PAGE_INDEX_ID contents
-@param[in,out] mtr mini-transaction */
-void
-btr_free_if_exists(
- const page_id_t page_id,
- ulint zip_size,
- index_id_t index_id,
- mtr_t* mtr)
+/** Clear the index tree and reinitialize the root page, in the
+rollback of TRX_UNDO_EMPTY. The BTR_SEG_LEAF is freed and reinitialized.
+@param thr query thread
+@return error code */
+TRANSACTIONAL_TARGET
+dberr_t dict_index_t::clear(que_thr_t *thr)
{
- buf_block_t* root = btr_free_root_check(
- page_id, zip_size, index_id, mtr);
+ mtr_t mtr;
+ mtr.start();
+ if (table->is_temporary())
+ mtr.set_log_mode(MTR_LOG_NO_REDO);
+ else
+ set_modified(mtr);
+ mtr_sx_lock_index(this, &mtr);
+
+ dberr_t err;
+ if (buf_block_t *root_block=
+ buf_page_get_gen(page_id_t(table->space->id, page),
+ table->space->zip_size(),
+ RW_X_LATCH, nullptr, BUF_GET, &mtr, &err))
+ {
+ btr_free_but_not_root(root_block, mtr.get_log_mode()
+#ifdef BTR_CUR_HASH_ADAPT
+ ,n_ahi_pages() != 0
+#endif
+ );
- if (root == NULL) {
- return;
- }
+#ifdef BTR_CUR_HASH_ADAPT
+ if (root_block->index)
+ btr_search_drop_page_hash_index(root_block, false);
+ ut_ad(n_ahi_pages() == 0);
+#endif
+ mtr.memset(root_block, PAGE_HEADER + PAGE_BTR_SEG_LEAF,
+ FSEG_HEADER_SIZE, 0);
+ if (fseg_create(table->space, PAGE_HEADER + PAGE_BTR_SEG_LEAF, &mtr,
+ &err, false, root_block))
+ btr_root_page_init(root_block, id, this, &mtr);
+ }
- btr_free_but_not_root(root, mtr->get_log_mode());
- mtr->set_named_space_id(page_id.space());
- btr_free_root(root, mtr);
+ mtr.commit();
+ return err;
}
-/** Free an index tree in a temporary tablespace.
-@param[in] page_id root page id */
-void btr_free(const page_id_t page_id)
+/** Free a persistent index tree if it exists.
+@param[in,out] space tablespce
+@param[in] page root page number
+@param[in] index_id PAGE_INDEX_ID contents
+@param[in,out] mtr mini-transaction */
+void btr_free_if_exists(fil_space_t *space, uint32_t page,
+ index_id_t index_id, mtr_t *mtr)
{
- mtr_t mtr;
- mtr.start();
- mtr.set_log_mode(MTR_LOG_NO_REDO);
-
- buf_block_t* block = buf_page_get(page_id, 0, RW_X_LATCH, &mtr);
+ if (buf_block_t *root= btr_free_root_check(page_id_t(space->id, page),
+ space->zip_size(),
+ index_id, mtr))
+ {
+ btr_free_but_not_root(root, mtr->get_log_mode());
+ mtr->set_named_space(space);
+ btr_free_root(root, *space, mtr);
+ }
+}
- if (block) {
- btr_free_but_not_root(block, MTR_LOG_NO_REDO);
- btr_free_root(block, &mtr);
- }
- mtr.commit();
+/** Drop a temporary table
+@param table temporary table */
+void btr_drop_temporary_table(const dict_table_t &table)
+{
+ ut_ad(table.is_temporary());
+ ut_ad(table.space == fil_system.temp_space);
+ mtr_t mtr;
+ mtr.start();
+ for (const dict_index_t *index= table.indexes.start; index;
+ index= dict_table_get_next_index(index))
+ {
+ if (buf_block_t *block= buf_page_get_low({SRV_TMP_SPACE_ID, index->page}, 0,
+ RW_X_LATCH, nullptr, BUF_GET, &mtr,
+ nullptr, false))
+ {
+ btr_free_but_not_root(block, MTR_LOG_NO_REDO);
+ mtr.set_log_mode(MTR_LOG_NO_REDO);
+ btr_free_root(block, *fil_system.temp_space, &mtr);
+ mtr.commit();
+ mtr.start();
+ }
+ }
+ mtr.commit();
}
/** Read the last used AUTO_INCREMENT value from PAGE_ROOT_AUTO_INC.
@@ -1259,7 +1287,7 @@ btr_read_autoinc(dict_index_t* index)
page_id_t(index->table->space_id, index->page),
index->table->space->zip_size(),
RW_S_LATCH, &mtr)) {
- autoinc = page_get_autoinc(block->frame);
+ autoinc = page_get_autoinc(block->page.frame);
} else {
autoinc = 0;
}
@@ -1292,9 +1320,10 @@ btr_read_autoinc_with_fallback(const dict_table_t* table, unsigned col_no)
index->table->space->zip_size(),
RW_S_LATCH, &mtr);
- ib_uint64_t autoinc = block ? page_get_autoinc(block->frame) : 0;
+ ib_uint64_t autoinc = block
+ ? page_get_autoinc(block->page.frame) : 0;
const bool retry = block && autoinc == 0
- && !page_is_empty(block->frame);
+ && !page_is_empty(block->page.frame);
mtr.commit();
if (retry) {
@@ -1325,53 +1354,57 @@ btr_read_autoinc_with_fallback(const dict_table_t* table, unsigned col_no)
void
btr_write_autoinc(dict_index_t* index, ib_uint64_t autoinc, bool reset)
{
- ut_ad(index->is_primary());
- ut_ad(index->table->persistent_autoinc);
- ut_ad(!index->table->is_temporary());
+ ut_ad(index->is_primary());
+ ut_ad(index->table->persistent_autoinc);
+ ut_ad(!index->table->is_temporary());
+
+ mtr_t mtr;
+ mtr.start();
+ fil_space_t *space= index->table->space;
+ if (buf_block_t *root= buf_page_get(page_id_t(space->id, index->page),
+ space->zip_size(), RW_SX_LATCH, &mtr))
+ {
+ mtr.set_named_space(space);
+ page_set_autoinc(root, autoinc, &mtr, reset);
+ }
- mtr_t mtr;
- mtr.start();
- fil_space_t* space = index->table->space;
- mtr.set_named_space(space);
- page_set_autoinc(buf_page_get(page_id_t(space->id, index->page),
- space->zip_size(),
- RW_SX_LATCH, &mtr),
- autoinc, &mtr, reset);
- mtr.commit();
+ mtr.commit();
}
/** Reorganize an index page.
@param cursor index page cursor
-@param index the index that the cursor belongs to
@param mtr mini-transaction */
-static void btr_page_reorganize_low(page_cur_t *cursor, dict_index_t *index,
- mtr_t *mtr)
+static dberr_t btr_page_reorganize_low(page_cur_t *cursor, mtr_t *mtr)
{
- const mtr_log_t log_mode= mtr->set_log_mode(MTR_LOG_NO_REDO);
-
buf_block_t *const block= cursor->block;
ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX));
ut_ad(!is_buf_block_get_page_zip(block));
- btr_assert_not_corrupted(block, index);
- ut_ad(fil_page_index_page_check(block->frame));
- ut_ad(index->is_dummy ||
- block->page.id().space() == index->table->space->id);
- ut_ad(index->is_dummy || block->page.id().page_no() != index->page ||
- !page_has_siblings(block->frame));
+ ut_ad(fil_page_index_page_check(block->page.frame));
+ ut_ad(cursor->index->is_dummy ||
+ block->page.id().space() == cursor->index->table->space->id);
+ ut_ad(cursor->index->is_dummy ||
+ block->page.id().page_no() != cursor->index->page ||
+ !page_has_siblings(block->page.frame));
- buf_block_t *old= buf_block_alloc();
- /* Copy the old page to temporary space */
- memcpy_aligned<UNIV_PAGE_SIZE_MIN>(old->frame, block->frame, srv_page_size);
+ /* Save the cursor position. */
+ const ulint pos= page_rec_get_n_recs_before(cursor->rec);
+
+ if (UNIV_UNLIKELY(pos == ULINT_UNDEFINED))
+ return DB_CORRUPTION;
btr_search_drop_page_hash_index(block, false);
- /* Save the cursor position. */
- const ulint pos= page_rec_get_n_recs_before(cursor->rec);
+ buf_block_t *old= buf_block_alloc();
+ /* Copy the old page to temporary space */
+ memcpy_aligned<UNIV_PAGE_SIZE_MIN>(old->page.frame, block->page.frame,
+ srv_page_size);
+
+ const mtr_log_t log_mode= mtr->set_log_mode(MTR_LOG_NO_REDO);
- page_create(block, mtr, index->table->not_redundant());
- if (index->is_spatial())
- block->frame[FIL_PAGE_TYPE + 1]= byte(FIL_PAGE_RTREE);
+ page_create(block, mtr, cursor->index->table->not_redundant());
+ if (cursor->index->is_spatial())
+ block->page.frame[FIL_PAGE_TYPE + 1]= byte(FIL_PAGE_RTREE);
static_assert(((FIL_PAGE_INDEX & 0xff00) | byte(FIL_PAGE_RTREE)) ==
FIL_PAGE_RTREE, "compatibility");
@@ -1379,92 +1412,116 @@ static void btr_page_reorganize_low(page_cur_t *cursor, dict_index_t *index,
/* Copy the records from the temporary space to the recreated page;
do not copy the lock bits yet */
- page_copy_rec_list_end_no_locks(block, old, page_get_infimum_rec(old->frame),
- index, mtr);
+ dberr_t err=
+ page_copy_rec_list_end_no_locks(block, old,
+ page_get_infimum_rec(old->page.frame),
+ cursor->index, mtr);
+ mtr->set_log_mode(log_mode);
+
+ if (UNIV_UNLIKELY(err != DB_SUCCESS))
+ return err;
/* Copy the PAGE_MAX_TRX_ID or PAGE_ROOT_AUTO_INC. */
- ut_ad(!page_get_max_trx_id(block->frame));
- memcpy_aligned<8>(PAGE_MAX_TRX_ID + PAGE_HEADER + block->frame,
- PAGE_MAX_TRX_ID + PAGE_HEADER + old->frame, 8);
+ ut_ad(!page_get_max_trx_id(block->page.frame));
+ memcpy_aligned<8>(PAGE_MAX_TRX_ID + PAGE_HEADER + block->page.frame,
+ PAGE_MAX_TRX_ID + PAGE_HEADER + old->page.frame, 8);
#ifdef UNIV_DEBUG
- if (page_get_max_trx_id(block->frame))
+ if (page_get_max_trx_id(block->page.frame))
/* PAGE_MAX_TRX_ID must be zero on non-leaf pages other than
clustered index root pages. */
- ut_ad(dict_index_is_sec_or_ibuf(index)
- ? page_is_leaf(block->frame)
- : block->page.id().page_no() == index->page);
+ ut_ad(dict_index_is_sec_or_ibuf(cursor->index)
+ ? page_is_leaf(block->page.frame)
+ : block->page.id().page_no() == cursor->index->page);
else
/* PAGE_MAX_TRX_ID is unused in clustered index pages (other than
the root where it is repurposed as PAGE_ROOT_AUTO_INC), non-leaf
pages, and in temporary tables. It was always zero-initialized in
page_create(). PAGE_MAX_TRX_ID must be nonzero on
dict_index_is_sec_or_ibuf() leaf pages. */
- ut_ad(index->table->is_temporary() || !page_is_leaf(block->frame) ||
- !dict_index_is_sec_or_ibuf(index));
+ ut_ad(cursor->index->table->is_temporary() ||
+ !page_is_leaf(block->page.frame) ||
+ !dict_index_is_sec_or_ibuf(cursor->index));
#endif
- const uint16_t data_size1= page_get_data_size(old->frame);
- const uint16_t data_size2= page_get_data_size(block->frame);
- const ulint max1= page_get_max_insert_size_after_reorganize(old->frame, 1);
- const ulint max2= page_get_max_insert_size_after_reorganize(block->frame, 1);
+ const uint16_t data_size1= page_get_data_size(old->page.frame);
+ const uint16_t data_size2= page_get_data_size(block->page.frame);
+ const ulint max1=
+ page_get_max_insert_size_after_reorganize(old->page.frame, 1);
+ const ulint max2=
+ page_get_max_insert_size_after_reorganize(block->page.frame, 1);
if (UNIV_UNLIKELY(data_size1 != data_size2 || max1 != max2))
- ib::fatal() << "Page old data size " << data_size1
- << " new data size " << data_size2
- << ", page old max ins size " << max1
- << " new max ins size " << max2;
+ {
+ sql_print_error("InnoDB: Page old data size %u new data size %u"
+ ", page old max ins size %zu new max ins size %zu",
+ data_size1, data_size2, max1, max2);
+ return DB_CORRUPTION;
+ }
/* Restore the cursor position. */
- if (pos)
- cursor->rec = page_rec_get_nth(block->frame, pos);
+ if (!pos)
+ ut_ad(cursor->rec == page_get_infimum_rec(block->page.frame));
+ else if (!(cursor->rec= page_rec_get_nth(block->page.frame, pos)))
+ return DB_CORRUPTION;
+
+ if (block->page.id().page_no() != cursor->index->page ||
+ fil_page_get_type(old->page.frame) != FIL_PAGE_TYPE_INSTANT)
+ ut_ad(!memcmp(old->page.frame, block->page.frame, PAGE_HEADER));
+ else if (!cursor->index->is_instant())
+ {
+ ut_ad(!memcmp(old->page.frame, block->page.frame, FIL_PAGE_TYPE));
+ ut_ad(!memcmp(old->page.frame + FIL_PAGE_TYPE + 2,
+ block->page.frame + FIL_PAGE_TYPE + 2,
+ PAGE_HEADER - FIL_PAGE_TYPE - 2));
+ mtr->write<2,mtr_t::FORCED>(*block, FIL_PAGE_TYPE + block->page.frame,
+ FIL_PAGE_INDEX);
+ }
else
- ut_ad(cursor->rec == page_get_infimum_rec(block->frame));
-
- if (block->page.id().page_no() == index->page &&
- fil_page_get_type(old->frame) == FIL_PAGE_TYPE_INSTANT)
{
/* Preserve the PAGE_INSTANT information. */
- ut_ad(index->is_instant());
- memcpy_aligned<2>(FIL_PAGE_TYPE + block->frame,
- FIL_PAGE_TYPE + old->frame, 2);
- memcpy_aligned<2>(PAGE_HEADER + PAGE_INSTANT + block->frame,
- PAGE_HEADER + PAGE_INSTANT + old->frame, 2);
- if (!index->table->instant);
- else if (page_is_comp(block->frame))
+ memcpy_aligned<2>(FIL_PAGE_TYPE + block->page.frame,
+ FIL_PAGE_TYPE + old->page.frame, 2);
+ memcpy_aligned<2>(PAGE_HEADER + PAGE_INSTANT + block->page.frame,
+ PAGE_HEADER + PAGE_INSTANT + old->page.frame, 2);
+ if (!cursor->index->table->instant);
+ else if (page_is_comp(block->page.frame))
{
- memcpy(PAGE_NEW_INFIMUM + block->frame,
- PAGE_NEW_INFIMUM + old->frame, 8);
- memcpy(PAGE_NEW_SUPREMUM + block->frame,
- PAGE_NEW_SUPREMUM + old->frame, 8);
+ memcpy(PAGE_NEW_INFIMUM + block->page.frame,
+ PAGE_NEW_INFIMUM + old->page.frame, 8);
+ memcpy(PAGE_NEW_SUPREMUM + block->page.frame,
+ PAGE_NEW_SUPREMUM + old->page.frame, 8);
}
else
{
- memcpy(PAGE_OLD_INFIMUM + block->frame,
- PAGE_OLD_INFIMUM + old->frame, 8);
- memcpy(PAGE_OLD_SUPREMUM + block->frame,
- PAGE_OLD_SUPREMUM + old->frame, 8);
+ memcpy(PAGE_OLD_INFIMUM + block->page.frame,
+ PAGE_OLD_INFIMUM + old->page.frame, 8);
+ memcpy(PAGE_OLD_SUPREMUM + block->page.frame,
+ PAGE_OLD_SUPREMUM + old->page.frame, 8);
}
+
+ ut_ad(!memcmp(old->page.frame, block->page.frame, PAGE_HEADER));
}
- ut_ad(!memcmp(old->frame, block->frame, PAGE_HEADER));
- ut_ad(!memcmp(old->frame + PAGE_MAX_TRX_ID + PAGE_HEADER,
- block->frame + PAGE_MAX_TRX_ID + PAGE_HEADER,
+ ut_ad(!memcmp(old->page.frame + PAGE_MAX_TRX_ID + PAGE_HEADER,
+ block->page.frame + PAGE_MAX_TRX_ID + PAGE_HEADER,
PAGE_DATA - (PAGE_MAX_TRX_ID + PAGE_HEADER)));
- if (!dict_table_is_locking_disabled(index->table))
+ if (!cursor->index->has_locking());
+ else if (cursor->index->page == FIL_NULL)
+ ut_ad(cursor->index->is_dummy);
+ else
lock_move_reorganize_page(block, old);
/* Write log for the changes, if needed. */
- mtr->set_log_mode(log_mode);
if (log_mode == MTR_LOG_ALL)
{
/* Check and log the changes in the page header. */
ulint a, e;
for (a= PAGE_HEADER, e= PAGE_MAX_TRX_ID + PAGE_HEADER; a < e; a++)
{
- if (old->frame[a] == block->frame[a])
+ if (old->page.frame[a] == block->page.frame[a])
continue;
- while (--e, old->frame[e] == block->frame[e]);
+ while (--e, old->page.frame[e] == block->page.frame[e]);
e++;
ut_ad(a < e);
/* Write log for the changed page header fields. */
@@ -1472,88 +1529,92 @@ static void btr_page_reorganize_low(page_cur_t *cursor, dict_index_t *index,
break;
}
- const uint16_t top= page_header_get_offs(block->frame, PAGE_HEAP_TOP);
+ const uint16_t top= page_header_get_offs(block->page.frame, PAGE_HEAP_TOP);
- if (page_is_comp(block->frame))
+ if (page_is_comp(block->page.frame))
{
/* info_bits=0, n_owned=1, heap_no=0, status */
- ut_ad(!memcmp(PAGE_NEW_INFIMUM - REC_N_NEW_EXTRA_BYTES + block->frame,
- PAGE_NEW_INFIMUM - REC_N_NEW_EXTRA_BYTES + old->frame, 3));
+ ut_ad(!memcmp(PAGE_NEW_INFIMUM - REC_N_NEW_EXTRA_BYTES +
+ block->page.frame,
+ PAGE_NEW_INFIMUM - REC_N_NEW_EXTRA_BYTES +
+ old->page.frame, 3));
/* If the 'next' pointer of the infimum record has changed, log it. */
a= PAGE_NEW_INFIMUM - 2;
e= a + 2;
- if (block->frame[a] == old->frame[a])
+ if (block->page.frame[a] == old->page.frame[a])
a++;
- if (--e, block->frame[e] != old->frame[e])
+ if (--e, block->page.frame[e] != old->page.frame[e])
e++;
if (ulint len= e - a)
mtr->memcpy(*block, a, len);
/* The infimum record itself must not change. */
- ut_ad(!memcmp(PAGE_NEW_INFIMUM + block->frame,
- PAGE_NEW_INFIMUM + old->frame, 8));
+ ut_ad(!memcmp(PAGE_NEW_INFIMUM + block->page.frame,
+ PAGE_NEW_INFIMUM + old->page.frame, 8));
/* Log any change of the n_owned of the supremum record. */
a= PAGE_NEW_SUPREMUM - REC_N_NEW_EXTRA_BYTES;
- if (block->frame[a] != old->frame[a])
+ if (block->page.frame[a] != old->page.frame[a])
mtr->memcpy(*block, a, 1);
/* The rest of the supremum record must not change. */
- ut_ad(!memcmp(&block->frame[a + 1], &old->frame[a + 1],
+ ut_ad(!memcmp(&block->page.frame[a + 1], &old->page.frame[a + 1],
PAGE_NEW_SUPREMUM_END - PAGE_NEW_SUPREMUM +
REC_N_NEW_EXTRA_BYTES - 1));
/* Log the differences in the payload. */
for (a= PAGE_NEW_SUPREMUM_END, e= top; a < e; a++)
{
- if (old->frame[a] == block->frame[a])
+ if (old->page.frame[a] == block->page.frame[a])
continue;
- while (--e, old->frame[e] == block->frame[e]);
+ while (--e, old->page.frame[e] == block->page.frame[e]);
e++;
ut_ad(a < e);
- /* TODO: write MEMMOVE records to minimize this further! */
+ /* TODO: write MEMMOVE records to minimize this further! */
mtr->memcpy(*block, a, e - a);
- break;
+ break;
}
}
else
{
/* info_bits=0, n_owned=1, heap_no=0, number of fields, 1-byte format */
- ut_ad(!memcmp(PAGE_OLD_INFIMUM - REC_N_OLD_EXTRA_BYTES + block->frame,
- PAGE_OLD_INFIMUM - REC_N_OLD_EXTRA_BYTES + old->frame, 4));
+ ut_ad(!memcmp(PAGE_OLD_INFIMUM - REC_N_OLD_EXTRA_BYTES +
+ block->page.frame,
+ PAGE_OLD_INFIMUM - REC_N_OLD_EXTRA_BYTES +
+ old->page.frame, 4));
/* If the 'next' pointer of the infimum record has changed, log it. */
a= PAGE_OLD_INFIMUM - 2;
e= a + 2;
- if (block->frame[a] == old->frame[a])
+ if (block->page.frame[a] == old->page.frame[a])
a++;
- if (--e, block->frame[e] != old->frame[e])
+ if (--e, block->page.frame[e] != old->page.frame[e])
e++;
if (ulint len= e - a)
mtr->memcpy(*block, a, len);
/* The infimum record itself must not change. */
- ut_ad(!memcmp(PAGE_OLD_INFIMUM + block->frame,
- PAGE_OLD_INFIMUM + old->frame, 8));
+ ut_ad(!memcmp(PAGE_OLD_INFIMUM + block->page.frame,
+ PAGE_OLD_INFIMUM + old->page.frame, 8));
/* Log any change of the n_owned of the supremum record. */
a= PAGE_OLD_SUPREMUM - REC_N_OLD_EXTRA_BYTES;
- if (block->frame[a] != old->frame[a])
+ if (block->page.frame[a] != old->page.frame[a])
mtr->memcpy(*block, a, 1);
- ut_ad(!memcmp(&block->frame[a + 1], &old->frame[a + 1],
+ ut_ad(!memcmp(&block->page.frame[a + 1], &old->page.frame[a + 1],
PAGE_OLD_SUPREMUM_END - PAGE_OLD_SUPREMUM +
REC_N_OLD_EXTRA_BYTES - 1));
/* Log the differences in the payload. */
for (a= PAGE_OLD_SUPREMUM_END, e= top; a < e; a++)
{
- if (old->frame[a] == block->frame[a])
+ if (old->page.frame[a] == block->page.frame[a])
continue;
- while (--e, old->frame[e] == block->frame[e]);
+ while (--e, old->page.frame[e] == block->page.frame[e]);
e++;
ut_ad(a < e);
- /* TODO: write MEMMOVE records to minimize this further! */
+ /* TODO: write MEMMOVE records to minimize this further! */
mtr->memcpy(*block, a, e - a);
- break;
+ break;
}
}
e= srv_page_size - PAGE_DIR;
- a= e - PAGE_DIR_SLOT_SIZE * page_dir_get_n_slots(block->frame);
+ a= e - PAGE_DIR_SLOT_SIZE * page_dir_get_n_slots(block->page.frame);
/* Zero out the payload area. */
mtr->memset(*block, top, a - top, 0);
@@ -1561,9 +1622,9 @@ static void btr_page_reorganize_low(page_cur_t *cursor, dict_index_t *index,
/* Log changes to the page directory. */
for (; a < e; a++)
{
- if (old->frame[a] == block->frame[a])
+ if (old->page.frame[a] == block->page.frame[a])
continue;
- while (--e, old->frame[e] == block->frame[e]);
+ while (--e, old->page.frame[e] == block->page.frame[e]);
e++;
ut_ad(a < e);
/* Write log for the changed page directory slots. */
@@ -1576,6 +1637,7 @@ static void btr_page_reorganize_low(page_cur_t *cursor, dict_index_t *index,
MONITOR_INC(MONITOR_INDEX_REORG_ATTEMPTS);
MONITOR_INC(MONITOR_INDEX_REORG_SUCCESSFUL);
+ return DB_SUCCESS;
}
/*************************************************************//**
@@ -1587,9 +1649,9 @@ be done either within the same mini-transaction, or by invoking
ibuf_reset_free_bits() before mtr_commit(). On uncompressed pages,
IBUF_BITMAP_FREE is unaffected by reorganization.
-@retval true if the operation was successful
-@retval false if it is a compressed page, and recompression failed */
-bool
+@return error code
+@retval DB_FAIL if reorganizing a ROW_FORMAT=COMPRESSED page failed */
+dberr_t
btr_page_reorganize_block(
ulint z_level,/*!< in: compression level to be used
if dealing with compressed page */
@@ -1597,15 +1659,12 @@ btr_page_reorganize_block(
dict_index_t* index, /*!< in: the index tree of the page */
mtr_t* mtr) /*!< in/out: mini-transaction */
{
- if (buf_block_get_page_zip(block)) {
- return page_zip_reorganize(block, index, z_level, mtr, true);
- }
-
- page_cur_t cur;
- page_cur_set_before_first(block, &cur);
-
- btr_page_reorganize_low(&cur, index, mtr);
- return true;
+ if (buf_block_get_page_zip(block))
+ return page_zip_reorganize(block, index, z_level, mtr, true);
+ page_cur_t cur;
+ page_cur_set_before_first(block, &cur);
+ cur.index= index;
+ return btr_page_reorganize_low(&cur, mtr);
}
/*************************************************************//**
@@ -1617,33 +1676,28 @@ be done either within the same mini-transaction, or by invoking
ibuf_reset_free_bits() before mtr_commit(). On uncompressed pages,
IBUF_BITMAP_FREE is unaffected by reorganization.
-@retval true if the operation was successful
-@retval false if it is a compressed page, and recompression failed */
-bool
-btr_page_reorganize(
-/*================*/
- page_cur_t* cursor, /*!< in/out: page cursor */
- dict_index_t* index, /*!< in: the index tree of the page */
- mtr_t* mtr) /*!< in/out: mini-transaction */
+@param cursor page cursor
+@param mtr mini-transaction
+@return error code
+@retval DB_FAIL if reorganizing a ROW_FORMAT=COMPRESSED page failed */
+dberr_t btr_page_reorganize(page_cur_t *cursor, mtr_t *mtr)
{
- if (!buf_block_get_page_zip(cursor->block)) {
- btr_page_reorganize_low(cursor, index, mtr);
- return true;
- }
-
- ulint pos = page_rec_get_n_recs_before(cursor->rec);
- if (!page_zip_reorganize(cursor->block, index, page_zip_level, mtr,
- true)) {
- return false;
- }
- if (pos) {
- cursor->rec = page_rec_get_nth(cursor->block->frame, pos);
- } else {
- ut_ad(cursor->rec == page_get_infimum_rec(
- cursor->block->frame));
- }
-
- return true;
+ if (!buf_block_get_page_zip(cursor->block))
+ return btr_page_reorganize_low(cursor, mtr);
+
+ ulint pos= page_rec_get_n_recs_before(cursor->rec);
+ if (UNIV_UNLIKELY(pos == ULINT_UNDEFINED))
+ return DB_CORRUPTION;
+
+ dberr_t err= page_zip_reorganize(cursor->block, cursor->index,
+ page_zip_level, mtr, true);
+ if (err == DB_FAIL);
+ else if (!pos)
+ ut_ad(cursor->rec == page_get_infimum_rec(cursor->block->page.frame));
+ else if (!(cursor->rec= page_rec_get_nth(cursor->block->page.frame, pos)))
+ err= DB_CORRUPTION;
+
+ return err;
}
/** Empty an index page (possibly the root page). @see btr_page_create().
@@ -1665,7 +1719,8 @@ btr_page_empty(
ut_ad(!index->is_dummy);
ut_ad(index->table->space->id == block->page.id().space());
#ifdef UNIV_ZIP_DEBUG
- ut_a(!page_zip || page_zip_validate(page_zip, block->frame, index));
+ ut_a(!page_zip
+ || page_zip_validate(page_zip, block->page.frame, index));
#endif /* UNIV_ZIP_DEBUG */
btr_search_drop_page_hash_index(block, false);
@@ -1678,7 +1733,7 @@ btr_page_empty(
const ib_uint64_t autoinc
= dict_index_is_clust(index)
&& index->page == block->page.id().page_no()
- ? page_get_autoinc(block->frame)
+ ? page_get_autoinc(block->page.frame)
: 0;
if (page_zip) {
@@ -1689,19 +1744,20 @@ btr_page_empty(
static_assert(((FIL_PAGE_INDEX & 0xff00)
| byte(FIL_PAGE_RTREE))
== FIL_PAGE_RTREE, "compatibility");
- mtr->write<1>(*block, FIL_PAGE_TYPE + 1 + block->frame,
+ mtr->write<1>(*block, FIL_PAGE_TYPE + 1
+ + block->page.frame,
byte(FIL_PAGE_RTREE));
- if (mach_read_from_8(block->frame
+ if (mach_read_from_8(block->page.frame
+ FIL_RTREE_SPLIT_SEQ_NUM)) {
mtr->memset(block, FIL_RTREE_SPLIT_SEQ_NUM,
8, 0);
}
}
mtr->write<2,mtr_t::MAYBE_NOP>(*block, PAGE_HEADER + PAGE_LEVEL
- + block->frame, level);
+ + block->page.frame, level);
if (autoinc) {
mtr->write<8>(*block, PAGE_HEADER + PAGE_MAX_TRX_ID
- + block->frame, autoinc);
+ + block->page.frame, autoinc);
}
}
}
@@ -1715,19 +1771,20 @@ void btr_set_instant(buf_block_t* root, const dict_index_t& index, mtr_t* mtr)
ut_ad(index.n_core_fields > 0);
ut_ad(index.n_core_fields < REC_MAX_N_FIELDS);
ut_ad(index.is_instant());
- ut_ad(fil_page_get_type(root->frame) == FIL_PAGE_TYPE_INSTANT
- || fil_page_get_type(root->frame) == FIL_PAGE_INDEX);
- ut_ad(!page_has_siblings(root->frame));
+ ut_ad(fil_page_get_type(root->page.frame) == FIL_PAGE_TYPE_INSTANT
+ || fil_page_get_type(root->page.frame) == FIL_PAGE_INDEX);
+ ut_ad(!page_has_siblings(root->page.frame));
ut_ad(root->page.id().page_no() == index.page);
- rec_t* infimum = page_get_infimum_rec(root->frame);
- rec_t* supremum = page_get_supremum_rec(root->frame);
- byte* page_type = root->frame + FIL_PAGE_TYPE;
- uint16_t i = page_header_get_field(root->frame, PAGE_INSTANT);
+ rec_t* infimum = page_get_infimum_rec(root->page.frame);
+ rec_t* supremum = page_get_supremum_rec(root->page.frame);
+ byte* page_type = root->page.frame + FIL_PAGE_TYPE;
+ uint16_t i = page_header_get_field(root->page.frame, PAGE_INSTANT);
switch (mach_read_from_2(page_type)) {
case FIL_PAGE_TYPE_INSTANT:
- ut_ad(page_get_instant(root->frame) == index.n_core_fields);
+ ut_ad(page_get_instant(root->page.frame)
+ == index.n_core_fields);
if (memcmp(infimum, "infimum", 8)
|| memcmp(supremum, "supremum", 8)) {
ut_ad(index.table->instant);
@@ -1744,21 +1801,21 @@ void btr_set_instant(buf_block_t* root, const dict_index_t& index, mtr_t* mtr)
ut_ad("wrong page type" == 0);
/* fall through */
case FIL_PAGE_INDEX:
- ut_ad(!page_is_comp(root->frame)
- || !page_get_instant(root->frame));
+ ut_ad(!page_is_comp(root->page.frame)
+ || !page_get_instant(root->page.frame));
ut_ad(!memcmp(infimum, "infimum", 8));
ut_ad(!memcmp(supremum, "supremum", 8));
mtr->write<2>(*root, page_type, FIL_PAGE_TYPE_INSTANT);
ut_ad(i <= PAGE_NO_DIRECTION);
i |= static_cast<uint16_t>(index.n_core_fields << 3);
- mtr->write<2>(*root, PAGE_HEADER + PAGE_INSTANT + root->frame,
- i);
+ mtr->write<2>(*root, PAGE_HEADER + PAGE_INSTANT
+ + root->page.frame, i);
break;
}
if (index.table->instant) {
- mtr->memset(root, infimum - root->frame, 8, 0);
- mtr->memset(root, supremum - root->frame, 7, 0);
+ mtr->memset(root, infimum - root->page.frame, 8, 0);
+ mtr->memset(root, supremum - root->page.frame, 7, 0);
mtr->write<1,mtr_t::MAYBE_NOP>(*root, &supremum[7],
index.n_core_null_bytes);
}
@@ -1773,39 +1830,37 @@ void btr_reset_instant(const dict_index_t &index, bool all, mtr_t *mtr)
{
ut_ad(!index.table->is_temporary());
ut_ad(index.is_primary());
- if (buf_block_t *root = btr_root_block_get(&index, RW_SX_LATCH, mtr))
+ buf_block_t *root= btr_get_latched_root(index, mtr);
+ byte *page_type= root->page.frame + FIL_PAGE_TYPE;
+ if (all)
{
- byte *page_type= root->frame + FIL_PAGE_TYPE;
- if (all)
- {
- ut_ad(mach_read_from_2(page_type) == FIL_PAGE_TYPE_INSTANT ||
- mach_read_from_2(page_type) == FIL_PAGE_INDEX);
- mtr->write<2,mtr_t::MAYBE_NOP>(*root, page_type, FIL_PAGE_INDEX);
- byte *instant= PAGE_INSTANT + PAGE_HEADER + root->frame;
- mtr->write<2,mtr_t::MAYBE_NOP>(*root, instant,
- page_ptr_get_direction(instant + 1));
- }
- else
- ut_ad(mach_read_from_2(page_type) == FIL_PAGE_TYPE_INSTANT);
- static const byte supremuminfimum[8 + 8] = "supremuminfimum";
- uint16_t infimum, supremum;
- if (page_is_comp(root->frame))
- {
- infimum= PAGE_NEW_INFIMUM;
- supremum= PAGE_NEW_SUPREMUM;
- }
- else
- {
- infimum= PAGE_OLD_INFIMUM;
- supremum= PAGE_OLD_SUPREMUM;
- }
- ut_ad(!memcmp(&root->frame[infimum], supremuminfimum + 8, 8) ==
- !memcmp(&root->frame[supremum], supremuminfimum, 8));
- mtr->memcpy<mtr_t::MAYBE_NOP>(*root, &root->frame[infimum],
- supremuminfimum + 8, 8);
- mtr->memcpy<mtr_t::MAYBE_NOP>(*root, &root->frame[supremum],
- supremuminfimum, 8);
+ ut_ad(mach_read_from_2(page_type) == FIL_PAGE_TYPE_INSTANT ||
+ mach_read_from_2(page_type) == FIL_PAGE_INDEX);
+ mtr->write<2,mtr_t::MAYBE_NOP>(*root, page_type, FIL_PAGE_INDEX);
+ byte *instant= PAGE_INSTANT + PAGE_HEADER + root->page.frame;
+ mtr->write<2,mtr_t::MAYBE_NOP>(*root, instant,
+ page_ptr_get_direction(instant + 1));
+ }
+ else
+ ut_ad(mach_read_from_2(page_type) == FIL_PAGE_TYPE_INSTANT);
+ static const byte supremuminfimum[8 + 8] = "supremuminfimum";
+ uint16_t infimum, supremum;
+ if (page_is_comp(root->page.frame))
+ {
+ infimum= PAGE_NEW_INFIMUM;
+ supremum= PAGE_NEW_SUPREMUM;
+ }
+ else
+ {
+ infimum= PAGE_OLD_INFIMUM;
+ supremum= PAGE_OLD_SUPREMUM;
}
+ ut_ad(!memcmp(&root->page.frame[infimum], supremuminfimum + 8, 8) ==
+ !memcmp(&root->page.frame[supremum], supremuminfimum, 8));
+ mtr->memcpy<mtr_t::MAYBE_NOP>(*root, &root->page.frame[infimum],
+ supremuminfimum + 8, 8);
+ mtr->memcpy<mtr_t::MAYBE_NOP>(*root, &root->page.frame[supremum],
+ supremuminfimum, 8);
}
/*************************************************************//**
@@ -1827,10 +1882,10 @@ btr_root_raise_and_insert(
mem_heap_t** heap, /*!< in/out: pointer to memory heap, or NULL */
const dtuple_t* tuple, /*!< in: tuple to insert */
ulint n_ext, /*!< in: number of externally stored columns */
- mtr_t* mtr) /*!< in: mtr */
+ mtr_t* mtr, /*!< in: mtr */
+ dberr_t* err) /*!< out: error code */
{
dict_index_t* index;
- ulint new_page_no;
rec_t* rec;
dtuple_t* node_ptr;
ulint level;
@@ -1843,39 +1898,42 @@ btr_root_raise_and_insert(
root = btr_cur_get_block(cursor);
root_page_zip = buf_block_get_page_zip(root);
- ut_ad(!page_is_empty(root->frame));
+ ut_ad(!page_is_empty(root->page.frame));
index = btr_cur_get_index(cursor);
ut_ad(index->n_core_null_bytes <= UT_BITS_IN_BYTES(index->n_nullable));
#ifdef UNIV_ZIP_DEBUG
- ut_a(!root_page_zip || page_zip_validate(root_page_zip, root->frame,
- index));
+ ut_a(!root_page_zip
+ || page_zip_validate(root_page_zip, root->page.frame, index));
#endif /* UNIV_ZIP_DEBUG */
-#ifdef UNIV_BTR_DEBUG
- if (!dict_index_is_ibuf(index)) {
- ulint space = index->table->space_id;
-
- ut_a(btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_LEAF
- + root->frame, space));
- ut_a(btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_TOP
- + root->frame, space));
- }
+ const page_id_t root_id{root->page.id()};
- ut_a(dict_index_get_page(index) == root->page.id().page_no());
-#endif /* UNIV_BTR_DEBUG */
ut_ad(mtr->memo_contains_flagged(&index->lock, MTR_MEMO_X_LOCK
| MTR_MEMO_SX_LOCK));
ut_ad(mtr->memo_contains_flagged(root, MTR_MEMO_PAGE_X_FIX));
+ if (index->page != root_id.page_no()) {
+ ut_ad("corrupted root page number" == 0);
+ return nullptr;
+ }
+
+ if (index->is_ibuf()) {
+ } else if (!btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_LEAF,
+ *root, *index->table->space)
+ || !btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_TOP,
+ *root, *index->table->space)) {
+ return nullptr;
+ }
+
/* Allocate a new page to the tree. Root splitting is done by first
moving the root records to the new page, emptying the root, putting
a node pointer to the new page, and then splitting the new page. */
- level = btr_page_get_level(root->frame);
+ level = btr_page_get_level(root->page.frame);
- new_block = btr_page_alloc(index, 0, FSP_NO_DIR, level, mtr, mtr);
+ new_block = btr_page_alloc(index, 0, FSP_NO_DIR, level, mtr, mtr, err);
- if (new_block == NULL && os_has_said_disk_full) {
- return(NULL);
+ if (!new_block) {
+ return nullptr;
}
new_page_zip = buf_block_get_page_zip(new_block);
@@ -1885,11 +1943,12 @@ btr_root_raise_and_insert(
== page_zip_get_size(root_page_zip));
btr_page_create(new_block, new_page_zip, index, level, mtr);
- if (page_has_siblings(new_block->frame)) {
+ if (page_has_siblings(new_block->page.frame)) {
compile_time_assert(FIL_PAGE_NEXT == FIL_PAGE_PREV + 4);
compile_time_assert(FIL_NULL == 0xffffffff);
static_assert(FIL_PAGE_PREV % 8 == 0, "alignment");
- memset_aligned<8>(new_block->frame + FIL_PAGE_PREV, 0xff, 8);
+ memset_aligned<8>(new_block->page.frame + FIL_PAGE_PREV,
+ 0xff, 8);
mtr->memset(new_block, FIL_PAGE_PREV, 8, 0xff);
if (UNIV_LIKELY_NULL(new_page_zip)) {
memset_aligned<8>(new_page_zip->data + FIL_PAGE_PREV,
@@ -1898,27 +1957,39 @@ btr_root_raise_and_insert(
}
/* Copy the records from root to the new page one by one. */
-
if (0
#ifdef UNIV_ZIP_COPY
|| new_page_zip
#endif /* UNIV_ZIP_COPY */
|| !page_copy_rec_list_end(new_block, root,
- page_get_infimum_rec(root->frame),
- index, mtr)) {
+ page_get_infimum_rec(root->page.frame),
+ index, mtr, err)) {
+ switch (*err) {
+ case DB_SUCCESS:
+ break;
+ case DB_FAIL:
+ *err = DB_SUCCESS;
+ break;
+ default:
+ return nullptr;
+ }
+
ut_a(new_page_zip);
/* Copy the page byte for byte. */
- page_zip_copy_recs(new_block,
- root_page_zip, root->frame, index, mtr);
+ page_zip_copy_recs(new_block, root_page_zip,
+ root->page.frame, index, mtr);
/* Update the lock table and possible hash index. */
- lock_move_rec_list_end(new_block, root,
- page_get_infimum_rec(root->frame));
+ if (index->has_locking()) {
+ lock_move_rec_list_end(
+ new_block, root,
+ page_get_infimum_rec(root->page.frame));
+ }
/* Move any existing predicate locks */
if (dict_index_is_spatial(index)) {
- lock_prdt_rec_move(new_block, root);
+ lock_prdt_rec_move(new_block, root_id);
} else {
btr_search_move_or_delete_hash_entries(
new_block, root);
@@ -1933,7 +2004,7 @@ btr_root_raise_and_insert(
longer is a leaf page. (Older versions of InnoDB did
set PAGE_MAX_TRX_ID on all secondary index pages.) */
byte* p = my_assume_aligned<8>(
- PAGE_HEADER + PAGE_MAX_TRX_ID + root->frame);
+ PAGE_HEADER + PAGE_MAX_TRX_ID + root->page.frame);
if (mach_read_from_8(p)) {
mtr->memset(root, max_trx_id, 8, 0);
if (UNIV_LIKELY_NULL(root->page.zip.data)) {
@@ -1946,7 +2017,7 @@ btr_root_raise_and_insert(
root page; on other clustered index pages, we want to reserve
the field PAGE_MAX_TRX_ID for future use. */
byte* p = my_assume_aligned<8>(
- PAGE_HEADER + PAGE_MAX_TRX_ID + new_block->frame);
+ PAGE_HEADER + PAGE_MAX_TRX_ID + new_block->page.frame);
if (mach_read_from_8(p)) {
mtr->memset(new_block, max_trx_id, 8, 0);
if (UNIV_LIKELY_NULL(new_block->page.zip.data)) {
@@ -1962,8 +2033,8 @@ btr_root_raise_and_insert(
information of the record to be inserted on the infimum of the
root page: we cannot discard the lock structs on the root page */
- if (!dict_table_is_locking_disabled(index->table)) {
- lock_update_root_raise(new_block, root);
+ if (index->has_locking()) {
+ lock_update_root_raise(*new_block, root_id);
}
/* Create a memory heap where the node pointer is stored */
@@ -1971,8 +2042,9 @@ btr_root_raise_and_insert(
*heap = mem_heap_create(1000);
}
- rec = page_rec_get_next(page_get_infimum_rec(new_block->frame));
- new_page_no = new_block->page.id().page_no();
+ const uint32_t new_page_no = new_block->page.id().page_no();
+ rec = page_rec_get_next(page_get_infimum_rec(new_block->page.frame));
+ ut_ad(rec); /* We just created the page. */
/* Build the node pointer (= node key and page address) for the
child */
@@ -1996,14 +2068,14 @@ btr_root_raise_and_insert(
/* Rebuild the root page to get free space */
btr_page_empty(root, root_page_zip, index, level + 1, mtr);
/* btr_page_empty() is supposed to zero-initialize the field. */
- ut_ad(!page_get_instant(root->frame));
+ ut_ad(!page_get_instant(root->page.frame));
if (index->is_instant()) {
ut_ad(!root_page_zip);
btr_set_instant(root, *index, mtr);
}
- ut_ad(!page_has_siblings(root->frame));
+ ut_ad(!page_has_siblings(root->page.frame));
page_cursor = btr_cur_get_page_cur(cursor);
@@ -2012,7 +2084,7 @@ btr_root_raise_and_insert(
page_cur_set_before_first(root, page_cursor);
node_ptr_rec = page_cur_tuple_insert(page_cursor, node_ptr,
- index, offsets, heap, 0, mtr);
+ offsets, heap, 0, mtr);
/* The root page should only contain the node pointer
to new_block at this point. Thus, the data should fit. */
@@ -2025,23 +2097,23 @@ btr_root_raise_and_insert(
ibuf_reset_free_bits(new_block);
}
- if (tuple != NULL) {
- /* Reposition the cursor to the child node */
- page_cur_search(new_block, index, tuple, page_cursor);
- } else {
- /* Set cursor to first record on child node */
- page_cur_set_before_first(new_block, page_cursor);
+ page_cursor->block = new_block;
+ page_cursor->index = index;
+
+ ut_ad(dtuple_check_typed(tuple));
+ /* Reposition the cursor to the child node */
+ ulint low_match = 0, up_match = 0;
+
+ if (page_cur_search_with_match(tuple, PAGE_CUR_LE,
+ &up_match, &low_match,
+ page_cursor, nullptr)) {
+ *err = DB_CORRUPTION;
+ return nullptr;
}
/* Split the child and insert tuple */
- if (dict_index_is_spatial(index)) {
- /* Split rtree page and insert tuple */
- return(rtr_page_split_and_insert(flags, cursor, offsets, heap,
- tuple, n_ext, mtr));
- } else {
- return(btr_page_split_and_insert(flags, cursor, offsets, heap,
- tuple, n_ext, mtr));
- }
+ return btr_page_split_and_insert(flags, cursor, offsets, heap,
+ tuple, n_ext, mtr, err);
}
/** Decide if the page should be split at the convergence point of inserts
@@ -2067,10 +2139,10 @@ rec_t* btr_page_get_split_rec_to_left(const btr_cur_t* cursor)
So, we can only assert that when the metadata record exists,
index->is_instant() must hold. */
ut_ad(!page_is_leaf(page) || page_has_prev(page)
- || cursor->index->is_instant()
+ || cursor->index()->is_instant()
|| !(rec_get_info_bits(page_rec_get_next_const(
page_get_infimum_rec(page)),
- cursor->index->table->not_redundant())
+ cursor->index()->table->not_redundant())
& REC_INFO_MIN_REC_FLAG));
const rec_t* infimum = page_get_infimum_rec(page);
@@ -2112,7 +2184,7 @@ btr_page_get_split_rec_to_right(const btr_cur_t* cursor, rec_t** split_rec)
insert_point = page_rec_get_next(insert_point);
- if (page_rec_is_supremum(insert_point)) {
+ if (!insert_point || page_rec_is_supremum(insert_point)) {
insert_point = NULL;
} else {
insert_point = page_rec_get_next(insert_point);
@@ -2163,14 +2235,14 @@ btr_page_get_split_rec(
page = btr_cur_get_page(cursor);
- insert_size = rec_get_converted_size(cursor->index, tuple, n_ext);
+ insert_size = rec_get_converted_size(cursor->index(), tuple, n_ext);
free_space = page_get_free_space_of_empty(page_is_comp(page));
page_zip = btr_cur_get_page_zip(cursor);
if (page_zip) {
/* Estimate the free space of an empty compressed page. */
ulint free_space_zip = page_zip_empty_size(
- cursor->index->n_fields,
+ cursor->index()->n_fields,
page_zip_get_size(page_zip));
if (free_space > (ulint) free_space_zip) {
@@ -2215,9 +2287,10 @@ btr_page_get_split_rec(
/* Include tuple */
incl_data += insert_size;
} else {
- offsets = rec_get_offsets(rec, cursor->index, offsets,
- page_is_leaf(page)
- ? cursor->index->n_core_fields
+ offsets = rec_get_offsets(rec, cursor->index(),
+ offsets, page_is_leaf(page)
+ ? cursor->index()
+ ->n_core_fields
: 0,
ULINT_UNDEFINED, &heap);
incl_data += rec_offs_size(offsets);
@@ -2254,6 +2327,7 @@ func_exit:
return(rec);
}
+#ifdef UNIV_DEBUG
/*************************************************************//**
Returns TRUE if the insert fits on the appropriate half-page with the
chosen split_rec.
@@ -2268,7 +2342,7 @@ btr_page_insert_fits(
on upper half-page, or NULL if
tuple to be inserted should be first */
rec_offs** offsets,/*!< in: rec_get_offsets(
- split_rec, cursor->index); out: garbage */
+ split_rec, cursor->index()); out: garbage */
const dtuple_t* tuple, /*!< in: tuple to insert */
ulint n_ext, /*!< in: number of externally stored columns */
mem_heap_t** heap) /*!< in: temporary memory heap */
@@ -2286,9 +2360,9 @@ btr_page_insert_fits(
ut_ad(!split_rec
|| !page_is_comp(page) == !rec_offs_comp(*offsets));
ut_ad(!split_rec
- || rec_offs_validate(split_rec, cursor->index, *offsets));
+ || rec_offs_validate(split_rec, cursor->index(), *offsets));
- insert_size = rec_get_converted_size(cursor->index, tuple, n_ext);
+ insert_size = rec_get_converted_size(cursor->index(), tuple, n_ext);
free_space = page_get_free_space_of_empty(page_is_comp(page));
/* free_space is now the free space of a created new page */
@@ -2300,19 +2374,19 @@ btr_page_insert_fits(
end_rec) will end up on the other half page from tuple when it is
inserted. */
- if (split_rec == NULL) {
- rec = page_rec_get_next(page_get_infimum_rec(page));
+ if (!(end_rec = split_rec)) {
end_rec = page_rec_get_next(btr_cur_get_rec(cursor));
-
- } else if (cmp_dtuple_rec(tuple, split_rec, *offsets) >= 0) {
-
- rec = page_rec_get_next(page_get_infimum_rec(page));
- end_rec = split_rec;
- } else {
+ } else if (cmp_dtuple_rec(tuple, split_rec, *offsets) < 0) {
rec = split_rec;
end_rec = page_get_supremum_rec(page);
+ goto got_rec;
+ }
+
+ if (!(rec = page_rec_get_next(page_get_infimum_rec(page)))) {
+ return false;
}
+got_rec:
if (total_data + page_dir_calc_reserved_space(total_n_recs)
<= free_space) {
@@ -2326,9 +2400,9 @@ btr_page_insert_fits(
/* In this loop we calculate the amount of reserved
space after rec is removed from page. */
- *offsets = rec_get_offsets(rec, cursor->index, *offsets,
+ *offsets = rec_get_offsets(rec, cursor->index(), *offsets,
page_is_leaf(page)
- ? cursor->index->n_core_fields
+ ? cursor->index()->n_core_fields
: 0,
ULINT_UNDEFINED, heap);
@@ -2344,29 +2418,28 @@ btr_page_insert_fits(
return(true);
}
- rec = page_rec_get_next_const(rec);
+ if (!(rec = page_rec_get_next_const(rec))) {
+ break;
+ }
}
return(false);
}
+#endif
/*******************************************************//**
Inserts a data tuple to a tree on a non-leaf level. It is assumed
that mtr holds an x-latch on the tree. */
-void
-btr_insert_on_non_leaf_level_func(
-/*==============================*/
+dberr_t
+btr_insert_on_non_leaf_level(
ulint flags, /*!< in: undo logging and locking flags */
dict_index_t* index, /*!< in: index */
ulint level, /*!< in: level, must be > 0 */
dtuple_t* tuple, /*!< in: the record to be inserted */
- const char* file, /*!< in: file name */
- unsigned line, /*!< in: line where called */
mtr_t* mtr) /*!< in: mtr */
{
big_rec_t* dummy_big_rec;
btr_cur_t cursor;
- dberr_t err;
rec_t* rec;
mem_heap_t* heap = NULL;
rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
@@ -2376,71 +2449,70 @@ btr_insert_on_non_leaf_level_func(
ut_ad(level > 0);
- if (!dict_index_is_spatial(index)) {
- dberr_t err = btr_cur_search_to_nth_level(
- index, level, tuple, PAGE_CUR_LE,
- BTR_CONT_MODIFY_TREE,
- &cursor, file, line, mtr);
-
- if (err != DB_SUCCESS) {
- ib::warn() << " Error code: " << err
- << " btr_page_get_father_node_ptr_func "
- << " level: " << level
- << " called from file: "
- << file << " line: " << line
- << " table: " << index->table->name
- << " index: " << index->name;
- }
- } else {
+ flags |= BTR_NO_LOCKING_FLAG | BTR_KEEP_SYS_FLAG
+ | BTR_NO_UNDO_LOG_FLAG;
+ cursor.page_cur.index = index;
+
+ dberr_t err;
+
+ if (index->is_spatial()) {
/* For spatial index, initialize structures to track
its parents etc. */
rtr_init_rtr_info(&rtr_info, false, &cursor, index, false);
rtr_info_update_btr(&cursor, &rtr_info);
-
- btr_cur_search_to_nth_level(index, level, tuple,
- PAGE_CUR_RTREE_INSERT,
- BTR_CONT_MODIFY_TREE,
- &cursor, file, line, mtr);
+ err = rtr_search_to_nth_level(level, tuple,
+ PAGE_CUR_RTREE_INSERT,
+ BTR_CONT_MODIFY_TREE,
+ &cursor, mtr);
+ } else {
+ err = btr_cur_search_to_nth_level(level, tuple, RW_X_LATCH,
+ &cursor, mtr);
}
ut_ad(cursor.flag == BTR_CUR_BINARY);
+ ut_ad(btr_cur_get_block(&cursor)
+ != mtr->at_savepoint(mtr->get_savepoint() - 1)
+ || index->is_spatial()
+ || mtr->memo_contains(index->lock, MTR_MEMO_X_LOCK));
- err = btr_cur_optimistic_insert(
- flags
- | BTR_NO_LOCKING_FLAG
- | BTR_KEEP_SYS_FLAG
- | BTR_NO_UNDO_LOG_FLAG,
- &cursor, &offsets, &heap,
- tuple, &rec, &dummy_big_rec, 0, NULL, mtr);
+ if (UNIV_LIKELY(err == DB_SUCCESS)) {
+ err = btr_cur_optimistic_insert(flags,
+ &cursor, &offsets, &heap,
+ tuple, &rec,
+ &dummy_big_rec, 0, NULL, mtr);
+ }
if (err == DB_FAIL) {
- err = btr_cur_pessimistic_insert(flags
- | BTR_NO_LOCKING_FLAG
- | BTR_KEEP_SYS_FLAG
- | BTR_NO_UNDO_LOG_FLAG,
+ err = btr_cur_pessimistic_insert(flags,
&cursor, &offsets, &heap,
tuple, &rec,
&dummy_big_rec, 0, NULL, mtr);
- ut_a(err == DB_SUCCESS);
}
- if (heap != NULL) {
+ if (UNIV_LIKELY_NULL(heap)) {
mem_heap_free(heap);
}
- if (dict_index_is_spatial(index)) {
+ if (index->is_spatial()) {
ut_ad(cursor.rtr_info);
rtr_clean_rtr_info(&rtr_info, true);
}
+
+ return err;
}
+static_assert(FIL_PAGE_OFFSET % 4 == 0, "alignment");
+static_assert(FIL_PAGE_PREV % 4 == 0, "alignment");
+static_assert(FIL_PAGE_NEXT % 4 == 0, "alignment");
+
+MY_ATTRIBUTE((nonnull,warn_unused_result))
/**************************************************************//**
Attaches the halves of an index page on the appropriate level in an
index tree. */
-static MY_ATTRIBUTE((nonnull))
-void
+static
+dberr_t
btr_attach_half_pages(
/*==================*/
ulint flags, /*!< in: undo logging and
@@ -2455,8 +2527,8 @@ btr_attach_half_pages(
{
dtuple_t* node_ptr_upper;
mem_heap_t* heap;
- buf_block_t* prev_block = NULL;
- buf_block_t* next_block = NULL;
+ buf_block_t* prev_block = nullptr;
+ buf_block_t* next_block = nullptr;
buf_block_t* lower_block;
buf_block_t* upper_block;
@@ -2475,9 +2547,12 @@ btr_attach_half_pages(
lower_block = new_block;
upper_block = block;
+ cursor.page_cur.block = block;
+ cursor.page_cur.index = index;
+
/* Look up the index for the node pointer to page */
- offsets = btr_page_get_father_block(NULL, heap, index,
- block, mtr, &cursor);
+ offsets = btr_page_get_father_block(nullptr, heap, mtr,
+ &cursor);
/* Replace the address of the old child node (= page) with the
address of the new lower half */
@@ -2493,21 +2568,38 @@ btr_attach_half_pages(
}
/* Get the level of the split pages */
- const ulint level = btr_page_get_level(buf_block_get_frame(block));
- ut_ad(level == btr_page_get_level(buf_block_get_frame(new_block)));
+ const ulint level = btr_page_get_level(block->page.frame);
+ ut_ad(level == btr_page_get_level(new_block->page.frame));
+ page_id_t id{block->page.id()};
/* Get the previous and next pages of page */
- const uint32_t prev_page_no = btr_page_get_prev(block->frame);
- const uint32_t next_page_no = btr_page_get_next(block->frame);
+ const uint32_t prev_page_no = btr_page_get_prev(block->page.frame);
+ const uint32_t next_page_no = btr_page_get_next(block->page.frame);
/* for consistency, both blocks should be locked, before change */
if (prev_page_no != FIL_NULL && direction == FSP_DOWN) {
- prev_block = btr_block_get(*index, prev_page_no, RW_X_LATCH,
- !level, mtr);
+ id.set_page_no(prev_page_no);
+ prev_block = mtr->get_already_latched(id, MTR_MEMO_PAGE_X_FIX);
+#if 1 /* MDEV-29835 FIXME: acquire page latches upfront */
+ if (!prev_block) {
+ ut_ad(mtr->memo_contains(index->lock,
+ MTR_MEMO_X_LOCK));
+ prev_block = btr_block_get(*index, prev_page_no,
+ RW_X_LATCH, !level, mtr);
+ }
+#endif
}
if (next_page_no != FIL_NULL && direction != FSP_DOWN) {
- next_block = btr_block_get(*index, next_page_no, RW_X_LATCH,
- !level, mtr);
+ id.set_page_no(next_page_no);
+ next_block = mtr->get_already_latched(id, MTR_MEMO_PAGE_X_FIX);
+#if 1 /* MDEV-29835 FIXME: acquire page latches upfront */
+ if (!next_block) {
+ ut_ad(mtr->memo_contains(index->lock,
+ MTR_MEMO_X_LOCK));
+ next_block = btr_block_get(*index, next_page_no,
+ RW_X_LATCH, !level, mtr);
+ }
+#endif
}
/* Build the node pointer (= node key and page address) for the upper
@@ -2520,48 +2612,58 @@ btr_attach_half_pages(
/* Insert it next to the pointer to the lower half. Note that this
may generate recursion leading to a split on the higher level. */
- btr_insert_on_non_leaf_level(flags, index, level + 1,
- node_ptr_upper, mtr);
+ dberr_t err = btr_insert_on_non_leaf_level(
+ flags, index, level + 1, node_ptr_upper, mtr);
/* Free the memory heap */
mem_heap_free(heap);
+ if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
+ return err;
+ }
+
/* Update page links of the level */
if (prev_block) {
-#ifdef UNIV_BTR_DEBUG
- ut_a(page_is_comp(prev_block->frame)
- == page_is_comp(block->frame));
- ut_a(btr_page_get_next(prev_block->frame)
- == block->page.id().page_no());
-#endif /* UNIV_BTR_DEBUG */
+ if (UNIV_UNLIKELY(memcmp_aligned<4>(prev_block->page.frame
+ + FIL_PAGE_NEXT,
+ block->page.frame
+ + FIL_PAGE_OFFSET,
+ 4))) {
+ return DB_CORRUPTION;
+ }
btr_page_set_next(prev_block, lower_block->page.id().page_no(),
mtr);
}
if (next_block) {
-#ifdef UNIV_BTR_DEBUG
- ut_a(page_is_comp(next_block->frame)
- == page_is_comp(block->frame));
- ut_a(btr_page_get_prev(next_block->frame)
- == block->page.id().page_no());
-#endif /* UNIV_BTR_DEBUG */
+ if (UNIV_UNLIKELY(memcmp_aligned<4>(next_block->page.frame
+ + FIL_PAGE_PREV,
+ block->page.frame
+ + FIL_PAGE_OFFSET,
+ 4))) {
+ return DB_CORRUPTION;
+ }
btr_page_set_prev(next_block, upper_block->page.id().page_no(),
mtr);
}
if (direction == FSP_DOWN) {
ut_ad(lower_block == new_block);
- ut_ad(btr_page_get_next(upper_block->frame) == next_page_no);
+ ut_ad(btr_page_get_next(upper_block->page.frame)
+ == next_page_no);
btr_page_set_prev(lower_block, prev_page_no, mtr);
} else {
ut_ad(upper_block == new_block);
- ut_ad(btr_page_get_prev(lower_block->frame) == prev_page_no);
+ ut_ad(btr_page_get_prev(lower_block->page.frame)
+ == prev_page_no);
btr_page_set_next(upper_block, next_page_no, mtr);
}
btr_page_set_prev(upper_block, lower_block->page.id().page_no(), mtr);
btr_page_set_next(lower_block, upper_block->page.id().page_no(), mtr);
+
+ return DB_SUCCESS;
}
/*************************************************************//**
@@ -2585,13 +2687,15 @@ btr_page_tuple_smaller(
/* Read the first user record in the page. */
block = btr_cur_get_block(cursor);
page_cur_set_before_first(block, &pcur);
- page_cur_move_to_next(&pcur);
- first_rec = page_cur_get_rec(&pcur);
+ if (UNIV_UNLIKELY(!(first_rec = page_cur_move_to_next(&pcur)))) {
+ ut_ad("corrupted page" == 0);
+ return false;
+ }
- *offsets = rec_get_offsets(
- first_rec, cursor->index, *offsets,
- page_is_leaf(block->frame) ? cursor->index->n_core_fields : 0,
- n_uniq, heap);
+ *offsets = rec_get_offsets(first_rec, cursor->index(), *offsets,
+ page_is_leaf(block->page.frame)
+ ? cursor->index()->n_core_fields : 0,
+ n_uniq, heap);
return(cmp_dtuple_rec(tuple, first_rec, *offsets) < 0);
}
@@ -2624,62 +2728,70 @@ btr_insert_into_right_sibling(
page_t* page = buf_block_get_frame(block);
const uint32_t next_page_no = btr_page_get_next(page);
- ut_ad(mtr->memo_contains_flagged(&cursor->index->lock,
+ ut_ad(mtr->memo_contains_flagged(&cursor->index()->lock,
MTR_MEMO_X_LOCK | MTR_MEMO_SX_LOCK));
ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX));
ut_ad(heap);
+ ut_ad(dtuple_check_typed(tuple));
if (next_page_no == FIL_NULL || !page_rec_is_supremum(
page_rec_get_next(btr_cur_get_rec(cursor)))) {
- return(NULL);
+ return nullptr;
}
page_cur_t next_page_cursor;
buf_block_t* next_block;
page_t* next_page;
btr_cur_t next_father_cursor;
- rec_t* rec = NULL;
+ rec_t* rec = nullptr;
ulint max_size;
- next_block = btr_block_get(*cursor->index, next_page_no, RW_X_LATCH,
+ next_block = btr_block_get(*cursor->index(), next_page_no, RW_X_LATCH,
page_is_leaf(page), mtr);
if (UNIV_UNLIKELY(!next_block)) {
- return NULL;
+ return nullptr;
}
next_page = buf_block_get_frame(next_block);
+ const bool is_leaf = page_is_leaf(next_page);
+
+ next_page_cursor.index = cursor->index();
+ next_page_cursor.block = next_block;
+ next_father_cursor.page_cur = next_page_cursor;
- bool is_leaf = page_is_leaf(next_page);
+ if (!btr_page_get_father(mtr, &next_father_cursor)) {
+ return nullptr;
+ }
- btr_page_get_father(
- cursor->index, next_block, mtr, &next_father_cursor);
+ ulint up_match = 0, low_match = 0;
- page_cur_search(
- next_block, cursor->index, tuple, PAGE_CUR_LE,
- &next_page_cursor);
+ if (page_cur_search_with_match(tuple,
+ PAGE_CUR_LE, &up_match, &low_match,
+ &next_page_cursor, nullptr)) {
+ return nullptr;
+ }
max_size = page_get_max_insert_size_after_reorganize(next_page, 1);
/* Extends gap lock for the next page */
- if (is_leaf && !dict_table_is_locking_disabled(cursor->index->table)) {
+ if (is_leaf && cursor->index()->has_locking()) {
lock_update_node_pointer(block, next_block);
}
- rec = page_cur_tuple_insert(
- &next_page_cursor, tuple, cursor->index, offsets, &heap,
- n_ext, mtr);
+ rec = page_cur_tuple_insert(&next_page_cursor, tuple, offsets, &heap,
+ n_ext, mtr);
- if (rec == NULL) {
+ if (!rec) {
if (is_leaf
&& next_block->page.zip.ssize
- && !dict_index_is_clust(cursor->index)
- && !cursor->index->table->is_temporary()) {
+ && !dict_index_is_clust(cursor->index())
+ && !cursor->index()->table->is_temporary()) {
/* Reset the IBUF_BITMAP_FREE bits, because
page_cur_tuple_insert() will have attempted page
reorganize before failing. */
ibuf_reset_free_bits(next_block);
}
- return(NULL);
+ return nullptr;
}
ibool compressed;
@@ -2698,24 +2810,28 @@ btr_insert_into_right_sibling(
&err, TRUE, &next_father_cursor,
BTR_CREATE_FLAG, false, mtr);
- ut_a(err == DB_SUCCESS);
+ if (err != DB_SUCCESS) {
+ return nullptr;
+ }
if (!compressed) {
- btr_cur_compress_if_useful(&next_father_cursor, FALSE, mtr);
+ btr_cur_compress_if_useful(&next_father_cursor, false, mtr);
}
dtuple_t* node_ptr = dict_index_build_node_ptr(
- cursor->index, rec, next_block->page.id().page_no(),
+ cursor->index(), rec, next_block->page.id().page_no(),
heap, level);
- btr_insert_on_non_leaf_level(
- flags, cursor->index, level + 1, node_ptr, mtr);
+ if (btr_insert_on_non_leaf_level(flags, cursor->index(), level + 1,
+ node_ptr, mtr) != DB_SUCCESS) {
+ return nullptr;
+ }
- ut_ad(rec_offs_validate(rec, cursor->index, *offsets));
+ ut_ad(rec_offs_validate(rec, cursor->index(), *offsets));
if (is_leaf
- && !dict_index_is_clust(cursor->index)
- && !cursor->index->table->is_temporary()) {
+ && !dict_index_is_clust(cursor->index())
+ && !cursor->index()->table->is_temporary()) {
/* Update the free bits of the B-tree page in the
insert buffer bitmap. */
@@ -2732,15 +2848,99 @@ btr_insert_into_right_sibling(
}
/*************************************************************//**
+Moves record list end to another page. Moved records include
+split_rec.
+
+IMPORTANT: The caller will have to update IBUF_BITMAP_FREE
+if new_block is a compressed leaf page in a secondary index.
+This has to be done either within the same mini-transaction,
+or by invoking ibuf_reset_free_bits() before mtr_commit().
+
+@return error code */
+static
+dberr_t
+page_move_rec_list_end(
+/*===================*/
+ buf_block_t* new_block, /*!< in/out: index page where to move */
+ buf_block_t* block, /*!< in: index page from where to move */
+ rec_t* split_rec, /*!< in: first record to move */
+ dict_index_t* index, /*!< in: record descriptor */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ page_t* new_page = buf_block_get_frame(new_block);
+ ulint old_data_size;
+ ulint new_data_size;
+ ulint old_n_recs;
+ ulint new_n_recs;
+
+ ut_ad(!dict_index_is_spatial(index));
+
+ old_data_size = page_get_data_size(new_page);
+ old_n_recs = page_get_n_recs(new_page);
+#ifdef UNIV_ZIP_DEBUG
+ {
+ page_zip_des_t* new_page_zip
+ = buf_block_get_page_zip(new_block);
+ page_zip_des_t* page_zip
+ = buf_block_get_page_zip(block);
+ ut_a(!new_page_zip == !page_zip);
+ ut_a(!new_page_zip
+ || page_zip_validate(new_page_zip, new_page, index));
+ ut_a(!page_zip
+ || page_zip_validate(page_zip, page_align(split_rec),
+ index));
+ }
+#endif /* UNIV_ZIP_DEBUG */
+
+ dberr_t err;
+ if (!page_copy_rec_list_end(new_block, block,
+ split_rec, index, mtr, &err)) {
+ return err;
+ }
+
+ new_data_size = page_get_data_size(new_page);
+ new_n_recs = page_get_n_recs(new_page);
+
+ ut_ad(new_data_size >= old_data_size);
+
+ return page_delete_rec_list_end(split_rec, block, index,
+ new_n_recs - old_n_recs,
+ new_data_size - old_data_size, mtr);
+}
+
+/*************************************************************//**
+Moves record list start to another page. Moved records do not include
+split_rec.
+
+IMPORTANT: The caller will have to update IBUF_BITMAP_FREE
+if new_block is a compressed leaf page in a secondary index.
+This has to be done either within the same mini-transaction,
+or by invoking ibuf_reset_free_bits() before mtr_commit().
+
+@return error code */
+static
+dberr_t
+page_move_rec_list_start(
+/*=====================*/
+ buf_block_t* new_block, /*!< in/out: index page where to move */
+ buf_block_t* block, /*!< in/out: page containing split_rec */
+ rec_t* split_rec, /*!< in: first record not to move */
+ dict_index_t* index, /*!< in: record descriptor */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ dberr_t err;
+ if (page_copy_rec_list_start(new_block, block, split_rec, index, mtr, &err))
+ page_delete_rec_list_start(split_rec, block, index, mtr);
+ return err;
+}
+
+/*************************************************************//**
Splits an index page to halves and inserts the tuple. It is assumed
that mtr holds an x-latch to the index tree. NOTE: the tree x-latch is
released within this function! NOTE that the operation of this
function must always succeed, we cannot reverse it: therefore enough
free disk space (2 pages) must be guaranteed to be available before
this function is called.
-NOTE: jonaso added support for calling function with tuple == NULL
-which cause it to only split a page.
-
@return inserted record or NULL if run out of space */
rec_t*
btr_page_split_and_insert(
@@ -2753,7 +2953,8 @@ btr_page_split_and_insert(
mem_heap_t** heap, /*!< in/out: pointer to memory heap, or NULL */
const dtuple_t* tuple, /*!< in: tuple to insert */
ulint n_ext, /*!< in: number of externally stored columns */
- mtr_t* mtr) /*!< in: mtr */
+ mtr_t* mtr, /*!< in: mtr */
+ dberr_t* err) /*!< out: error code */
{
buf_block_t* block;
page_t* page;
@@ -2771,27 +2972,32 @@ btr_page_split_and_insert(
ulint n_iterations = 0;
ulint n_uniq;
- if (cursor->index->is_spatial()) {
+ ut_ad(*err == DB_SUCCESS);
+ ut_ad(dtuple_check_typed(tuple));
+
+ buf_pool.pages_split++;
+
+ if (cursor->index()->is_spatial()) {
/* Split rtree page and update parent */
- return(rtr_page_split_and_insert(flags, cursor, offsets, heap,
- tuple, n_ext, mtr));
+ return rtr_page_split_and_insert(flags, cursor, offsets, heap,
+ tuple, n_ext, mtr, err);
}
if (!*heap) {
*heap = mem_heap_create(1024);
}
- n_uniq = dict_index_get_n_unique_in_tree(cursor->index);
+ n_uniq = dict_index_get_n_unique_in_tree(cursor->index());
func_start:
mem_heap_empty(*heap);
*offsets = NULL;
- ut_ad(mtr->memo_contains_flagged(&cursor->index->lock, MTR_MEMO_X_LOCK
+ ut_ad(mtr->memo_contains_flagged(&cursor->index()->lock,
+ MTR_MEMO_X_LOCK
| MTR_MEMO_SX_LOCK));
- ut_ad(!dict_index_is_online_ddl(cursor->index)
+ ut_ad(!dict_index_is_online_ddl(cursor->index())
|| (flags & BTR_CREATE_FLAG)
- || dict_index_is_clust(cursor->index));
- ut_ad(rw_lock_own_flagged(dict_index_get_lock(cursor->index),
- RW_LOCK_FLAG_X | RW_LOCK_FLAG_SX));
+ || dict_index_is_clust(cursor->index()));
+ ut_ad(cursor->index()->lock.have_u_or_x());
block = btr_cur_get_block(cursor);
page = buf_block_get_frame(block);
@@ -2813,7 +3019,7 @@ func_start:
uint32_t hint_page_no = block->page.id().page_no() + 1;
byte direction = FSP_UP;
- if (tuple && n_iterations > 0) {
+ if (n_iterations > 0) {
split_rec = btr_page_get_split_rec(cursor, tuple, n_ext);
if (split_rec == NULL) {
@@ -2838,20 +3044,23 @@ func_start:
page_get_infimum_rec(page));
} else {
split_rec = NULL;
+ goto got_split_rec;
}
- }
- DBUG_EXECUTE_IF("disk_is_full",
- os_has_said_disk_full = true;
- return(NULL););
+ if (UNIV_UNLIKELY(!split_rec)) {
+ *err = DB_CORRUPTION;
+ return nullptr;
+ }
+ }
+got_split_rec:
/* 2. Allocate a new page to the index */
const uint16_t page_level = btr_page_get_level(page);
- new_block = btr_page_alloc(cursor->index, hint_page_no, direction,
- page_level, mtr, mtr);
+ new_block = btr_page_alloc(cursor->index(), hint_page_no, direction,
+ page_level, mtr, mtr, err);
if (!new_block) {
- return(NULL);
+ return nullptr;
}
new_page = buf_block_get_frame(new_block);
@@ -2862,13 +3071,13 @@ func_start:
to contain FIL_NULL in FIL_PAGE_PREV at this stage. */
memset_aligned<4>(new_page + FIL_PAGE_PREV, 0, 4);
}
- btr_page_create(new_block, new_page_zip, cursor->index,
+ btr_page_create(new_block, new_page_zip, cursor->index(),
page_level, mtr);
/* Only record the leaf level page splits. */
if (!page_level) {
- cursor->index->stat_defrag_n_page_split ++;
- cursor->index->stat_defrag_modified_counter ++;
- btr_defragment_save_defrag_stats_if_needed(cursor->index);
+ cursor->index()->stat_defrag_n_page_split ++;
+ cursor->index()->stat_defrag_modified_counter ++;
+ btr_defragment_save_defrag_stats_if_needed(cursor->index());
}
/* 3. Calculate the first record on the upper half-page, and the
@@ -2878,13 +3087,13 @@ func_start:
if (split_rec) {
first_rec = move_limit = split_rec;
- *offsets = rec_get_offsets(split_rec, cursor->index, *offsets,
- page_is_leaf(page)
- ? cursor->index->n_core_fields : 0,
+ *offsets = rec_get_offsets(split_rec, cursor->index(),
+ *offsets, page_is_leaf(page)
+ ? cursor->index()->n_core_fields
+ : 0,
n_uniq, heap);
- insert_left = !tuple
- || cmp_dtuple_rec(tuple, split_rec, *offsets) < 0;
+ insert_left = cmp_dtuple_rec(tuple, split_rec, *offsets) < 0;
if (!insert_left && new_page_zip && n_iterations > 0) {
/* If a compressed page has already been split,
@@ -2894,62 +3103,63 @@ func_start:
goto insert_empty;
}
} else if (insert_left) {
- ut_a(n_iterations > 0);
+ if (UNIV_UNLIKELY(!n_iterations)) {
+corrupted:
+ *err = DB_CORRUPTION;
+ return nullptr;
+ }
first_rec = page_rec_get_next(page_get_infimum_rec(page));
+insert_move_limit:
move_limit = page_rec_get_next(btr_cur_get_rec(cursor));
+ if (UNIV_UNLIKELY(!first_rec || !move_limit)) {
+ goto corrupted;
+ }
} else {
insert_empty:
ut_ad(!split_rec);
ut_ad(!insert_left);
buf = UT_NEW_ARRAY_NOKEY(
byte,
- rec_get_converted_size(cursor->index, tuple, n_ext));
+ rec_get_converted_size(cursor->index(), tuple, n_ext));
- first_rec = rec_convert_dtuple_to_rec(buf, cursor->index,
+ first_rec = rec_convert_dtuple_to_rec(buf, cursor->index(),
tuple, n_ext);
- move_limit = page_rec_get_next(btr_cur_get_rec(cursor));
+ goto insert_move_limit;
}
/* 4. Do first the modifications in the tree structure */
/* FIXME: write FIL_PAGE_PREV,FIL_PAGE_NEXT in new_block earlier! */
- btr_attach_half_pages(flags, cursor->index, block,
- first_rec, new_block, direction, mtr);
+ *err = btr_attach_half_pages(flags, cursor->index(), block,
+ first_rec, new_block, direction, mtr);
+ if (UNIV_UNLIKELY(*err != DB_SUCCESS)) {
+ return nullptr;
+ }
+
+#ifdef UNIV_DEBUG
/* If the split is made on the leaf level and the insert will fit
on the appropriate half-page, we may release the tree x-latch.
We can then move the records after releasing the tree latch,
thus reducing the tree latch contention. */
- bool insert_will_fit;
- if (tuple == NULL) {
- insert_will_fit = true;
- } else if (split_rec) {
- insert_will_fit = !new_page_zip
- && btr_page_insert_fits(cursor, split_rec,
- offsets, tuple, n_ext, heap);
- } else {
- if (!insert_left) {
- UT_DELETE_ARRAY(buf);
- buf = NULL;
- }
-
- insert_will_fit = !new_page_zip
- && btr_page_insert_fits(cursor, NULL,
- offsets, tuple, n_ext, heap);
+ const bool insert_will_fit = !new_page_zip
+ && btr_page_insert_fits(cursor, split_rec, offsets, tuple,
+ n_ext, heap);
+#endif
+ if (!split_rec && !insert_left) {
+ UT_DELETE_ARRAY(buf);
+ buf = NULL;
}
- if (!srv_read_only_mode
- && insert_will_fit
+#if 0 // FIXME: this used to be a no-op, and may cause trouble if enabled
+ if (insert_will_fit
&& page_is_leaf(page)
- && !dict_index_is_online_ddl(cursor->index)) {
-
- mtr->memo_release(
- dict_index_get_lock(cursor->index),
- MTR_MEMO_X_LOCK | MTR_MEMO_SX_LOCK);
-
+ && !dict_index_is_online_ddl(cursor->index())) {
+ mtr->release(cursor->index()->lock);
/* NOTE: We cannot release root block latch here, because it
has segment header and already modified in most of cases.*/
}
+#endif
/* 5. Move then the records to the new page */
if (direction == FSP_DOWN) {
@@ -2959,26 +3169,39 @@ insert_empty:
#ifdef UNIV_ZIP_COPY
|| page_zip
#endif /* UNIV_ZIP_COPY */
- || !page_move_rec_list_start(new_block, block, move_limit,
- cursor->index, mtr)) {
- /* For some reason, compressing new_page failed,
+ || (*err = page_move_rec_list_start(new_block, block,
+ move_limit,
+ cursor->index(),
+ mtr))) {
+ if (*err != DB_FAIL) {
+ return nullptr;
+ }
+
+ /* For some reason, compressing new_block failed,
even though it should contain fewer records than
the original page. Copy the page byte for byte
and then delete the records from both pages
as appropriate. Deleting will always succeed. */
ut_a(new_page_zip);
- page_zip_copy_recs(new_block,
- page_zip, page, cursor->index, mtr);
- page_delete_rec_list_end(move_limit - page + new_page,
- new_block, cursor->index,
- ULINT_UNDEFINED,
- ULINT_UNDEFINED, mtr);
+ page_zip_copy_recs(new_block, page_zip, page,
+ cursor->index(), mtr);
+ *err = page_delete_rec_list_end(move_limit
+ - page + new_page,
+ new_block,
+ cursor->index(),
+ ULINT_UNDEFINED,
+ ULINT_UNDEFINED, mtr);
+ if (*err != DB_SUCCESS) {
+ return nullptr;
+ }
/* Update the lock table and possible hash index. */
- lock_move_rec_list_start(
- new_block, block, move_limit,
- new_page + PAGE_NEW_INFIMUM);
+ if (cursor->index()->has_locking()) {
+ lock_move_rec_list_start(
+ new_block, block, move_limit,
+ new_page + PAGE_NEW_INFIMUM);
+ }
btr_search_move_or_delete_hash_entries(
new_block, block);
@@ -2986,13 +3209,13 @@ insert_empty:
/* Delete the records from the source page. */
page_delete_rec_list_start(move_limit, block,
- cursor->index, mtr);
+ cursor->index(), mtr);
}
left_block = new_block;
right_block = block;
- if (!dict_table_is_locking_disabled(cursor->index->table)) {
+ if (cursor->index()->has_locking()) {
lock_update_split_left(right_block, left_block);
}
} else {
@@ -3002,8 +3225,13 @@ insert_empty:
#ifdef UNIV_ZIP_COPY
|| page_zip
#endif /* UNIV_ZIP_COPY */
- || !page_move_rec_list_end(new_block, block, move_limit,
- cursor->index, mtr)) {
+ || (*err = page_move_rec_list_end(new_block, block,
+ move_limit,
+ cursor->index(), mtr))) {
+ if (*err != DB_FAIL) {
+ return nullptr;
+ }
+
/* For some reason, compressing new_page failed,
even though it should contain fewer records than
the original page. Copy the page byte for byte
@@ -3011,38 +3239,45 @@ insert_empty:
as appropriate. Deleting will always succeed. */
ut_a(new_page_zip);
- page_zip_copy_recs(new_block,
- page_zip, page, cursor->index, mtr);
+ page_zip_copy_recs(new_block, page_zip, page,
+ cursor->index(), mtr);
page_delete_rec_list_start(move_limit - page
+ new_page, new_block,
- cursor->index, mtr);
+ cursor->index(), mtr);
/* Update the lock table and possible hash index. */
- lock_move_rec_list_end(new_block, block, move_limit);
+ if (cursor->index()->has_locking()) {
+ lock_move_rec_list_end(new_block, block,
+ move_limit);
+ }
btr_search_move_or_delete_hash_entries(
new_block, block);
/* Delete the records from the source page. */
- page_delete_rec_list_end(move_limit, block,
- cursor->index,
- ULINT_UNDEFINED,
- ULINT_UNDEFINED, mtr);
+ *err = page_delete_rec_list_end(move_limit, block,
+ cursor->index(),
+ ULINT_UNDEFINED,
+ ULINT_UNDEFINED, mtr);
+ if (*err != DB_SUCCESS) {
+ return nullptr;
+ }
}
left_block = block;
right_block = new_block;
- if (!dict_table_is_locking_disabled(cursor->index->table)) {
+ if (cursor->index()->has_locking()) {
lock_update_split_right(right_block, left_block);
}
}
#ifdef UNIV_ZIP_DEBUG
if (page_zip) {
- ut_a(page_zip_validate(page_zip, page, cursor->index));
- ut_a(page_zip_validate(new_page_zip, new_page, cursor->index));
+ ut_a(page_zip_validate(page_zip, page, cursor->index()));
+ ut_a(page_zip_validate(new_page_zip, new_page,
+ cursor->index()));
}
#endif /* UNIV_ZIP_DEBUG */
@@ -3055,17 +3290,20 @@ insert_empty:
buf_block_t* const insert_block = insert_left
? left_block : right_block;
- if (UNIV_UNLIKELY(!tuple)) {
- rec = NULL;
- goto func_exit;
- }
-
/* 7. Reposition the cursor for insert and try insertion */
page_cursor = btr_cur_get_page_cur(cursor);
+ page_cursor->block = insert_block;
+
+ ulint up_match = 0, low_match = 0;
- page_cur_search(insert_block, cursor->index, tuple, page_cursor);
+ if (page_cur_search_with_match(tuple,
+ PAGE_CUR_LE, &up_match, &low_match,
+ page_cursor, nullptr)) {
+ *err = DB_CORRUPTION;
+ return nullptr;
+ }
- rec = page_cur_tuple_insert(page_cursor, tuple, cursor->index,
+ rec = page_cur_tuple_insert(page_cursor, tuple,
offsets, heap, n_ext, mtr);
#ifdef UNIV_ZIP_DEBUG
@@ -3078,7 +3316,7 @@ insert_empty:
ut_a(!insert_page_zip
|| page_zip_validate(insert_page_zip, insert_page,
- cursor->index));
+ cursor->index()));
}
#endif /* UNIV_ZIP_DEBUG */
@@ -3091,13 +3329,17 @@ insert_empty:
For compressed pages, page_cur_tuple_insert() will have
attempted this already. */
- if (page_cur_get_page_zip(page_cursor)
- || !btr_page_reorganize(page_cursor, cursor->index, mtr)) {
-
+ if (page_cur_get_page_zip(page_cursor)) {
goto insert_failed;
}
- rec = page_cur_tuple_insert(page_cursor, tuple, cursor->index,
+ *err = btr_page_reorganize(page_cursor, mtr);
+
+ if (*err != DB_SUCCESS) {
+ return nullptr;
+ }
+
+ rec = page_cur_tuple_insert(page_cursor, tuple,
offsets, heap, n_ext, mtr);
if (rec == NULL) {
@@ -3105,8 +3347,8 @@ insert_empty:
start of the function for a new split */
insert_failed:
/* We play safe and reset the free bits for new_page */
- if (!dict_index_is_clust(cursor->index)
- && !cursor->index->table->is_temporary()) {
+ if (!dict_index_is_clust(page_cursor->index)
+ && !page_cursor->index->table->is_temporary()) {
ibuf_reset_free_bits(new_block);
ibuf_reset_free_bits(block);
}
@@ -3123,21 +3365,20 @@ func_exit:
/* Insert fit on the page: update the free bits for the
left and right pages in the same mtr */
- if (!dict_index_is_clust(cursor->index)
- && !cursor->index->table->is_temporary()
+ if (!dict_index_is_clust(page_cursor->index)
+ && !page_cursor->index->table->is_temporary()
&& page_is_leaf(page)) {
ibuf_update_free_bits_for_two_pages_low(
left_block, right_block, mtr);
}
- MONITOR_INC(MONITOR_INDEX_SPLIT);
-
- ut_ad(page_validate(buf_block_get_frame(left_block), cursor->index));
- ut_ad(page_validate(buf_block_get_frame(right_block), cursor->index));
+ ut_ad(page_validate(buf_block_get_frame(left_block),
+ page_cursor->index));
+ ut_ad(page_validate(buf_block_get_frame(right_block),
+ page_cursor->index));
- ut_ad(tuple || !rec);
- ut_ad(!rec || rec_offs_validate(rec, cursor->index, *offsets));
+ ut_ad(!rec || rec_offs_validate(rec, page_cursor->index, *offsets));
return(rec);
}
@@ -3148,68 +3389,69 @@ func_exit:
dberr_t btr_level_list_remove(const buf_block_t& block,
const dict_index_t& index, mtr_t* mtr)
{
- ut_ad(mtr->memo_contains_flagged(&block, MTR_MEMO_PAGE_X_FIX));
- ut_ad(block.zip_size() == index.table->space->zip_size());
- ut_ad(index.table->space->id == block.page.id().space());
- /* Get the previous and next page numbers of page */
-
- const page_t* page = block.frame;
- const uint32_t prev_page_no = btr_page_get_prev(page);
- const uint32_t next_page_no = btr_page_get_next(page);
-
- /* Update page links of the level */
-
- if (prev_page_no != FIL_NULL) {
- buf_block_t* prev_block = btr_block_get(
- index, prev_page_no, RW_X_LATCH, page_is_leaf(page),
- mtr);
-#ifdef UNIV_BTR_DEBUG
- ut_a(page_is_comp(prev_block->frame) == page_is_comp(page));
- static_assert(FIL_PAGE_NEXT % 4 == 0, "alignment");
- static_assert(FIL_PAGE_OFFSET % 4 == 0, "alignment");
- ut_a(!memcmp_aligned<4>(prev_block->frame + FIL_PAGE_NEXT,
- page + FIL_PAGE_OFFSET, 4));
-#endif /* UNIV_BTR_DEBUG */
-
- btr_page_set_next(prev_block, next_page_no, mtr);
- }
+ ut_ad(mtr->memo_contains_flagged(&block, MTR_MEMO_PAGE_X_FIX));
+ ut_ad(block.zip_size() == index.table->space->zip_size());
+ ut_ad(index.table->space->id == block.page.id().space());
+ /* Get the previous and next page numbers of page */
+ const uint32_t prev_page_no= btr_page_get_prev(block.page.frame);
+ const uint32_t next_page_no= btr_page_get_next(block.page.frame);
+ page_id_t id{block.page.id()};
+ buf_block_t *prev= nullptr, *next;
+ dberr_t err;
+
+ /* Update page links of the level */
+ if (prev_page_no != FIL_NULL)
+ {
+ id.set_page_no(prev_page_no);
+ prev= mtr->get_already_latched(id, MTR_MEMO_PAGE_X_FIX);
+#if 1 /* MDEV-29835 FIXME: acquire page latches upfront */
+ if (!prev)
+ {
+ ut_ad(mtr->memo_contains(index.lock, MTR_MEMO_X_LOCK));
+ prev= btr_block_get(index, id.page_no(), RW_X_LATCH,
+ page_is_leaf(block.page.frame), mtr, &err);
+ if (UNIV_UNLIKELY(!prev))
+ return err;
+ }
+#endif
+ }
- if (next_page_no != FIL_NULL) {
- buf_block_t* next_block = btr_block_get(
- index, next_page_no, RW_X_LATCH, page_is_leaf(page),
- mtr);
+ if (next_page_no != FIL_NULL)
+ {
+ id.set_page_no(next_page_no);
+ next= mtr->get_already_latched(id, MTR_MEMO_PAGE_X_FIX);
+#if 1 /* MDEV-29835 FIXME: acquire page latches upfront */
+ if (!next)
+ {
+ ut_ad(mtr->memo_contains(index.lock, MTR_MEMO_X_LOCK));
+ next= btr_block_get(index, id.page_no(), RW_X_LATCH,
+ page_is_leaf(block.page.frame), mtr, &err);
+ if (UNIV_UNLIKELY(!next))
+ return err;
+ }
+#endif
+ btr_page_set_prev(next, prev_page_no, mtr);
+ }
- if (!next_block) {
- return DB_ERROR;
- }
-#ifdef UNIV_BTR_DEBUG
- ut_a(page_is_comp(next_block->frame) == page_is_comp(page));
- static_assert(FIL_PAGE_PREV % 4 == 0, "alignment");
- static_assert(FIL_PAGE_OFFSET % 4 == 0, "alignment");
- ut_a(!memcmp_aligned<4>(next_block->frame + FIL_PAGE_PREV,
- page + FIL_PAGE_OFFSET, 4));
-#endif /* UNIV_BTR_DEBUG */
+ if (prev)
+ btr_page_set_next(prev, next_page_no, mtr);
- btr_page_set_prev(next_block, prev_page_no, mtr);
- }
-
- return DB_SUCCESS;
+ return DB_SUCCESS;
}
/*************************************************************//**
If page is the only on its level, this function moves its records to the
father page, thus reducing the tree height.
@return father block */
-UNIV_INTERN
buf_block_t*
btr_lift_page_up(
-/*=============*/
dict_index_t* index, /*!< in: index tree */
buf_block_t* block, /*!< in: page which is the only on its level;
must not be empty: use
btr_discard_only_page_on_level if the last
record from the page should be removed */
- mtr_t* mtr) /*!< in: mtr */
+ mtr_t* mtr, /*!< in/out: mini-transaction */
+ dberr_t* err) /*!< out: error code */
{
buf_block_t* father_block;
ulint page_level;
@@ -3237,14 +3479,14 @@ btr_lift_page_up(
* (REC_OFFS_HEADER_SIZE + 1 + 1
+ unsigned(index->n_fields)));
buf_block_t* b;
+ cursor.page_cur.index = index;
+ cursor.page_cur.block = block;
- if (dict_index_is_spatial(index)) {
+ if (index->is_spatial()) {
offsets = rtr_page_get_father_block(
- NULL, heap, index, block, mtr,
- NULL, &cursor);
+ nullptr, heap, mtr, nullptr, &cursor);
} else {
offsets = btr_page_get_father_block(offsets, heap,
- index, block,
mtr, &cursor);
}
father_block = btr_cur_get_block(&cursor);
@@ -3261,14 +3503,12 @@ btr_lift_page_up(
b->page.id().page_no() != root_page_no; ) {
ut_a(n_blocks < BTR_MAX_LEVELS);
- if (dict_index_is_spatial(index)) {
+ if (index->is_spatial()) {
offsets = rtr_page_get_father_block(
- NULL, heap, index, b, mtr,
- NULL, &cursor);
+ nullptr, heap, mtr, nullptr, &cursor);
} else {
offsets = btr_page_get_father_block(offsets,
heap,
- index, b,
mtr,
&cursor);
}
@@ -3306,7 +3546,7 @@ btr_lift_page_up(
/* Make the father empty */
btr_page_empty(father_block, father_page_zip, index, page_level, mtr);
/* btr_page_empty() is supposed to zero-initialize the field. */
- ut_ad(!page_get_instant(father_block->frame));
+ ut_ad(!page_get_instant(father_block->page.frame));
if (index->is_instant()
&& father_block->page.id().page_no() == root_page_no) {
@@ -3333,7 +3573,17 @@ btr_lift_page_up(
#endif /* UNIV_ZIP_COPY */
|| !page_copy_rec_list_end(father_block, block,
page_get_infimum_rec(page),
- index, mtr)) {
+ index, mtr, err)) {
+ switch (*err) {
+ case DB_SUCCESS:
+ break;
+ case DB_FAIL:
+ *err = DB_SUCCESS;
+ break;
+ default:
+ return nullptr;
+ }
+
const page_zip_des_t* page_zip
= buf_block_get_page_zip(block);
ut_a(father_page_zip);
@@ -3345,12 +3595,14 @@ btr_lift_page_up(
/* Update the lock table and possible hash index. */
- lock_move_rec_list_end(father_block, block,
- page_get_infimum_rec(page));
+ if (index->has_locking()) {
+ lock_move_rec_list_end(father_block, block,
+ page_get_infimum_rec(page));
+ }
/* Also update the predicate locks */
if (dict_index_is_spatial(index)) {
- lock_prdt_rec_move(father_block, block);
+ lock_prdt_rec_move(father_block, block->page.id());
} else {
btr_search_move_or_delete_hash_entries(
father_block, block);
@@ -3358,22 +3610,22 @@ btr_lift_page_up(
}
copied:
- if (!dict_table_is_locking_disabled(index->table)) {
+ if (index->has_locking()) {
+ const page_id_t id{block->page.id()};
/* Free predicate page locks on the block */
- if (dict_index_is_spatial(index)) {
- lock_mutex_enter();
- lock_prdt_page_free_from_discard(
- block, &lock_sys.prdt_page_hash);
- lock_mutex_exit();
+ if (index->is_spatial()) {
+ lock_sys.prdt_page_free_from_discard(id);
+ } else {
+ lock_update_copy_and_discard(*father_block, id);
}
- lock_update_copy_and_discard(father_block, block);
}
page_level++;
/* Go upward to root page, decrementing levels by one. */
for (i = lift_father_up ? 1 : 0; i < n_blocks; i++, page_level++) {
- ut_ad(btr_page_get_level(blocks[i]->frame) == page_level + 1);
+ ut_ad(btr_page_get_level(blocks[i]->page.frame)
+ == page_level + 1);
btr_page_set_level(blocks[i], page_level, mtr);
}
@@ -3389,7 +3641,7 @@ copied:
&& !index->table->is_temporary()) {
ibuf_reset_free_bits(father_block);
}
- ut_ad(page_validate(father_block->frame, index));
+ ut_ad(page_validate(father_block->page.frame, index));
ut_ad(btr_check_node_ptr(index, father_block, mtr));
return(lift_father_up ? block_orig : father_block);
@@ -3404,21 +3656,21 @@ level lifts the records of the page to the father page, thus reducing the
tree height. It is assumed that mtr holds an x-latch on the tree and on the
page. If cursor is on the leaf level, mtr must also hold x-latches to the
brothers, if they exist.
-@return TRUE on success */
-ibool
+@return error code */
+dberr_t
btr_compress(
/*=========*/
btr_cur_t* cursor, /*!< in/out: cursor on the page to merge
or lift; the page must not be empty:
when deleting records, use btr_discard_page()
if the page would become empty */
- ibool adjust, /*!< in: TRUE if should adjust the
- cursor position even if compression occurs */
+ bool adjust, /*!< in: whether the cursor position should be
+ adjusted even when compression occurs */
mtr_t* mtr) /*!< in/out: mini-transaction */
{
dict_index_t* index;
- buf_block_t* merge_block;
- page_t* merge_page = NULL;
+ buf_block_t* merge_block = nullptr;
+ page_t* merge_page = nullptr;
page_zip_des_t* merge_page_zip;
ibool is_left;
buf_block_t* block;
@@ -3437,8 +3689,6 @@ btr_compress(
page = btr_cur_get_page(cursor);
index = btr_cur_get_index(cursor);
- btr_assert_not_corrupted(block, index);
-
ut_ad(mtr->memo_contains_flagged(&index->lock, MTR_MEMO_X_LOCK
| MTR_MEMO_SX_LOCK));
ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX));
@@ -3447,20 +3697,20 @@ btr_compress(
const uint32_t left_page_no = btr_page_get_prev(page);
const uint32_t right_page_no = btr_page_get_next(page);
+ dberr_t err = DB_SUCCESS;
-#ifdef UNIV_DEBUG
- if (!page_is_leaf(page) && left_page_no == FIL_NULL) {
- ut_a(REC_INFO_MIN_REC_FLAG & rec_get_info_bits(
- page_rec_get_next(page_get_infimum_rec(page)),
- page_is_comp(page)));
- }
-#endif /* UNIV_DEBUG */
+ ut_ad(page_is_leaf(page) || left_page_no != FIL_NULL
+ || (REC_INFO_MIN_REC_FLAG & rec_get_info_bits(
+ page_rec_get_next(page_get_infimum_rec(page)),
+ page_is_comp(page))));
heap = mem_heap_create(100);
+ father_cursor.page_cur.index = index;
+ father_cursor.page_cur.block = block;
- if (dict_index_is_spatial(index)) {
+ if (index->is_spatial()) {
offsets = rtr_page_get_father_block(
- NULL, heap, index, block, mtr, cursor, &father_cursor);
+ NULL, heap, mtr, cursor, &father_cursor);
ut_ad(cursor->page_cur.block->page.id() == block->page.id());
rec_t* my_rec = father_cursor.page_cur.rec;
@@ -3471,24 +3721,51 @@ btr_compress(
<< page_no << "instead of "
<< block->page.id().page_no();
offsets = btr_page_get_father_block(
- NULL, heap, index, block, mtr, &father_cursor);
+ NULL, heap, mtr, &father_cursor);
}
} else {
offsets = btr_page_get_father_block(
- NULL, heap, index, block, mtr, &father_cursor);
+ NULL, heap, mtr, &father_cursor);
}
if (adjust) {
nth_rec = page_rec_get_n_recs_before(btr_cur_get_rec(cursor));
- ut_ad(nth_rec > 0);
+ if (UNIV_UNLIKELY(!nth_rec || nth_rec == ULINT_UNDEFINED)) {
+ corrupted:
+ err = DB_CORRUPTION;
+ err_exit:
+ /* We play it safe and reset the free bits. */
+ if (merge_block && merge_block->zip_size()
+ && page_is_leaf(merge_block->page.frame)
+ && !index->is_clust()) {
+ ibuf_reset_free_bits(merge_block);
+ }
+ goto func_exit;
+ }
}
if (left_page_no == FIL_NULL && right_page_no == FIL_NULL) {
/* The page is the only one on the level, lift the records
to the father */
- merge_block = btr_lift_page_up(index, block, mtr);
- goto func_exit;
+ merge_block = btr_lift_page_up(index, block, mtr, &err);
+success:
+ if (adjust) {
+ ut_ad(nth_rec > 0);
+ if (rec_t* nth
+ = page_rec_get_nth(merge_block->page.frame,
+ nth_rec)) {
+ btr_cur_position(index, nth,
+ merge_block, cursor);
+ } else {
+ goto corrupted;
+ }
+ }
+
+ MONITOR_INC(MONITOR_INDEX_MERGE_SUCCESSFUL);
+func_exit:
+ mem_heap_free(heap);
+ DBUG_RETURN(err);
}
ut_d(leftmost_child =
@@ -3512,20 +3789,20 @@ retry:
if (!merge_block) {
merge_page = NULL;
}
+cannot_merge:
+ err = DB_FAIL;
goto err_exit;
}
merge_page = buf_block_get_frame(merge_block);
-#ifdef UNIV_BTR_DEBUG
- if (is_left) {
- ut_a(btr_page_get_next(merge_page)
- == block->page.id().page_no());
- } else {
- ut_a(btr_page_get_prev(merge_page)
- == block->page.id().page_no());
+ if (UNIV_UNLIKELY(memcmp_aligned<4>(merge_page + (is_left
+ ? FIL_PAGE_NEXT
+ : FIL_PAGE_PREV),
+ block->page.frame
+ + FIL_PAGE_OFFSET, 4))) {
+ goto corrupted;
}
-#endif /* UNIV_BTR_DEBUG */
ut_ad(page_validate(merge_page, index));
@@ -3540,9 +3817,12 @@ retry:
}
#endif /* UNIV_ZIP_DEBUG */
+ btr_cur_t cursor2;
+ cursor2.page_cur.index = index;
+ cursor2.page_cur.block = merge_block;
+
/* Move records to the merge page */
if (is_left) {
- btr_cur_t cursor2;
rtr_mbr_t new_mbr;
rec_offs* offsets2 = NULL;
@@ -3552,8 +3832,7 @@ retry:
page */
if (!rtr_check_same_block(
index, &cursor2,
- btr_cur_get_block(&father_cursor),
- merge_block, heap)) {
+ btr_cur_get_block(&father_cursor), heap)) {
is_left = false;
goto retry;
}
@@ -3565,7 +3844,7 @@ retry:
offsets2 = rec_get_offsets(
btr_cur_get_rec(&cursor2), index, NULL,
- page_is_leaf(cursor2.page_cur.block->frame)
+ page_is_leaf(btr_cur_get_page(&cursor2))
? index->n_fields : 0,
ULINT_UNDEFINED, &heap);
@@ -3577,7 +3856,7 @@ retry:
rec_t* orig_pred = page_copy_rec_list_start(
merge_block, block, page_get_supremum_rec(page),
- index, mtr);
+ index, mtr, &err);
if (!orig_pred) {
goto err_exit;
@@ -3586,11 +3865,15 @@ retry:
btr_search_drop_page_hash_index(block, false);
/* Remove the page from the level list */
- if (DB_SUCCESS != btr_level_list_remove(*block, *index, mtr)) {
+ err = btr_level_list_remove(*block, *index, mtr);
+
+ if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
goto err_exit;
}
- if (dict_index_is_spatial(index)) {
+ const page_id_t id{block->page.id()};
+
+ if (index->is_spatial()) {
rec_t* my_rec = father_cursor.page_cur.rec;
ulint page_no = btr_node_ptr_get_child_page_no(
@@ -3611,53 +3894,48 @@ retry:
}
/* No GAP lock needs to be worrying about */
- lock_mutex_enter();
- lock_prdt_page_free_from_discard(
- block, &lock_sys.prdt_page_hash);
- lock_rec_free_all_from_discard_page(block);
- lock_mutex_exit();
+ lock_sys.prdt_page_free_from_discard(id);
} else {
- btr_cur_node_ptr_delete(&father_cursor, mtr);
- if (!dict_table_is_locking_disabled(index->table)) {
+ err = btr_cur_node_ptr_delete(&father_cursor, mtr);
+ if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
+ goto err_exit;
+ }
+ if (index->has_locking()) {
lock_update_merge_left(
- merge_block, orig_pred, block);
+ *merge_block, orig_pred, id);
}
}
if (adjust) {
- nth_rec += page_rec_get_n_recs_before(orig_pred);
+ ulint n = page_rec_get_n_recs_before(orig_pred);
+ if (UNIV_UNLIKELY(!n || n == ULINT_UNDEFINED)) {
+ goto corrupted;
+ }
+ nth_rec += n;
}
} else {
rec_t* orig_succ;
ibool compressed;
dberr_t err;
- btr_cur_t cursor2;
- /* father cursor pointing to node ptr
- of the right sibling */
-#ifdef UNIV_BTR_DEBUG
byte fil_page_prev[4];
-#endif /* UNIV_BTR_DEBUG */
-
- if (dict_index_is_spatial(index)) {
- cursor2.rtr_info = NULL;
+ if (index->is_spatial()) {
/* For spatial index, we disallow merge of blocks
with different parents, since the merge would need
to update entry (for MBR and Primary key) in the
parent of block being merged */
if (!rtr_check_same_block(
index, &cursor2,
- btr_cur_get_block(&father_cursor),
- merge_block, heap)) {
- goto err_exit;
+ btr_cur_get_block(&father_cursor), heap)) {
+ goto cannot_merge;
}
/* Set rtr_info for cursor2, since it is
necessary in recursive page merge. */
cursor2.rtr_info = cursor->rtr_info;
cursor2.tree_height = cursor->tree_height;
- } else {
- btr_page_get_father(index, merge_block, mtr, &cursor2);
+ } else if (!btr_page_get_father(mtr, &cursor2)) {
+ goto cannot_merge;
}
if (merge_page_zip && left_page_no == FIL_NULL) {
@@ -3667,33 +3945,28 @@ retry:
requires that FIL_PAGE_PREV be FIL_NULL.
Clear the field, but prepare to restore it. */
static_assert(FIL_PAGE_PREV % 8 == 0, "alignment");
-#ifdef UNIV_BTR_DEBUG
memcpy(fil_page_prev, merge_page + FIL_PAGE_PREV, 4);
-#endif /* UNIV_BTR_DEBUG */
compile_time_assert(FIL_NULL == 0xffffffffU);
memset_aligned<4>(merge_page + FIL_PAGE_PREV, 0xff, 4);
}
orig_succ = page_copy_rec_list_end(merge_block, block,
page_get_infimum_rec(page),
- cursor->index, mtr);
+ cursor->index(), mtr, &err);
if (!orig_succ) {
ut_a(merge_page_zip);
-#ifdef UNIV_BTR_DEBUG
if (left_page_no == FIL_NULL) {
/* FIL_PAGE_PREV was restored from
merge_page_zip. */
- ut_a(!memcmp(fil_page_prev,
- merge_page + FIL_PAGE_PREV, 4));
+ ut_ad(!memcmp(fil_page_prev,
+ merge_page + FIL_PAGE_PREV, 4));
}
-#endif /* UNIV_BTR_DEBUG */
goto err_exit;
}
btr_search_drop_page_hash_index(block, false);
-#ifdef UNIV_BTR_DEBUG
if (merge_page_zip && left_page_no == FIL_NULL) {
/* Restore FIL_PAGE_PREV in order to avoid an assertion
@@ -3704,10 +3977,11 @@ retry:
are X-latched. */
memcpy(merge_page + FIL_PAGE_PREV, fil_page_prev, 4);
}
-#endif /* UNIV_BTR_DEBUG */
/* Remove the page from the level list */
- if (DB_SUCCESS != btr_level_list_remove(*block, *index, mtr)) {
+ err = btr_level_list_remove(*block, *index, mtr);
+
+ if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
goto err_exit;
}
@@ -3738,7 +4012,7 @@ retry:
offsets2 = rec_get_offsets(
btr_cur_get_rec(&cursor2), index, NULL,
- page_is_leaf(cursor2.page_cur.block->frame)
+ page_is_leaf(btr_cur_get_page(&cursor2))
? index->n_fields : 0,
ULINT_UNDEFINED, &heap);
@@ -3767,11 +4041,8 @@ retry:
offsets2, offsets,
merge_page, mtr);
}
- lock_mutex_enter();
- lock_prdt_page_free_from_discard(
- block, &lock_sys.prdt_page_hash);
- lock_rec_free_all_from_discard_page(block);
- lock_mutex_exit();
+ const page_id_t id{block->page.id()};
+ lock_sys.prdt_page_free_from_discard(id);
} else {
compressed = btr_cur_pessimistic_delete(&err, TRUE,
@@ -3781,12 +4052,11 @@ retry:
ut_a(err == DB_SUCCESS);
if (!compressed) {
- btr_cur_compress_if_useful(&cursor2,
- FALSE,
+ btr_cur_compress_if_useful(&cursor2, false,
mtr);
}
- if (!dict_table_is_locking_disabled(index->table)) {
+ if (index->has_locking()) {
lock_update_merge_right(
merge_block, orig_succ, block);
}
@@ -3847,39 +4117,14 @@ retry:
}
/* Free the file page */
- btr_page_free(index, block, mtr);
-
- /* btr_check_node_ptr() needs parent block latched.
- If the merge_block's parent block is not same,
- we cannot use btr_check_node_ptr() */
- ut_ad(leftmost_child
- || btr_check_node_ptr(index, merge_block, mtr));
-func_exit:
- mem_heap_free(heap);
-
- if (adjust) {
- ut_ad(nth_rec > 0);
- btr_cur_position(
- index,
- page_rec_get_nth(merge_block->frame, nth_rec),
- merge_block, cursor);
- }
-
- MONITOR_INC(MONITOR_INDEX_MERGE_SUCCESSFUL);
-
- DBUG_RETURN(TRUE);
-
-err_exit:
- /* We play it safe and reset the free bits. */
- if (merge_block && merge_block->zip_size()
- && page_is_leaf(merge_block->frame)
- && !dict_index_is_clust(index)) {
-
- ibuf_reset_free_bits(merge_block);
- }
-
- mem_heap_free(heap);
- DBUG_RETURN(FALSE);
+ err = btr_page_free(index, block, mtr);
+ if (err == DB_SUCCESS) {
+ ut_ad(leftmost_child
+ || btr_check_node_ptr(index, merge_block, mtr));
+ goto success;
+ } else {
+ goto err_exit;
+ }
}
/*************************************************************//**
@@ -3901,8 +4146,12 @@ btr_discard_only_page_on_level(
ut_ad(!index->is_dummy);
/* Save the PAGE_MAX_TRX_ID from the leaf page. */
- const trx_id_t max_trx_id = page_get_max_trx_id(block->frame);
- const rec_t* r = page_rec_get_next(page_get_infimum_rec(block->frame));
+ const trx_id_t max_trx_id = page_get_max_trx_id(block->page.frame);
+ const rec_t* r = page_rec_get_next(
+ page_get_infimum_rec(block->page.frame));
+ /* In the caller we checked that a valid key exists in the page,
+ because we were able to look up a parent page. */
+ ut_ad(r);
ut_ad(rec_is_metadata(r, *index) == index->is_instant());
while (block->page.id().page_no() != dict_index_get_page(index)) {
@@ -3917,23 +4166,31 @@ btr_discard_only_page_on_level(
ut_ad(block->page.id().space() == index->table->space->id);
ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX));
btr_search_drop_page_hash_index(block, false);
+ cursor.page_cur.index = index;
+ cursor.page_cur.block = block;
- if (dict_index_is_spatial(index)) {
+ if (index->is_spatial()) {
/* Check any concurrent search having this page */
rtr_check_discard_page(index, NULL, block);
- rtr_page_get_father(index, block, mtr, NULL, &cursor);
+ if (!rtr_page_get_father(mtr, nullptr, &cursor)) {
+ return;
+ }
} else {
- btr_page_get_father(index, block, mtr, &cursor);
+ if (!btr_page_get_father(mtr, &cursor)) {
+ return;
+ }
}
father = btr_cur_get_block(&cursor);
- if (!dict_table_is_locking_disabled(index->table)) {
+ if (index->has_locking()) {
lock_update_discard(
father, PAGE_HEAP_NO_SUPREMUM, block);
}
/* Free the file page */
- btr_page_free(index, block, mtr);
+ if (btr_page_free(index, block, mtr) != DB_SUCCESS) {
+ return;
+ }
block = father;
page_level++;
@@ -3941,18 +4198,7 @@ btr_discard_only_page_on_level(
/* block is the root page, which must be empty, except
for the node pointer to the (now discarded) block(s). */
- ut_ad(!page_has_siblings(block->frame));
-
-#ifdef UNIV_BTR_DEBUG
- if (!dict_index_is_ibuf(index)) {
- const page_t* root = buf_block_get_frame(block);
- const ulint space = index->table->space_id;
- ut_a(btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_LEAF
- + root, space));
- ut_a(btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_TOP
- + root, space));
- }
-#endif /* UNIV_BTR_DEBUG */
+ ut_ad(!page_has_siblings(block->page.frame));
mem_heap_t* heap = nullptr;
const rec_t* rec = nullptr;
@@ -3975,17 +4221,17 @@ btr_discard_only_page_on_level(
btr_page_empty(block, buf_block_get_page_zip(block), index, 0, mtr);
ut_ad(page_is_leaf(buf_block_get_frame(block)));
/* btr_page_empty() is supposed to zero-initialize the field. */
- ut_ad(!page_get_instant(block->frame));
+ ut_ad(!page_get_instant(block->page.frame));
if (index->is_primary()) {
if (rec) {
page_cur_t cur;
page_cur_set_before_first(block, &cur);
+ cur.index = index;
DBUG_ASSERT(index->table->instant);
DBUG_ASSERT(rec_is_alter_metadata(rec, *index));
btr_set_instant(block, *index, mtr);
- rec = page_cur_insert_rec_low(&cur, index, rec,
- offsets, mtr);
+ rec = page_cur_insert_rec_low(&cur, rec, offsets, mtr);
ut_ad(rec);
mem_heap_free(heap);
} else if (index->is_instant()) {
@@ -4006,7 +4252,7 @@ btr_discard_only_page_on_level(
Discards a page from a B-tree. This is used to remove the last record from
a B-tree page: the whole page must be removed at the same time. This cannot
be used for the root page, which is allowed to be empty. */
-void
+dberr_t
btr_discard_page(
/*=============*/
btr_cur_t* cursor, /*!< in: cursor on the page to discard: not on
@@ -4020,6 +4266,7 @@ btr_discard_page(
block = btr_cur_get_block(cursor);
index = btr_cur_get_index(cursor);
+ parent_cursor.page_cur = cursor->page_cur;
ut_ad(dict_index_get_page(index) != block->page.id().page_no());
@@ -4029,25 +4276,40 @@ btr_discard_page(
MONITOR_INC(MONITOR_INDEX_DISCARD);
- if (dict_index_is_spatial(index)) {
- rtr_page_get_father(index, block, mtr, cursor, &parent_cursor);
- } else {
- btr_page_get_father(index, block, mtr, &parent_cursor);
+ if (index->is_spatial()
+ ? !rtr_page_get_father(mtr, cursor, &parent_cursor)
+ : !btr_page_get_father(mtr, &parent_cursor)) {
+ return DB_CORRUPTION;
}
/* Decide the page which will inherit the locks */
- const uint32_t left_page_no = btr_page_get_prev(block->frame);
- const uint32_t right_page_no = btr_page_get_next(block->frame);
+ const uint32_t left_page_no = btr_page_get_prev(block->page.frame);
+ const uint32_t right_page_no = btr_page_get_next(block->page.frame);
+ page_id_t merge_page_id{block->page.id()};
ut_d(bool parent_is_different = false);
+ dberr_t err;
if (left_page_no != FIL_NULL) {
- merge_block = btr_block_get(*index, left_page_no, RW_X_LATCH,
- true, mtr);
-#ifdef UNIV_BTR_DEBUG
- ut_a(btr_page_get_next(merge_block->frame)
- == block->page.id().page_no());
-#endif /* UNIV_BTR_DEBUG */
+ merge_page_id.set_page_no(left_page_no);
+ merge_block = btr_block_reget(mtr, *index, merge_page_id,
+ RW_X_LATCH, &err);
+ if (UNIV_UNLIKELY(!merge_block)) {
+ return err;
+ }
+#if 1 /* MDEV-29835 FIXME: Acquire the page latch upfront. */
+ ut_ad(!memcmp_aligned<4>(merge_block->page.frame
+ + FIL_PAGE_NEXT,
+ block->page.frame + FIL_PAGE_OFFSET,
+ 4));
+#else
+ if (UNIV_UNLIKELY(memcmp_aligned<4>(merge_block->page.frame
+ + FIL_PAGE_NEXT,
+ block->page.frame
+ + FIL_PAGE_OFFSET, 4))) {
+ return DB_CORRUPTION;
+ }
+#endif
ut_d(parent_is_different =
(page_rec_get_next(
page_get_infimum_rec(
@@ -4055,54 +4317,73 @@ btr_discard_page(
&parent_cursor)))
== btr_cur_get_rec(&parent_cursor)));
} else if (right_page_no != FIL_NULL) {
- merge_block = btr_block_get(*index, right_page_no, RW_X_LATCH,
- true, mtr);
-#ifdef UNIV_BTR_DEBUG
- ut_a(btr_page_get_prev(merge_block->frame)
- == block->page.id().page_no());
-#endif /* UNIV_BTR_DEBUG */
+ merge_page_id.set_page_no(right_page_no);
+ merge_block = btr_block_reget(mtr, *index, merge_page_id,
+ RW_X_LATCH, &err);
+ if (UNIV_UNLIKELY(!merge_block)) {
+ return err;
+ }
+#if 1 /* MDEV-29835 FIXME: Acquire the page latch upfront. */
+ ut_ad(!memcmp_aligned<4>(merge_block->page.frame
+ + FIL_PAGE_PREV,
+ block->page.frame + FIL_PAGE_OFFSET,
+ 4));
+#else
+ if (UNIV_UNLIKELY(memcmp_aligned<4>(merge_block->page.frame
+ + FIL_PAGE_PREV,
+ block->page.frame
+ + FIL_PAGE_OFFSET, 4))) {
+ return DB_CORRUPTION;
+ }
+#endif
ut_d(parent_is_different = page_rec_is_supremum(
page_rec_get_next(btr_cur_get_rec(&parent_cursor))));
- if (!page_is_leaf(merge_block->frame)) {
- rec_t* node_ptr = page_rec_get_next(
- page_get_infimum_rec(merge_block->frame));
+ if (page_is_leaf(merge_block->page.frame)) {
+ } else if (rec_t* node_ptr =
+ page_rec_get_next(page_get_infimum_rec(
+ merge_block->page.frame))) {
ut_ad(page_rec_is_user_rec(node_ptr));
/* We have to mark the leftmost node pointer as the
predefined minimum record. */
btr_set_min_rec_mark<true>(node_ptr, *merge_block,
mtr);
+ } else {
+ return DB_CORRUPTION;
}
} else {
btr_discard_only_page_on_level(index, block, mtr);
+ return DB_SUCCESS;
+ }
- return;
+ if (UNIV_UNLIKELY(memcmp_aligned<2>(&merge_block->page.frame
+ [PAGE_HEADER + PAGE_LEVEL],
+ &block->page.frame
+ [PAGE_HEADER + PAGE_LEVEL], 2))) {
+ return DB_CORRUPTION;
}
- ut_a(page_is_comp(merge_block->frame) == page_is_comp(block->frame));
- ut_ad(!memcmp_aligned<2>(&merge_block->frame[PAGE_HEADER + PAGE_LEVEL],
- &block->frame[PAGE_HEADER + PAGE_LEVEL], 2));
btr_search_drop_page_hash_index(block, false);
if (dict_index_is_spatial(index)) {
rtr_node_ptr_delete(&parent_cursor, mtr);
- } else {
- btr_cur_node_ptr_delete(&parent_cursor, mtr);
+ } else if (dberr_t err =
+ btr_cur_node_ptr_delete(&parent_cursor, mtr)) {
+ return err;
}
/* Remove the page from the level list */
- ut_a(DB_SUCCESS == btr_level_list_remove(*block, *index, mtr));
+ if (dberr_t err = btr_level_list_remove(*block, *index, mtr)) {
+ return err;
+ }
#ifdef UNIV_ZIP_DEBUG
- {
- page_zip_des_t* merge_page_zip
- = buf_block_get_page_zip(merge_block);
- ut_a(!merge_page_zip
- || page_zip_validate(merge_page_zip, merge_block->frame,
- index));
- }
+ if (page_zip_des_t* merge_page_zip
+ = buf_block_get_page_zip(merge_block))
+ ut_a(page_zip_validate(merge_page_zip,
+ merge_block->page.frame, index));
#endif /* UNIV_ZIP_DEBUG */
- if (!dict_table_is_locking_disabled(index->table)) {
+ if (index->has_locking()) {
if (left_page_no != FIL_NULL) {
lock_update_discard(merge_block, PAGE_HEAP_NO_SUPREMUM,
block);
@@ -4111,27 +4392,32 @@ btr_discard_page(
lock_get_min_heap_no(merge_block),
block);
}
- }
- if (dict_index_is_spatial(index)) {
- rtr_check_discard_page(index, cursor, block);
+ if (index->is_spatial()) {
+ rtr_check_discard_page(index, cursor, block);
+ }
}
/* Free the file page */
- btr_page_free(index, block, mtr);
-
- /* btr_check_node_ptr() needs parent block latched.
- If the merge_block's parent block is not same,
- we cannot use btr_check_node_ptr() */
- ut_ad(parent_is_different
- || btr_check_node_ptr(index, merge_block, mtr));
-
- if (btr_cur_get_block(&parent_cursor)->page.id().page_no()
- == index->page
- && !page_has_siblings(btr_cur_get_page(&parent_cursor))
- && page_get_n_recs(btr_cur_get_page(&parent_cursor)) == 1) {
- btr_lift_page_up(index, merge_block, mtr);
+ err = btr_page_free(index, block, mtr);
+
+ if (err == DB_SUCCESS) {
+ /* btr_check_node_ptr() needs parent block latched.
+ If the merge_block's parent block is not same,
+ we cannot use btr_check_node_ptr() */
+ ut_ad(parent_is_different
+ || btr_check_node_ptr(index, merge_block, mtr));
+
+ if (btr_cur_get_block(&parent_cursor)->page.id().page_no()
+ == index->page
+ && !page_has_siblings(btr_cur_get_page(&parent_cursor))
+ && page_get_n_recs(btr_cur_get_page(&parent_cursor))
+ == 1) {
+ btr_lift_page_up(index, merge_block, mtr, &err);
+ }
}
+
+ return err;
}
#ifdef UNIV_BTR_PRINT
@@ -4222,12 +4508,12 @@ btr_print_recursive(
*offsets = rec_get_offsets(
node_ptr, index, *offsets, 0,
ULINT_UNDEFINED, heap);
- btr_print_recursive(index,
- btr_node_ptr_get_child(node_ptr,
- index,
- *offsets,
- &mtr2),
- width, heap, offsets, &mtr2);
+ if (buf_block_t *child =
+ btr_node_ptr_get_child(node_ptr, index, *offsets,
+ &mtr2)) {
+ btr_print_recursive(index, child, width, heap,
+ offsets, &mtr2);
+ }
mtr_commit(&mtr2);
}
@@ -4294,16 +4580,20 @@ btr_check_node_ptr(
return(TRUE);
}
+ cursor.page_cur.index = index;
+ cursor.page_cur.block = block;
+
heap = mem_heap_create(256);
if (dict_index_is_spatial(index)) {
- offsets = rtr_page_get_father_block(NULL, heap, index, block, mtr,
+ offsets = rtr_page_get_father_block(NULL, heap, mtr,
NULL, &cursor);
} else {
- offsets = btr_page_get_father_block(NULL, heap, index, block, mtr,
- &cursor);
+ offsets = btr_page_get_father_block(NULL, heap, mtr, &cursor);
}
+ ut_ad(offsets);
+
if (page_is_leaf(page)) {
goto func_exit;
@@ -4519,16 +4809,15 @@ next_field:
/************************************************************//**
Checks the size and number of fields in records based on the definition of
the index.
-@return TRUE if ok */
+@return true if ok */
static
-ibool
+bool
btr_index_page_validate(
/*====================*/
buf_block_t* block, /*!< in: index page */
dict_index_t* index) /*!< in: index */
{
page_cur_t cur;
- ibool ret = TRUE;
#ifndef DBUG_OFF
ulint nth = 1;
#endif /* !DBUG_OFF */
@@ -4545,17 +4834,13 @@ btr_index_page_validate(
page_cur_get_page(&cur), 0))
== 1););
- page_cur_move_to_next(&cur);
-
- for (;;) {
+ while (page_cur_move_to_next(&cur)) {
if (page_cur_is_after_last(&cur)) {
-
- break;
+ return true;
}
if (!btr_index_rec_validate(cur.rec, index, TRUE)) {
-
- return(FALSE);
+ break;
}
/* Verify that page_rec_get_nth_const() is correctly
@@ -4567,11 +4852,9 @@ btr_index_page_validate(
cur.rec)));
ut_a(nth++ == page_rec_get_n_recs_before(
cur.rec)););
-
- page_cur_move_to_next(&cur);
}
- return(ret);
+ return false;
}
/************************************************************//**
@@ -4614,17 +4897,14 @@ btr_validate_report2(
error << ", index tree level " << level;
}
-/************************************************************//**
-Validates index tree level.
-@return TRUE if ok */
+/** Validate an index tree level. */
static
-bool
+dberr_t
btr_validate_level(
/*===============*/
dict_index_t* index, /*!< in: index tree */
const trx_t* trx, /*!< in: transaction or NULL */
- ulint level, /*!< in: level number */
- bool lockout)/*!< in: true if X-latch index is intended */
+ ulint level) /*!< in: level number */
{
buf_block_t* block;
page_t* page;
@@ -4636,7 +4916,6 @@ btr_validate_level(
rec_t* rec;
page_cur_t cursor;
dtuple_t* node_ptr_tuple;
- bool ret = true;
mtr_t mtr;
mem_heap_t* heap = mem_heap_create(256);
rec_offs* offsets = NULL;
@@ -4644,56 +4923,61 @@ btr_validate_level(
#ifdef UNIV_ZIP_DEBUG
page_zip_des_t* page_zip;
#endif /* UNIV_ZIP_DEBUG */
- ulint savepoint = 0;
- ulint savepoint2 = 0;
- uint32_t parent_page_no = FIL_NULL;
- uint32_t parent_right_page_no = FIL_NULL;
- bool rightmost_child = false;
mtr.start();
- if (!srv_read_only_mode) {
- if (lockout) {
- mtr_x_lock_index(index, &mtr);
- } else {
- mtr_sx_lock_index(index, &mtr);
- }
- }
+ mtr_x_lock_index(index, &mtr);
- block = btr_root_block_get(index, RW_SX_LATCH, &mtr);
+ dberr_t err;
+ block = btr_root_block_get(index, RW_SX_LATCH, &mtr, &err);
+ if (!block) {
+ mtr.commit();
+ return err;
+ }
page = buf_block_get_frame(block);
fil_space_t* space = index->table->space;
while (level != btr_page_get_level(page)) {
const rec_t* node_ptr;
-
- if (fseg_page_is_free(space, block->page.id().page_no())) {
-
+ switch (dberr_t e =
+ fseg_page_is_allocated(space,
+ block->page.id().page_no())) {
+ case DB_SUCCESS_LOCKED_REC:
+ break;
+ case DB_SUCCESS:
btr_validate_report1(index, level, block);
-
ib::warn() << "Page is free";
-
- ret = false;
+ e = DB_CORRUPTION;
+ /* fall through */
+ default:
+ err = e;
}
-
- ut_a(index->table->space_id == block->page.id().space());
- ut_a(block->page.id().space() == page_get_space_id(page));
+ ut_ad(index->table->space_id == block->page.id().space());
+ ut_ad(block->page.id().space() == page_get_space_id(page));
#ifdef UNIV_ZIP_DEBUG
page_zip = buf_block_get_page_zip(block);
ut_a(!page_zip || page_zip_validate(page_zip, page, index));
#endif /* UNIV_ZIP_DEBUG */
- ut_a(!page_is_leaf(page));
+ if (page_is_leaf(page)) {
+corrupted:
+ err = DB_CORRUPTION;
+ goto invalid_page;
+ }
page_cur_set_before_first(block, &cursor);
- page_cur_move_to_next(&cursor);
+ if (!(node_ptr = page_cur_move_to_next(&cursor))) {
+ goto corrupted;
+ }
- node_ptr = page_cur_get_rec(&cursor);
offsets = rec_get_offsets(node_ptr, index, offsets, 0,
ULINT_UNDEFINED, &heap);
- savepoint2 = mtr_set_savepoint(&mtr);
- block = btr_node_ptr_get_child(node_ptr, index, offsets, &mtr);
+ block = btr_node_ptr_get_child(node_ptr, index, offsets, &mtr,
+ &err);
+ if (!block) {
+ break;
+ }
page = buf_block_get_frame(block);
/* For R-Tree, since record order might not be the same as
@@ -4709,13 +4993,14 @@ btr_validate_level(
/* To obey latch order of tree blocks,
we should release the right_block once to
obtain lock of the uncle block. */
- mtr_release_block_at_savepoint(
- &mtr, savepoint2, block);
+ mtr.release_last_page();
- savepoint2 = mtr_set_savepoint(&mtr);
block = btr_block_get(*index, left_page_no,
RW_SX_LATCH, false,
- &mtr);
+ &mtr, &err);
+ if (!block) {
+ goto invalid_page;
+ }
page = buf_block_get_frame(block);
left_page_no = btr_page_get_prev(page);
}
@@ -4726,87 +5011,89 @@ btr_validate_level(
level. */
loop:
+ if (!block) {
+invalid_page:
+ mtr.commit();
+func_exit:
+ mem_heap_free(heap);
+ return err;
+ }
+
mem_heap_empty(heap);
offsets = offsets2 = NULL;
- if (!srv_read_only_mode) {
- if (lockout) {
- mtr_x_lock_index(index, &mtr);
- } else {
- mtr_sx_lock_index(index, &mtr);
- }
- }
+
+ mtr_x_lock_index(index, &mtr);
+
+ page = block->page.frame;
#ifdef UNIV_ZIP_DEBUG
page_zip = buf_block_get_page_zip(block);
ut_a(!page_zip || page_zip_validate(page_zip, page, index));
#endif /* UNIV_ZIP_DEBUG */
- ut_a(block->page.id().space() == index->table->space_id);
-
- if (fseg_page_is_free(space, block->page.id().page_no())) {
-
+ if (DB_SUCCESS_LOCKED_REC
+ != fseg_page_is_allocated(space, block->page.id().page_no())) {
btr_validate_report1(index, level, block);
ib::warn() << "Page is marked as free";
- ret = false;
-
+ err = DB_CORRUPTION;
} else if (btr_page_get_index_id(page) != index->id) {
-
ib::error() << "Page index id " << btr_page_get_index_id(page)
<< " != data dictionary index id " << index->id;
-
- ret = false;
-
+ err = DB_CORRUPTION;
} else if (!page_validate(page, index)) {
-
btr_validate_report1(index, level, block);
- ret = false;
-
+ err = DB_CORRUPTION;
+ } else if (btr_page_get_level(page) != level) {
+ btr_validate_report1(index, level, block);
+ ib::error() << "Page level is not " << level;
+ err = DB_CORRUPTION;
} else if (level == 0 && !btr_index_page_validate(block, index)) {
-
/* We are on level 0. Check that the records have the right
number of fields, and field lengths are right. */
-
- ret = false;
+ err = DB_CORRUPTION;
+ } else if (!page_is_empty(page)) {
+ } else if (level) {
+ btr_validate_report1(index, level, block);
+ ib::error() << "Non-leaf page is empty";
+ } else if (block->page.id().page_no() != index->page) {
+ btr_validate_report1(index, level, block);
+ ib::error() << "Empty leaf page is not index root";
}
- ut_a(btr_page_get_level(page) == level);
-
uint32_t right_page_no = btr_page_get_next(page);
uint32_t left_page_no = btr_page_get_prev(page);
- ut_a(!page_is_empty(page)
- || (level == 0
- && page_get_page_no(page) == dict_index_get_page(index)));
-
if (right_page_no != FIL_NULL) {
const rec_t* right_rec;
- savepoint = mtr_set_savepoint(&mtr);
right_block = btr_block_get(*index, right_page_no, RW_SX_LATCH,
- !level, &mtr);
+ !level, &mtr, &err);
+ if (!right_block) {
+ btr_validate_report1(index, level, block);
+ fputs("InnoDB: broken FIL_PAGE_NEXT link\n", stderr);
+ goto invalid_page;
+ }
right_page = buf_block_get_frame(right_block);
if (btr_page_get_prev(right_page) != page_get_page_no(page)) {
btr_validate_report2(index, level, block, right_block);
fputs("InnoDB: broken FIL_PAGE_NEXT"
" or FIL_PAGE_PREV links\n", stderr);
-
- ret = false;
+ err = DB_CORRUPTION;
}
- if (page_is_comp(right_page) != page_is_comp(page)) {
- btr_validate_report2(index, level, block, right_block);
- fputs("InnoDB: 'compact' flag mismatch\n", stderr);
-
- ret = false;
-
- goto node_ptr_fails;
+ if (!(rec = page_rec_get_prev(page_get_supremum_rec(page)))) {
+broken_links:
+ btr_validate_report1(index, level, block);
+ fputs("InnoDB: broken record links\n", stderr);
+ goto invalid_page;
+ }
+ if (!(right_rec =
+ page_rec_get_next(page_get_infimum_rec(right_page)))) {
+ goto broken_links;
}
- rec = page_rec_get_prev(page_get_supremum_rec(page));
- right_rec = page_rec_get_next(page_get_infimum_rec(
- right_page));
offsets = rec_get_offsets(rec, index, offsets,
page_is_leaf(page)
? index->n_core_fields : 0,
@@ -4819,7 +5106,7 @@ loop:
/* For spatial index, we cannot guarantee the key ordering
across pages, so skip the record compare verification for
now. Will enhanced in special R-Tree index validation scheme */
- if (!dict_index_is_spatial(index)
+ if (!index->is_spatial()
&& cmp_rec_rec(rec, right_rec,
offsets, offsets2, index) >= 0) {
@@ -4828,24 +5115,35 @@ loop:
fputs("InnoDB: records in wrong order"
" on adjacent pages\n", stderr);
- fputs("InnoDB: record ", stderr);
rec = page_rec_get_prev(page_get_supremum_rec(page));
- rec_print(stderr, rec, index);
- putc('\n', stderr);
+ if (rec) {
+ fputs("InnoDB: record ", stderr);
+ rec_print(stderr, rec, index);
+ putc('\n', stderr);
+ }
fputs("InnoDB: record ", stderr);
rec = page_rec_get_next(
page_get_infimum_rec(right_page));
- rec_print(stderr, rec, index);
+ if (rec) {
+ rec_print(stderr, rec, index);
+ }
putc('\n', stderr);
-
- ret = false;
+ err = DB_CORRUPTION;
}
}
- if (level > 0 && left_page_no == FIL_NULL) {
- ut_a(REC_INFO_MIN_REC_FLAG & rec_get_info_bits(
- page_rec_get_next(page_get_infimum_rec(page)),
- page_is_comp(page)));
+ if (!level || left_page_no != FIL_NULL) {
+ } else if (const rec_t* first =
+ page_rec_get_next_const(page_get_infimum_rec(page))) {
+ if (!(REC_INFO_MIN_REC_FLAG
+ & rec_get_info_bits(first, page_is_comp(page)))) {
+ btr_validate_report1(index, level, block);
+ ib::error() << "Missing REC_INFO_MIN_REC_FLAG";
+ err = DB_CORRUPTION;
+ }
+ } else {
+ err = DB_CORRUPTION;
+ goto node_ptr_fails;
}
/* Similarly skip the father node check for spatial index for now,
@@ -4854,35 +5152,34 @@ loop:
in parent level and linked pages in the child level.
2) Search parent from root is very costly for R-tree.
We will add special validation mechanism for R-tree later (WL #7520) */
- if (!dict_index_is_spatial(index)
- && block->page.id().page_no() != dict_index_get_page(index)) {
-
+ if (!index->is_spatial()
+ && block->page.id().page_no() != index->page) {
/* Check father node pointers */
- rec_t* node_ptr;
+ rec_t* node_ptr
+ = page_rec_get_next(page_get_infimum_rec(page));
+ if (!node_ptr) {
+ err = DB_CORRUPTION;
+ goto node_ptr_fails;
+ }
- btr_cur_position(
- index, page_rec_get_next(page_get_infimum_rec(page)),
- block, &node_cur);
+ btr_cur_position(index, node_ptr, block, &node_cur);
offsets = btr_page_get_father_node_ptr_for_validate(
offsets, heap, &node_cur, &mtr);
father_page = btr_cur_get_page(&node_cur);
node_ptr = btr_cur_get_rec(&node_cur);
- parent_page_no = page_get_page_no(father_page);
- parent_right_page_no = btr_page_get_next(father_page);
- rightmost_child = page_rec_is_supremum(
- page_rec_get_next(node_ptr));
-
- btr_cur_position(
- index,
- page_rec_get_prev(page_get_supremum_rec(page)),
- block, &node_cur);
+ rec = page_rec_get_prev(page_get_supremum_rec(page));
+ if (rec) {
+ btr_cur_position(index, rec, block, &node_cur);
- offsets = btr_page_get_father_node_ptr_for_validate(
+ offsets = btr_page_get_father_node_ptr_for_validate(
offsets, heap, &node_cur, &mtr);
+ } else {
+ offsets = nullptr;
+ }
- if (node_ptr != btr_cur_get_rec(&node_cur)
+ if (!offsets || node_ptr != btr_cur_get_rec(&node_cur)
|| btr_node_ptr_get_child_page_no(node_ptr, offsets)
!= block->page.id().page_no()) {
@@ -4894,30 +5191,30 @@ loop:
fputs("InnoDB: node ptr ", stderr);
rec_print(stderr, node_ptr, index);
- rec = btr_cur_get_rec(&node_cur);
- fprintf(stderr, "\n"
- "InnoDB: node ptr child page n:o %u\n",
- btr_node_ptr_get_child_page_no(rec, offsets));
-
- fputs("InnoDB: record on page ", stderr);
- rec_print_new(stderr, rec, offsets);
- putc('\n', stderr);
- ret = false;
+ if (offsets) {
+ rec = btr_cur_get_rec(&node_cur);
+ fprintf(stderr, "\n"
+ "InnoDB: node ptr child page n:o %u\n",
+ btr_node_ptr_get_child_page_no(
+ rec, offsets));
+ fputs("InnoDB: record on page ", stderr);
+ rec_print_new(stderr, rec, offsets);
+ putc('\n', stderr);
+ }
+ err = DB_CORRUPTION;
goto node_ptr_fails;
}
- if (!page_is_leaf(page)) {
+ if (page_is_leaf(page)) {
+ } else if (const rec_t* first_rec =
+ page_rec_get_next(page_get_infimum_rec(page))) {
node_ptr_tuple = dict_index_build_node_ptr(
- index,
- page_rec_get_next(page_get_infimum_rec(page)),
+ index, first_rec,
0, heap, btr_page_get_level(page));
if (cmp_dtuple_rec(node_ptr_tuple, node_ptr,
offsets)) {
- const rec_t* first_rec = page_rec_get_next(
- page_get_infimum_rec(page));
-
btr_validate_report1(index, level, block);
ib::error() << "Node ptrs differ on levels > 0";
@@ -4927,54 +5224,39 @@ loop:
fputs("InnoDB: first rec ", stderr);
rec_print(stderr, first_rec, index);
putc('\n', stderr);
- ret = false;
-
+ err = DB_CORRUPTION;
goto node_ptr_fails;
}
+ } else {
+ err = DB_CORRUPTION;
+ goto node_ptr_fails;
}
if (left_page_no == FIL_NULL) {
- ut_a(node_ptr == page_rec_get_next(
- page_get_infimum_rec(father_page)));
- ut_a(!page_has_prev(father_page));
+ if (page_has_prev(father_page)
+ || node_ptr != page_rec_get_next(
+ page_get_infimum_rec(father_page))) {
+ err = DB_CORRUPTION;
+ goto node_ptr_fails;
+ }
}
if (right_page_no == FIL_NULL) {
- ut_a(node_ptr == page_rec_get_prev(
- page_get_supremum_rec(father_page)));
- ut_a(!page_has_next(father_page));
- } else {
- const rec_t* right_node_ptr;
-
- right_node_ptr = page_rec_get_next(node_ptr);
-
- if (!lockout && rightmost_child) {
-
- /* To obey latch order of tree blocks,
- we should release the right_block once to
- obtain lock of the uncle block. */
- mtr_release_block_at_savepoint(
- &mtr, savepoint, right_block);
-
- if (parent_right_page_no != FIL_NULL) {
- btr_block_get(*index,
- parent_right_page_no,
- RW_SX_LATCH, false,
- &mtr);
- }
-
- right_block = btr_block_get(*index,
- right_page_no,
- RW_SX_LATCH,
- !level, &mtr);
+ if (page_has_next(father_page)
+ || node_ptr != page_rec_get_prev(
+ page_get_supremum_rec(father_page))) {
+ err = DB_CORRUPTION;
+ goto node_ptr_fails;
}
-
+ } else if (const rec_t* right_node_ptr
+ = page_rec_get_next(node_ptr)) {
btr_cur_position(
- index, page_rec_get_next(
- page_get_infimum_rec(
- buf_block_get_frame(
- right_block))),
+ index,
+ page_get_infimum_rec(right_block->page.frame),
right_block, &right_node_cur);
+ if (!page_cur_move_to_next(&right_node_cur.page_cur)) {
+ goto node_pointer_corrupted;
+ }
offsets = btr_page_get_father_node_ptr_for_validate(
offsets, heap, &right_node_cur, &mtr);
@@ -4984,7 +5266,8 @@ loop:
if (btr_cur_get_rec(&right_node_cur)
!= right_node_ptr) {
- ret = false;
+node_pointer_corrupted:
+ err = DB_CORRUPTION;
fputs("InnoDB: node pointer to"
" the right page is wrong\n",
stderr);
@@ -5000,7 +5283,7 @@ loop:
!= page_rec_get_next(
page_get_infimum_rec(
right_father_page))) {
- ret = false;
+ err = DB_CORRUPTION;
fputs("InnoDB: node pointer 2 to"
" the right page is wrong\n",
stderr);
@@ -5012,7 +5295,7 @@ loop:
if (page_get_page_no(right_father_page)
!= btr_page_get_next(father_page)) {
- ret = false;
+ err = DB_CORRUPTION;
fputs("InnoDB: node pointer 3 to"
" the right page is wrong\n",
stderr);
@@ -5021,6 +5304,8 @@ loop:
block);
}
}
+ } else {
+ err = DB_CORRUPTION;
}
}
@@ -5036,30 +5321,12 @@ node_ptr_fails:
mtr.start();
- if (!lockout) {
- if (rightmost_child) {
- if (parent_right_page_no != FIL_NULL) {
- btr_block_get(*index,
- parent_right_page_no,
- RW_SX_LATCH, false,
- &mtr);
- }
- } else if (parent_page_no != FIL_NULL) {
- btr_block_get(*index, parent_page_no,
- RW_SX_LATCH, false, &mtr);
- }
- }
-
block = btr_block_get(*index, right_page_no, RW_SX_LATCH,
- !level, &mtr);
- page = buf_block_get_frame(block);
-
+ !level, &mtr, &err);
goto loop;
}
- mem_heap_free(heap);
-
- return(ret);
+ goto func_exit;
}
/**************************************************************//**
@@ -5071,55 +5338,23 @@ btr_validate_index(
dict_index_t* index, /*!< in: index */
const trx_t* trx) /*!< in: transaction or NULL */
{
- dberr_t err = DB_SUCCESS;
- bool lockout = dict_index_is_spatial(index);
-
- /* Full Text index are implemented by auxiliary tables,
- not the B-tree */
- if (dict_index_is_online_ddl(index) || (index->type & DICT_FTS)) {
- return(err);
- }
-
- mtr_t mtr;
-
- mtr_start(&mtr);
+ mtr_t mtr;
+ mtr.start();
- if (!srv_read_only_mode) {
- if (lockout) {
- mtr_x_lock_index(index, &mtr);
- } else {
- mtr_sx_lock_index(index, &mtr);
- }
- }
-
- page_t* root = btr_root_get(index, &mtr);
-
- if (!root) {
- mtr_commit(&mtr);
- return DB_CORRUPTION;
- }
-
- ulint n = btr_page_get_level(root);
-
- btr_validate_index_running++;
- for (ulint i = 0; i <= n; ++i) {
-
- if (!btr_validate_level(index, trx, n - i, lockout)) {
- err = DB_CORRUPTION;
- }
- }
-
- mtr_commit(&mtr);
- /* In theory we need release barrier here, so that
- btr_validate_index_running decrement is guaranteed to
- happen after latches are released.
+ mtr_x_lock_index(index, &mtr);
- Original code issued SEQ_CST on update and non-atomic
- access on load. Which means it had broken synchronisation
- as well. */
- btr_validate_index_running--;
+ dberr_t err;
+ if (page_t *root= btr_root_get(index, &mtr, &err))
+ for (auto level= btr_page_get_level(root);; level--)
+ {
+ if (dberr_t err_level= btr_validate_level(index, trx, level))
+ err= err_level;
+ if (!level)
+ break;
+ }
- return(err);
+ mtr.commit();
+ return err;
}
/**************************************************************//**
@@ -5146,6 +5381,7 @@ btr_can_merge_with_page(
DBUG_ENTER("btr_can_merge_with_page");
if (page_no == FIL_NULL) {
+error:
*merge_block = NULL;
DBUG_RETURN(false);
}
@@ -5155,6 +5391,9 @@ btr_can_merge_with_page(
mblock = btr_block_get(*index, page_no, RW_X_LATCH, page_is_leaf(page),
mtr);
+ if (!mblock) {
+ goto error;
+ }
mpage = buf_block_get_frame(mblock);
n_recs = page_get_n_recs(page);
@@ -5181,8 +5420,8 @@ btr_can_merge_with_page(
if (data_size > max_ins_size) {
/* We have to reorganize mpage */
- if (!btr_page_reorganize_block(page_zip_level, mblock, index,
- mtr)) {
+ if (btr_page_reorganize_block(page_zip_level, mblock, index,
+ mtr) != DB_SUCCESS) {
goto error;
}
@@ -5202,8 +5441,4 @@ btr_can_merge_with_page(
*merge_block = mblock;
DBUG_RETURN(true);
-
-error:
- *merge_block = NULL;
- DBUG_RETURN(false);
}
diff --git a/storage/innobase/btr/btr0bulk.cc b/storage/innobase/btr/btr0bulk.cc
index 0dca1b2c02f..013cd13102c 100644
--- a/storage/innobase/btr/btr0bulk.cc
+++ b/storage/innobase/btr/btr0bulk.cc
@@ -61,17 +61,22 @@ PageBulk::init()
m_index->set_modified(alloc_mtr);
uint32_t n_reserved;
- if (!fsp_reserve_free_extents(&n_reserved,
- m_index->table->space,
- 1, FSP_NORMAL, &alloc_mtr)) {
+ dberr_t err = fsp_reserve_free_extents(
+ &n_reserved, m_index->table->space, 1, FSP_NORMAL,
+ &alloc_mtr);
+ if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
+oom:
alloc_mtr.commit();
m_mtr.commit();
- return(DB_OUT_OF_FILE_SPACE);
+ return err;
}
/* Allocate a new page. */
new_block = btr_page_alloc(m_index, 0, FSP_UP, m_level,
- &alloc_mtr, &m_mtr);
+ &alloc_mtr, &m_mtr, &err);
+ if (!new_block) {
+ goto oom;
+ }
m_index->table->space->release_free_extents(n_reserved);
@@ -103,9 +108,12 @@ PageBulk::init()
} else {
new_block = btr_block_get(*m_index, m_page_no, RW_X_LATCH,
false, &m_mtr);
+ if (!new_block) {
+ m_mtr.commit();
+ return(DB_CORRUPTION);
+ }
new_page = buf_block_get_frame(new_block);
- ut_ad(new_block->page.id().page_no() == m_page_no);
ut_ad(page_dir_get_n_heap(new_page) == PAGE_HEAP_NO_USER_LOW);
@@ -491,8 +499,8 @@ inline void PageBulk::finishPage()
inline bool PageBulk::needs_finish() const
{
- ut_ad(page_align(m_cur_rec) == m_block->frame);
- ut_ad(m_page == m_block->frame);
+ ut_ad(page_align(m_cur_rec) == m_block->page.frame);
+ ut_ad(m_page == m_block->page.frame);
if (!m_page[PAGE_HEADER + PAGE_DIRECTION_B])
return true;
ulint heap_no, n_heap= page_header_get_field(m_page, PAGE_N_HEAP);
@@ -627,7 +635,7 @@ PageBulk::getSplitRec()
< total_used_size / 2);
/* Keep at least one record on left page */
- if (page_rec_is_infimum(page_rec_get_prev(rec))) {
+ if (page_rec_is_first(rec, m_page)) {
rec = page_rec_get_next(rec);
ut_ad(page_rec_is_user_rec(rec));
}
@@ -669,35 +677,40 @@ void
PageBulk::copyOut(
rec_t* split_rec)
{
- rec_t* rec;
- rec_t* last_rec;
- ulint n;
-
/* Suppose before copyOut, we have 5 records on the page:
infimum->r1->r2->r3->r4->r5->supremum, and r3 is the split rec.
after copyOut, we have 2 records on the page:
infimum->r1->r2->supremum. slot ajustment is not done. */
- rec = page_rec_get_next(page_get_infimum_rec(m_page));
- last_rec = page_rec_get_prev(page_get_supremum_rec(m_page));
- n = 0;
+ rec_t *rec = page_get_infimum_rec(m_page);
+ ulint n;
- while (rec != split_rec) {
- rec = page_rec_get_next(rec);
- n++;
+ for (n = 0;; n++) {
+ rec_t *next = page_rec_get_next(rec);
+ if (next == split_rec) {
+ break;
+ }
+ rec = next;
}
ut_ad(n > 0);
+ const rec_t *last_rec = split_rec;
+ for (;;) {
+ const rec_t *next = page_rec_get_next_const(last_rec);
+ if (page_rec_is_supremum(next)) {
+ break;
+ }
+ last_rec = next;
+ }
+
/* Set last record's next in page */
- rec_offs* offsets = NULL;
- rec = page_rec_get_prev(split_rec);
const ulint n_core = page_rec_is_leaf(split_rec)
? m_index->n_core_fields : 0;
- offsets = rec_get_offsets(rec, m_index, offsets, n_core,
- ULINT_UNDEFINED, &m_heap);
+ rec_offs* offsets = rec_get_offsets(rec, m_index, nullptr, n_core,
+ ULINT_UNDEFINED, &m_heap);
mach_write_to_2(rec - REC_NEXT, m_is_comp
? static_cast<uint16_t>
(PAGE_NEW_SUPREMUM - page_offset(rec))
@@ -814,7 +827,6 @@ PageBulk::storeExt(
btr_pcur_t btr_pcur;
btr_pcur.pos_state = BTR_PCUR_IS_POSITIONED;
btr_pcur.latch_mode = BTR_MODIFY_LEAF;
- btr_pcur.btr_cur.index = m_index;
btr_pcur.btr_cur.page_cur.index = m_index;
btr_pcur.btr_cur.page_cur.rec = m_cur_rec;
btr_pcur.btr_cur.page_cur.offsets = offsets;
@@ -834,7 +846,7 @@ PageBulk::release()
finish();
/* We fix the block because we will re-pin it soon. */
- buf_block_buf_fix_inc(m_block, __FILE__, __LINE__);
+ m_block->page.fix();
/* No other threads can modify this block. */
m_modify_clock = buf_block_get_modify_clock(m_block);
@@ -843,37 +855,19 @@ PageBulk::release()
}
/** Start mtr and latch the block */
-dberr_t
-PageBulk::latch()
+void PageBulk::latch()
{
- m_mtr.start();
- m_index->set_modified(m_mtr);
-
- ut_ad(m_block->page.buf_fix_count());
-
- /* In case the block is S-latched by page_cleaner. */
- if (!buf_page_optimistic_get(RW_X_LATCH, m_block, m_modify_clock,
- __FILE__, __LINE__, &m_mtr)) {
- m_block = buf_page_get_gen(page_id_t(m_index->table->space_id,
- m_page_no),
- 0, RW_X_LATCH,
- m_block, BUF_GET_IF_IN_POOL,
- __FILE__, __LINE__, &m_mtr, &m_err);
-
- if (m_err != DB_SUCCESS) {
- return (m_err);
- }
-
- ut_ad(m_block != NULL);
- }
-
- buf_block_buf_fix_dec(m_block);
-
- ut_ad(m_block->page.buf_fix_count());
-
- ut_ad(m_cur_rec > m_page && m_cur_rec < m_heap_top);
-
- return (m_err);
+ m_mtr.start();
+ m_index->set_modified(m_mtr);
+#ifdef BTR_CUR_HASH_ADAPT
+ ut_ad(!m_block->index);
+#endif
+ m_block->page.lock.x_lock();
+ ut_ad(m_block->page.buf_fix_count());
+ m_mtr.memo_push(m_block, MTR_MEMO_PAGE_X_FIX);
+
+ ut_ad(m_cur_rec > m_page);
+ ut_ad(m_cur_rec < m_heap_top);
}
/** Split a page
@@ -949,9 +943,7 @@ BtrBulk::pageCommit(
page_bulk->set_modified();
}
- ut_ad(!rw_lock_own_flagged(&m_index->lock,
- RW_LOCK_FLAG_X | RW_LOCK_FLAG_SX
- | RW_LOCK_FLAG_S));
+ ut_ad(!m_index->lock.have_any());
/* Compress page if it's a compressed table. */
if (page_bulk->getPageZip() != NULL && !page_bulk->compress()) {
@@ -1203,30 +1195,38 @@ BtrBulk::finish(dberr_t err)
ut_ad(last_page_no != FIL_NULL);
last_block = btr_block_get(*m_index, last_page_no, RW_X_LATCH,
false, &mtr);
+ if (!last_block) {
+ err = DB_CORRUPTION;
+err_exit:
+ mtr.commit();
+ return err;
+ }
+
first_rec = page_rec_get_next(
- page_get_infimum_rec(last_block->frame));
+ page_get_infimum_rec(last_block->page.frame));
+ /* Because this index tree is being created by this thread,
+ we assume that it cannot be corrupted. */
+ ut_ad(first_rec);
ut_ad(page_rec_is_user_rec(first_rec));
/* Copy last page to root page. */
err = root_page_bulk.init();
if (err != DB_SUCCESS) {
- mtr.commit();
- return(err);
+ goto err_exit;
}
root_page_bulk.copyIn(first_rec);
root_page_bulk.finish();
/* Remove last page. */
- btr_page_free(m_index, last_block, &mtr);
-
+ err = btr_page_free(m_index, last_block, &mtr);
mtr.commit();
- err = pageCommit(&root_page_bulk, NULL, false);
+ if (dberr_t e = pageCommit(&root_page_bulk, NULL, false)) {
+ err = e;
+ }
ut_ad(err == DB_SUCCESS);
}
- ut_ad(!sync_check_iterate(dict_sync_check()));
-
ut_ad(err != DB_SUCCESS
|| btr_validate_index(m_index, NULL) == DB_SUCCESS);
return(err);
diff --git a/storage/innobase/btr/btr0cur.cc b/storage/innobase/btr/btr0cur.cc
index a14874ec011..70b0ae4c32c 100644
--- a/storage/innobase/btr/btr0cur.cc
+++ b/storage/innobase/btr/btr0cur.cc
@@ -3,7 +3,7 @@
Copyright (c) 1994, 2019, Oracle and/or its affiliates. All Rights Reserved.
Copyright (c) 2008, Google Inc.
Copyright (c) 2012, Facebook Inc.
-Copyright (c) 2015, 2022, MariaDB Corporation.
+Copyright (c) 2015, 2023, MariaDB Corporation.
Portions of this file contain modifications contributed and copyrighted by
Google, Inc. Those modifications are gratefully acknowledged and are described
@@ -67,9 +67,11 @@ Created 10/16/1994 Heikki Tuuri
#include "srv0start.h"
#include "mysql_com.h"
#include "dict0stats.h"
+#include "row0ins.h"
#ifdef WITH_WSREP
#include "mysql/service_wsrep.h"
#endif /* WITH_WSREP */
+#include "log.h"
/** Buffered B-tree operation types, introduced as part of delete buffering. */
enum btr_op_t {
@@ -100,16 +102,16 @@ operations by purge as the previous, when it seems to be growing huge.
throughput clearly from about 100000. */
#define BTR_CUR_FINE_HISTORY_LENGTH 100000
-/** Number of searches down the B-tree in btr_cur_search_to_nth_level(). */
-Atomic_counter<ulint> btr_cur_n_non_sea;
+#ifdef BTR_CUR_HASH_ADAPT
+/** Number of searches down the B-tree in btr_cur_t::search_leaf(). */
+ib_counter_t<ulint, ib_counter_element_t> btr_cur_n_non_sea;
/** Old value of btr_cur_n_non_sea. Copied by
srv_refresh_innodb_monitor_stats(). Referenced by
srv_printf_innodb_monitor(). */
ulint btr_cur_n_non_sea_old;
-#ifdef BTR_CUR_HASH_ADAPT
/** Number of successful adaptive hash index lookups in
-btr_cur_search_to_nth_level(). */
-ulint btr_cur_n_sea;
+btr_cur_t::search_leaf(). */
+ib_counter_t<ulint, ib_counter_element_t> btr_cur_n_sea;
/** Old value of btr_cur_n_sea. Copied by
srv_refresh_innodb_monitor_stats(). Referenced by
srv_printf_innodb_monitor(). */
@@ -136,17 +138,6 @@ can be released by page reorganize, then it is reorganized */
#define BTR_BLOB_HDR_SIZE 8 /*!< Size of a BLOB
part header, in bytes */
-/** Estimated table level stats from sampled value.
-@param value sampled stats
-@param index index being sampled
-@param sample number of sampled rows
-@param ext_size external stored data size
-@param not_empty table not empty
-@return estimated table wide stats from sampled value */
-#define BTR_TABLE_STATS_FROM_SAMPLE(value, index, sample, ext_size, not_empty) \
- (((value) * static_cast<ib_uint64_t>(index->stat_n_leaf_pages) \
- + (sample) - 1 + (ext_size) + (not_empty)) / ((sample) + (ext_size)))
-
/* @} */
/*******************************************************************//**
@@ -162,17 +153,6 @@ btr_cur_unmark_extern_fields(
dict_index_t* index, /*!< in: index of the page */
const rec_offs* offsets,/*!< in: array returned by rec_get_offsets() */
mtr_t* mtr); /*!< in: mtr, or NULL if not logged */
-/*******************************************************************//**
-Adds path information to the cursor for the current page, for which
-the binary search has been performed. */
-static
-void
-btr_cur_add_path_info(
-/*==================*/
- btr_cur_t* cursor, /*!< in: cursor positioned on a page */
- ulint height, /*!< in: height of the page in tree;
- 0 means leaf node */
- ulint root_height); /*!< in: root node height in tree */
/***********************************************************//**
Frees the externally stored fields for a record, if the field is mentioned
in the update vector. */
@@ -207,186 +187,6 @@ btr_rec_free_externally_stored_fields(
/*==================== B-TREE SEARCH =========================*/
-/** Latches the leaf page or pages requested.
-@param[in] block leaf page where the search converged
-@param[in] latch_mode BTR_SEARCH_LEAF, ...
-@param[in] cursor cursor
-@param[in] mtr mini-transaction
-@return blocks and savepoints which actually latched. */
-btr_latch_leaves_t
-btr_cur_latch_leaves(
- buf_block_t* block,
- ulint latch_mode,
- btr_cur_t* cursor,
- mtr_t* mtr)
-{
- rw_lock_type_t mode;
- uint32_t left_page_no;
- uint32_t right_page_no;
- buf_block_t* get_block;
- bool spatial;
- btr_latch_leaves_t latch_leaves = {{NULL, NULL, NULL}, {0, 0, 0}};
-
- compile_time_assert(int(MTR_MEMO_PAGE_S_FIX) == int(RW_S_LATCH));
- compile_time_assert(int(MTR_MEMO_PAGE_X_FIX) == int(RW_X_LATCH));
- compile_time_assert(int(MTR_MEMO_PAGE_SX_FIX) == int(RW_SX_LATCH));
- ut_ad(block->page.id().space() == cursor->index->table->space->id);
-
- spatial = dict_index_is_spatial(cursor->index) && cursor->rtr_info;
- ut_ad(block->page.in_file());
-
- switch (latch_mode) {
- case BTR_SEARCH_LEAF:
- case BTR_MODIFY_LEAF:
- case BTR_SEARCH_TREE:
- if (spatial) {
- cursor->rtr_info->tree_savepoints[RTR_MAX_LEVELS]
- = mtr_set_savepoint(mtr);
- }
-
- mode = latch_mode == BTR_MODIFY_LEAF ? RW_X_LATCH : RW_S_LATCH;
- latch_leaves.savepoints[1] = mtr_set_savepoint(mtr);
- get_block = btr_block_get(*cursor->index,
- block->page.id().page_no(), mode,
- true, mtr);
- latch_leaves.blocks[1] = get_block;
-#ifdef UNIV_BTR_DEBUG
- ut_a(page_is_comp(get_block->frame)
- == page_is_comp(block->frame));
-#endif /* UNIV_BTR_DEBUG */
- if (spatial) {
- cursor->rtr_info->tree_blocks[RTR_MAX_LEVELS]
- = get_block;
- }
-
- return(latch_leaves);
- case BTR_MODIFY_TREE:
- /* It is exclusive for other operations which calls
- btr_page_set_prev() */
- ut_ad(mtr->memo_contains_flagged(&cursor->index->lock,
- MTR_MEMO_X_LOCK
- | MTR_MEMO_SX_LOCK));
- /* x-latch also siblings from left to right */
- left_page_no = btr_page_get_prev(block->frame);
-
- if (left_page_no != FIL_NULL) {
-
- if (spatial) {
- cursor->rtr_info->tree_savepoints[
- RTR_MAX_LEVELS] = mtr_set_savepoint(mtr);
- }
-
- latch_leaves.savepoints[0] = mtr_set_savepoint(mtr);
- get_block = btr_block_get(
- *cursor->index, left_page_no, RW_X_LATCH,
- true, mtr);
- latch_leaves.blocks[0] = get_block;
-
- if (spatial) {
- cursor->rtr_info->tree_blocks[RTR_MAX_LEVELS]
- = get_block;
- }
- }
-
- if (spatial) {
- cursor->rtr_info->tree_savepoints[RTR_MAX_LEVELS + 1]
- = mtr_set_savepoint(mtr);
- }
-
- latch_leaves.savepoints[1] = mtr_set_savepoint(mtr);
- get_block = btr_block_get(
- *cursor->index, block->page.id().page_no(),
- RW_X_LATCH, true, mtr);
- latch_leaves.blocks[1] = get_block;
-
-#ifdef UNIV_BTR_DEBUG
- /* Sanity check only after both the blocks are latched. */
- if (latch_leaves.blocks[0] != NULL) {
- ut_a(page_is_comp(latch_leaves.blocks[0]->frame)
- == page_is_comp(block->frame));
- ut_a(btr_page_get_next(latch_leaves.blocks[0]->frame)
- == block->page.id().page_no());
- }
- ut_a(page_is_comp(get_block->frame)
- == page_is_comp(block->frame));
-#endif /* UNIV_BTR_DEBUG */
-
- if (spatial) {
- cursor->rtr_info->tree_blocks[RTR_MAX_LEVELS + 1]
- = get_block;
- }
-
- right_page_no = btr_page_get_next(block->frame);
-
- if (right_page_no != FIL_NULL) {
- if (spatial) {
- cursor->rtr_info->tree_savepoints[
- RTR_MAX_LEVELS + 2] = mtr_set_savepoint(
- mtr);
- }
- latch_leaves.savepoints[2] = mtr_set_savepoint(mtr);
- get_block = btr_block_get(*cursor->index,
- right_page_no, RW_X_LATCH,
- true, mtr);
- latch_leaves.blocks[2] = get_block;
-#ifdef UNIV_BTR_DEBUG
- if (get_block) {
- ut_a(page_is_comp(get_block->frame)
- == page_is_comp(block->frame));
- ut_a(btr_page_get_prev(get_block->frame)
- == block->page.id().page_no());
- }
-#endif /* UNIV_BTR_DEBUG */
- if (spatial) {
- cursor->rtr_info->tree_blocks[
- RTR_MAX_LEVELS + 2] = get_block;
- }
- }
-
- return(latch_leaves);
-
- case BTR_SEARCH_PREV:
- case BTR_MODIFY_PREV:
- mode = latch_mode == BTR_SEARCH_PREV ? RW_S_LATCH : RW_X_LATCH;
- /* latch also left sibling */
- rw_lock_s_lock(&block->lock);
- left_page_no = btr_page_get_prev(block->frame);
- rw_lock_s_unlock(&block->lock);
-
- if (left_page_no != FIL_NULL) {
- latch_leaves.savepoints[0] = mtr_set_savepoint(mtr);
- get_block = btr_block_get(
- *cursor->index, left_page_no, mode,
- true, mtr);
- latch_leaves.blocks[0] = get_block;
- cursor->left_block = get_block;
-#ifdef UNIV_BTR_DEBUG
- ut_a(page_is_comp(get_block->frame)
- == page_is_comp(block->frame));
- ut_a(btr_page_get_next(get_block->frame)
- == block->page.id().page_no());
-#endif /* UNIV_BTR_DEBUG */
- }
-
- latch_leaves.savepoints[1] = mtr_set_savepoint(mtr);
- get_block = btr_block_get(*cursor->index,
- block->page.id().page_no(), mode,
- true, mtr);
- latch_leaves.blocks[1] = get_block;
-#ifdef UNIV_BTR_DEBUG
- ut_a(page_is_comp(get_block->frame)
- == page_is_comp(block->frame));
-#endif /* UNIV_BTR_DEBUG */
- return(latch_leaves);
- case BTR_CONT_MODIFY_TREE:
- ut_ad(dict_index_is_spatial(cursor->index));
- return(latch_leaves);
- }
-
- ut_error;
- return(latch_leaves);
-}
-
/** Load the instant ALTER TABLE metadata from the clustered index
when loading a table definition.
@param[in,out] index clustered index definition
@@ -401,24 +201,31 @@ static dberr_t btr_cur_instant_init_low(dict_index_t* index, mtr_t* mtr)
ut_ad(index->table->supports_instant());
ut_ad(index->table->is_readable());
+ dberr_t err;
const fil_space_t* space = index->table->space;
if (!space) {
+corrupted:
+ err = DB_CORRUPTION;
unreadable:
ib::error() << "Table " << index->table->name
<< " has an unreadable root page";
index->table->corrupted = true;
- return DB_CORRUPTION;
+ index->table->file_unreadable = true;
+ return err;
}
- page_t* root = btr_root_get(index, mtr);
-
- if (!root || btr_cur_instant_root_init(index, root)) {
+ buf_block_t* root = btr_root_block_get(index, RW_SX_LATCH, mtr, &err);
+ if (!root) {
goto unreadable;
}
+ if (btr_cur_instant_root_init(index, root->page.frame)) {
+ goto corrupted;
+ }
+
ut_ad(index->n_core_null_bytes != dict_index_t::NO_CORE_NULL_BYTES);
- if (fil_page_get_type(root) == FIL_PAGE_INDEX) {
+ if (fil_page_get_type(root->page.frame) == FIL_PAGE_INDEX) {
ut_ad(!index->is_instant());
return DB_SUCCESS;
}
@@ -427,26 +234,24 @@ unreadable:
/* Relax the assertion in rec_init_offsets(). */
ut_ad(!index->in_instant_init);
ut_d(index->in_instant_init = true);
- dberr_t err = btr_cur_open_at_index_side(true, index, BTR_SEARCH_LEAF,
- &cur, 0, mtr);
+ err = cur.open_leaf(true, index, BTR_SEARCH_LEAF, mtr);
ut_d(index->in_instant_init = false);
if (err != DB_SUCCESS) {
+ index->table->file_unreadable = true;
index->table->corrupted = true;
return err;
}
ut_ad(page_cur_is_before_first(&cur.page_cur));
- ut_ad(page_is_leaf(cur.page_cur.block->frame));
-
- page_cur_move_to_next(&cur.page_cur);
+ ut_ad(page_is_leaf(cur.page_cur.block->page.frame));
- const rec_t* rec = cur.page_cur.rec;
+ const rec_t* rec = page_cur_move_to_next(&cur.page_cur);
const ulint comp = dict_table_is_comp(index->table);
- const ulint info_bits = rec_get_info_bits(rec, comp);
+ const ulint info_bits = rec ? rec_get_info_bits(rec, comp) : 0;
if (page_rec_is_supremum(rec)
|| !(info_bits & REC_INFO_MIN_REC_FLAG)) {
- if (!index->is_instant()) {
+ if (rec && !index->is_instant()) {
/* The FIL_PAGE_TYPE_INSTANT and PAGE_INSTANT may be
assigned even if instant ADD COLUMN was not
committed. Changes to these page header fields are not
@@ -561,21 +366,26 @@ incompatible:
page_id_t(space->id,
mach_read_from_4(ptr + BTR_EXTERN_PAGE_NO)),
0, RW_S_LATCH, mtr);
- buf_block_dbg_add_level(block, SYNC_EXTERN_STORAGE);
- if (fil_page_get_type(block->frame) != FIL_PAGE_TYPE_BLOB
- || mach_read_from_4(&block->frame[FIL_PAGE_DATA
- + BTR_BLOB_HDR_NEXT_PAGE_NO])
+ if (!block) {
+ goto incompatible;
+ }
+
+ if (fil_page_get_type(block->page.frame) != FIL_PAGE_TYPE_BLOB
+ || mach_read_from_4(&block->page.frame
+ [FIL_PAGE_DATA
+ + BTR_BLOB_HDR_NEXT_PAGE_NO])
!= FIL_NULL
- || mach_read_from_4(&block->frame[FIL_PAGE_DATA
- + BTR_BLOB_HDR_PART_LEN])
+ || mach_read_from_4(&block->page.frame
+ [FIL_PAGE_DATA
+ + BTR_BLOB_HDR_PART_LEN])
!= len) {
goto incompatible;
}
/* The unused part of the BLOB page should be zero-filled. */
- for (const byte* b = block->frame
+ for (const byte* b = block->page.frame
+ (FIL_PAGE_DATA + BTR_BLOB_HDR_SIZE) + len,
- * const end = block->frame + srv_page_size
+ * const end = block->page.frame + srv_page_size
- BTR_EXTERN_LEN;
b < end; ) {
if (*b++) {
@@ -584,8 +394,8 @@ incompatible:
}
if (index->table->deserialise_columns(
- &block->frame[FIL_PAGE_DATA + BTR_BLOB_HDR_SIZE],
- len)) {
+ &block->page.frame
+ [FIL_PAGE_DATA + BTR_BLOB_HDR_SIZE], len)) {
goto incompatible;
}
@@ -678,14 +488,14 @@ index root page.
bool btr_cur_instant_root_init(dict_index_t* index, const page_t* page)
{
ut_ad(!index->is_dummy);
- ut_ad(fil_page_index_page_check(page));
- ut_ad(!page_has_siblings(page));
- ut_ad(page_get_space_id(page) == index->table->space_id);
- ut_ad(page_get_page_no(page) == index->page);
- ut_ad(!page_is_comp(page) == !dict_table_is_comp(index->table));
ut_ad(index->is_primary());
ut_ad(!index->is_instant());
ut_ad(index->table->supports_instant());
+
+ if (page_has_siblings(page)) {
+ return true;
+ }
+
/* This is normally executed as part of btr_cur_instant_init()
when dict_load_table_one() is loading a table definition.
Other threads should not access or modify the n_core_null_bytes,
@@ -696,13 +506,14 @@ bool btr_cur_instant_root_init(dict_index_t* index, const page_t* page)
switch (fil_page_get_type(page)) {
default:
- ut_ad("wrong page type" == 0);
return true;
case FIL_PAGE_INDEX:
/* The field PAGE_INSTANT is guaranteed 0 on clustered
index root pages of ROW_FORMAT=COMPACT or
ROW_FORMAT=DYNAMIC when instant ADD COLUMN is not used. */
- ut_ad(!page_is_comp(page) || !page_get_instant(page));
+ if (page_is_comp(page) && page_get_instant(page)) {
+ return true;
+ }
index->n_core_null_bytes = static_cast<uint8_t>(
UT_BITS_IN_BYTES(unsigned(index->n_nullable)));
return false;
@@ -757,114 +568,13 @@ bool btr_cur_instant_root_init(dict_index_t* index, const page_t* page)
return index->n_core_null_bytes > 128;
}
-/** Optimistically latches the leaf page or pages requested.
-@param[in] block guessed buffer block
-@param[in] modify_clock modify clock value
-@param[in,out] latch_mode BTR_SEARCH_LEAF, ...
-@param[in,out] cursor cursor
-@param[in] file file name
-@param[in] line line where called
-@param[in] mtr mini-transaction
-@return true if success */
-bool
-btr_cur_optimistic_latch_leaves(
- buf_block_t* block,
- ib_uint64_t modify_clock,
- ulint* latch_mode,
- btr_cur_t* cursor,
- const char* file,
- unsigned line,
- mtr_t* mtr)
-{
- ut_ad(block->page.buf_fix_count());
- ut_ad(block->page.state() == BUF_BLOCK_FILE_PAGE);
-
- switch (*latch_mode) {
- default:
- ut_error;
- return(false);
- case BTR_SEARCH_LEAF:
- case BTR_MODIFY_LEAF:
- return(buf_page_optimistic_get(*latch_mode, block,
- modify_clock, file, line, mtr));
- case BTR_SEARCH_PREV:
- case BTR_MODIFY_PREV:
- rw_lock_s_lock(&block->lock);
- if (block->modify_clock != modify_clock) {
- rw_lock_s_unlock(&block->lock);
- return false;
- }
- const uint32_t curr_page_no = block->page.id().page_no();
- const uint32_t left_page_no = btr_page_get_prev(block->frame);
- rw_lock_s_unlock(&block->lock);
-
- const rw_lock_type_t mode = *latch_mode == BTR_SEARCH_PREV
- ? RW_S_LATCH : RW_X_LATCH;
-
- if (left_page_no != FIL_NULL) {
- dberr_t err = DB_SUCCESS;
- cursor->left_block = buf_page_get_gen(
- page_id_t(cursor->index->table->space_id,
- left_page_no),
- cursor->index->table->space->zip_size(),
- mode, nullptr, BUF_GET_POSSIBLY_FREED,
- __FILE__, __LINE__, mtr, &err);
-
- if (!cursor->left_block) {
- cursor->index->table->file_unreadable = true;
- }
-
- if (cursor->left_block->page.status
- == buf_page_t::FREED
- || btr_page_get_next(cursor->left_block->frame)
- != curr_page_no) {
- /* release the left block */
- btr_leaf_page_release(
- cursor->left_block, mode, mtr);
- return false;
- }
- } else {
- cursor->left_block = NULL;
- }
-
- if (buf_page_optimistic_get(mode, block, modify_clock,
- file, line, mtr)) {
- if (btr_page_get_prev(block->frame) == left_page_no) {
- /* block was already buffer-fixed while
- entering the function and
- buf_page_optimistic_get() buffer-fixes
- it again. */
- ut_ad(2 <= block->page.buf_fix_count());
- *latch_mode = mode;
- return(true);
- } else {
- /* release the block and decrement of
- buf_fix_count which was incremented
- in buf_page_optimistic_get() */
- btr_leaf_page_release(block, mode, mtr);
- }
- }
-
- ut_ad(block->page.buf_fix_count());
- /* release the left block */
- if (cursor->left_block != NULL) {
- btr_leaf_page_release(cursor->left_block,
- mode, mtr);
- }
- }
-
- return false;
-}
-
/**
Gets intention in btr_intention_t from latch_mode, and cleares the intention
at the latch_mode.
@param latch_mode in/out: pointer to latch_mode
@return intention for latching tree */
static
-btr_intention_t
-btr_cur_get_and_clear_intention(
- ulint *latch_mode)
+btr_intention_t btr_cur_get_and_clear_intention(btr_latch_mode *latch_mode)
{
btr_intention_t intention;
@@ -879,41 +589,25 @@ btr_cur_get_and_clear_intention(
/* both or unknown */
intention = BTR_INTENTION_BOTH;
}
- *latch_mode &= ulint(~(BTR_LATCH_FOR_INSERT | BTR_LATCH_FOR_DELETE));
+ *latch_mode = btr_latch_mode(
+ *latch_mode & ~(BTR_LATCH_FOR_INSERT | BTR_LATCH_FOR_DELETE));
return(intention);
}
-/**
-Gets the desired latch type for the root leaf (root page is root leaf)
-at the latch mode.
-@param latch_mode in: BTR_SEARCH_LEAF, ...
-@return latch type */
-static
-rw_lock_type_t
-btr_cur_latch_for_root_leaf(
- ulint latch_mode)
+/** @return whether the distance between two records is at most the
+specified value */
+static bool
+page_rec_distance_is_at_most(const rec_t *left, const rec_t *right, ulint val)
{
- switch (latch_mode) {
- case BTR_SEARCH_LEAF:
- case BTR_SEARCH_TREE:
- case BTR_SEARCH_PREV:
- return(RW_S_LATCH);
- case BTR_MODIFY_LEAF:
- case BTR_MODIFY_TREE:
- case BTR_MODIFY_PREV:
- return(RW_X_LATCH);
- case BTR_CONT_MODIFY_TREE:
- case BTR_CONT_SEARCH_TREE:
- /* A root page should be latched already,
- and don't need to be latched here.
- fall through (RW_NO_LATCH) */
- case BTR_NO_LATCHES:
- return(RW_NO_LATCH);
- }
-
- ut_error;
- return(RW_NO_LATCH); /* avoid compiler warnings */
+ do
+ {
+ if (left == right)
+ return true;
+ left= page_rec_get_next_const(left);
+ }
+ while (left && val--);
+ return false;
}
/** Detects whether the modifying record might need a modifying tree structure.
@@ -1054,29 +748,34 @@ btr_cur_will_modify_tree(
/** Detects whether the modifying record might need a opposite modification
to the intention.
-@param[in] page page
-@param[in] lock_intention lock intention for the tree operation
-@param[in] rec record (current node_ptr)
+@param page page
+@param lock_intention lock intention for the tree operation
+@param node_ptr_max_size the maximum size of a node pointer
+@param compress_limit BTR_CUR_PAGE_COMPRESS_LIMIT(index)
+@param rec record (current node_ptr)
@return true if tree modification is needed */
-static
-bool
-btr_cur_need_opposite_intention(
- const page_t* page,
- btr_intention_t lock_intention,
- const rec_t* rec)
+static bool btr_cur_need_opposite_intention(const page_t *page,
+ btr_intention_t lock_intention,
+ ulint node_ptr_max_size,
+ ulint compress_limit,
+ const rec_t *rec)
{
- switch (lock_intention) {
- case BTR_INTENTION_DELETE:
- return (page_has_prev(page) && page_rec_is_first(rec, page)) ||
- (page_has_next(page) && page_rec_is_last(rec, page));
- case BTR_INTENTION_INSERT:
- return page_has_next(page) && page_rec_is_last(rec, page);
- case BTR_INTENTION_BOTH:
- return(false);
- }
-
- ut_error;
- return(false);
+ if (lock_intention != BTR_INTENTION_INSERT)
+ {
+ /* We compensate also for btr_cur_compress_recommendation() */
+ if (!page_has_siblings(page) ||
+ page_rec_is_first(rec, page) || page_rec_is_last(rec, page) ||
+ page_get_data_size(page) < node_ptr_max_size + compress_limit)
+ return true;
+ if (lock_intention == BTR_INTENTION_DELETE)
+ return false;
+ }
+ else if (page_has_next(page) && page_rec_is_last(rec, page))
+ return true;
+ LIMIT_OPTIMISTIC_INSERT_DEBUG(page_get_n_recs(page), return true);
+ const ulint max_size= page_get_max_insert_size_after_reorganize(page, 2);
+ return max_size < BTR_CUR_PAGE_REORGANIZE_LIMIT + node_ptr_max_size ||
+ max_size < node_ptr_max_size * 2;
}
/**
@@ -1218,1931 +917,1143 @@ static ulint btr_node_ptr_max_size(const dict_index_t* index)
return rec_max_size;
}
-/********************************************************************//**
-Searches an index tree and positions a tree cursor on a given level.
-NOTE: n_fields_cmp in tuple must be set so that it cannot be compared
-to node pointer page number fields on the upper levels of the tree!
-Note that if mode is PAGE_CUR_LE, which is used in inserts, then
-cursor->up_match and cursor->low_match both will have sensible values.
-If mode is PAGE_CUR_GE, then up_match will a have a sensible value.
-
-If mode is PAGE_CUR_LE , cursor is left at the place where an insert of the
-search tuple should be performed in the B-tree. InnoDB does an insert
-immediately after the cursor. Thus, the cursor may end up on a user record,
-or on a page infimum record.
-@param index index
-@param level the tree level of search
-@param tuple data tuple; NOTE: n_fields_cmp in tuple must be set so that
- it cannot get compared to the node ptr page number field!
-@param mode PAGE_CUR_L, NOTE that if the search is made using a unique
- prefix of a record, mode should be PAGE_CUR_LE, not
- PAGE_CUR_GE, as the latter may end up on the previous page of
- the record! Inserts should always be made using PAGE_CUR_LE
- to search the position!
-@param latch_mode BTR_SEARCH_LEAF, ..., ORed with at most one of BTR_INSERT,
- BTR_DELETE_MARK, BTR_DELETE, or BTR_ESTIMATE;
- cursor->left_block is used to store a pointer to the left
- neighbor page, in the cases BTR_SEARCH_PREV and
- BTR_MODIFY_PREV; NOTE that if ahi_latch, we might not have a
- cursor page latch, we assume that ahi_latch protects the
- record!
-@param cursor tree cursor; the cursor page is s- or x-latched, but see also
- above!
-@param file file name
-@param line line where called
-@param mtr mini-transaction
-@param autoinc PAGE_ROOT_AUTO_INC to be written (0 if none)
-@return DB_SUCCESS on success or error code otherwise */
-dberr_t btr_cur_search_to_nth_level(dict_index_t *index, ulint level,
- const dtuple_t *tuple,
- page_cur_mode_t mode, ulint latch_mode,
- btr_cur_t *cursor, const char *file,
- unsigned line, mtr_t *mtr,
- ib_uint64_t autoinc)
+/** @return a B-tree search mode suitable for non-leaf pages
+@param mode leaf page search mode */
+static inline page_cur_mode_t btr_cur_nonleaf_mode(page_cur_mode_t mode)
{
- page_t* page = NULL; /* remove warning */
- buf_block_t* block;
- buf_block_t* guess;
- ulint height;
- ulint up_match;
- ulint up_bytes;
- ulint low_match;
- ulint low_bytes;
- ulint rw_latch;
- page_cur_mode_t page_mode;
- page_cur_mode_t search_mode = PAGE_CUR_UNSUPP;
- ulint buf_mode;
- ulint estimate;
- ulint node_ptr_max_size = srv_page_size / 2;
- page_cur_t* page_cursor;
- btr_op_t btr_op;
- ulint root_height = 0; /* remove warning */
- dberr_t err = DB_SUCCESS;
-
- btr_intention_t lock_intention;
- bool modify_external;
- buf_block_t* tree_blocks[BTR_MAX_LEVELS];
- ulint tree_savepoints[BTR_MAX_LEVELS];
- ulint n_blocks = 0;
- ulint n_releases = 0;
- bool detected_same_key_root = false;
-
- bool retrying_for_search_prev = false;
- ulint leftmost_from_level = 0;
- buf_block_t** prev_tree_blocks = NULL;
- ulint* prev_tree_savepoints = NULL;
- ulint prev_n_blocks = 0;
- ulint prev_n_releases = 0;
- bool need_path = true;
- bool rtree_parent_modified = false;
- bool mbr_adj = false;
- bool found = false;
-
- DBUG_ENTER("btr_cur_search_to_nth_level");
-
-#ifdef BTR_CUR_ADAPT
- btr_search_t* info;
-#endif /* BTR_CUR_ADAPT */
- mem_heap_t* heap = NULL;
- rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
- rec_offs* offsets = offsets_;
- rec_offs offsets2_[REC_OFFS_NORMAL_SIZE];
- rec_offs* offsets2 = offsets2_;
- rec_offs_init(offsets_);
- rec_offs_init(offsets2_);
- /* Currently, PAGE_CUR_LE is the only search mode used for searches
- ending to upper levels */
-
- ut_ad(level == 0 || mode == PAGE_CUR_LE
- || RTREE_SEARCH_MODE(mode));
- ut_ad(dict_index_check_search_tuple(index, tuple));
- ut_ad(!dict_index_is_ibuf(index) || ibuf_inside(mtr));
- ut_ad(dtuple_check_typed(tuple));
- ut_ad(!(index->type & DICT_FTS));
- ut_ad(index->page != FIL_NULL);
-
- MEM_UNDEFINED(&cursor->up_match, sizeof cursor->up_match);
- MEM_UNDEFINED(&cursor->up_bytes, sizeof cursor->up_bytes);
- MEM_UNDEFINED(&cursor->low_match, sizeof cursor->low_match);
- MEM_UNDEFINED(&cursor->low_bytes, sizeof cursor->low_bytes);
-#ifdef UNIV_DEBUG
- cursor->up_match = ULINT_UNDEFINED;
- cursor->low_match = ULINT_UNDEFINED;
-#endif /* UNIV_DEBUG */
-
- ibool s_latch_by_caller;
-
- s_latch_by_caller = latch_mode & BTR_ALREADY_S_LATCHED;
-
- ut_ad(!s_latch_by_caller
- || srv_read_only_mode
- || mtr->memo_contains_flagged(&index->lock, MTR_MEMO_S_LOCK
- | MTR_MEMO_SX_LOCK));
-
- /* These flags are mutually exclusive, they are lumped together
- with the latch mode for historical reasons. It's possible for
- none of the flags to be set. */
- switch (UNIV_EXPECT(latch_mode
- & (BTR_INSERT | BTR_DELETE | BTR_DELETE_MARK),
- 0)) {
- case 0:
- btr_op = BTR_NO_OP;
- break;
- case BTR_INSERT:
- btr_op = (latch_mode & BTR_IGNORE_SEC_UNIQUE)
- ? BTR_INSERT_IGNORE_UNIQUE_OP
- : BTR_INSERT_OP;
- break;
- case BTR_DELETE:
- btr_op = BTR_DELETE_OP;
- ut_a(cursor->purge_node);
- break;
- case BTR_DELETE_MARK:
- btr_op = BTR_DELMARK_OP;
- break;
- default:
- /* only one of BTR_INSERT, BTR_DELETE, BTR_DELETE_MARK
- should be specified at a time */
- ut_error;
- }
-
- /* Operations on the insert buffer tree cannot be buffered. */
- ut_ad(btr_op == BTR_NO_OP || !dict_index_is_ibuf(index));
- /* Operations on the clustered index cannot be buffered. */
- ut_ad(btr_op == BTR_NO_OP || !dict_index_is_clust(index));
- /* Operations on the temporary table(indexes) cannot be buffered. */
- ut_ad(btr_op == BTR_NO_OP || !index->table->is_temporary());
- /* Operation on the spatial index cannot be buffered. */
- ut_ad(btr_op == BTR_NO_OP || !dict_index_is_spatial(index));
-
- estimate = latch_mode & BTR_ESTIMATE;
-
- lock_intention = btr_cur_get_and_clear_intention(&latch_mode);
-
- modify_external = latch_mode & BTR_MODIFY_EXTERNAL;
-
- /* Turn the flags unrelated to the latch mode off. */
- latch_mode = BTR_LATCH_MODE_WITHOUT_FLAGS(latch_mode);
+ if (mode > PAGE_CUR_GE)
+ {
+ ut_ad(mode == PAGE_CUR_L || mode == PAGE_CUR_LE);
+ return mode;
+ }
+ if (mode == PAGE_CUR_GE)
+ return PAGE_CUR_L;
+ ut_ad(mode == PAGE_CUR_G);
+ return PAGE_CUR_LE;
+}
- ut_ad(!modify_external || latch_mode == BTR_MODIFY_LEAF);
+dberr_t btr_cur_t::search_leaf(const dtuple_t *tuple, page_cur_mode_t mode,
+ btr_latch_mode latch_mode, mtr_t *mtr)
+{
+ ut_ad(index()->is_btree() || index()->is_ibuf());
+ ut_ad(!index()->is_ibuf() || ibuf_inside(mtr));
+
+ buf_block_t *guess;
+ btr_op_t btr_op;
+ btr_intention_t lock_intention;
+ bool detected_same_key_root= false;
+
+ mem_heap_t* heap = NULL;
+ rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
+ rec_offs* offsets = offsets_;
+ rec_offs offsets2_[REC_OFFS_NORMAL_SIZE];
+ rec_offs* offsets2 = offsets2_;
+ rec_offs_init(offsets_);
+ rec_offs_init(offsets2_);
+
+ ut_ad(dict_index_check_search_tuple(index(), tuple));
+ ut_ad(dtuple_check_typed(tuple));
+ ut_ad(index()->page != FIL_NULL);
+
+ MEM_UNDEFINED(&up_match, sizeof up_match);
+ MEM_UNDEFINED(&up_bytes, sizeof up_bytes);
+ MEM_UNDEFINED(&low_match, sizeof low_match);
+ MEM_UNDEFINED(&low_bytes, sizeof low_bytes);
+ ut_d(up_match= ULINT_UNDEFINED);
+ ut_d(low_match= ULINT_UNDEFINED);
+
+ ut_ad(!(latch_mode & BTR_ALREADY_S_LATCHED) ||
+ mtr->memo_contains_flagged(&index()->lock,
+ MTR_MEMO_S_LOCK | MTR_MEMO_SX_LOCK |
+ MTR_MEMO_X_LOCK));
+
+ /* These flags are mutually exclusive, they are lumped together
+ with the latch mode for historical reasons. It's possible for
+ none of the flags to be set. */
+ switch (UNIV_EXPECT(latch_mode & BTR_DELETE, 0)) {
+ default:
+ btr_op= BTR_NO_OP;
+ break;
+ case BTR_INSERT:
+ btr_op= (latch_mode & BTR_IGNORE_SEC_UNIQUE)
+ ? BTR_INSERT_IGNORE_UNIQUE_OP
+ : BTR_INSERT_OP;
+ break;
+ case BTR_DELETE:
+ btr_op= BTR_DELETE_OP;
+ ut_a(purge_node);
+ break;
+ case BTR_DELETE_MARK:
+ btr_op= BTR_DELMARK_OP;
+ break;
+ }
- ut_ad(!s_latch_by_caller
- || latch_mode == BTR_SEARCH_LEAF
- || latch_mode == BTR_SEARCH_TREE
- || latch_mode == BTR_MODIFY_LEAF);
+ /* Operations on the insert buffer tree cannot be buffered. */
+ ut_ad(btr_op == BTR_NO_OP || !index()->is_ibuf());
+ /* Operations on the clustered index cannot be buffered. */
+ ut_ad(btr_op == BTR_NO_OP || !index()->is_clust());
+ /* Operations on the temporary table(indexes) cannot be buffered. */
+ ut_ad(btr_op == BTR_NO_OP || !index()->table->is_temporary());
- ut_ad(autoinc == 0 || dict_index_is_clust(index));
- ut_ad(autoinc == 0
- || latch_mode == BTR_MODIFY_TREE
- || latch_mode == BTR_MODIFY_LEAF);
- ut_ad(autoinc == 0 || level == 0);
+ const bool latch_by_caller= latch_mode & BTR_ALREADY_S_LATCHED;
+ lock_intention= btr_cur_get_and_clear_intention(&latch_mode);
+ latch_mode= BTR_LATCH_MODE_WITHOUT_FLAGS(latch_mode);
- cursor->flag = BTR_CUR_BINARY;
- cursor->index = index;
+ ut_ad(!latch_by_caller
+ || latch_mode == BTR_SEARCH_LEAF
+ || latch_mode == BTR_MODIFY_LEAF
+ || latch_mode == BTR_MODIFY_TREE
+ || latch_mode == BTR_MODIFY_ROOT_AND_LEAF);
+ flag= BTR_CUR_BINARY;
#ifndef BTR_CUR_ADAPT
- guess = NULL;
+ guess= nullptr;
#else
- info = btr_search_get_info(index);
- guess = info->root_guess;
-
-#ifdef BTR_CUR_HASH_ADAPT
+ btr_search_t *info= btr_search_get_info(index());
+ guess= info->root_guess;
+
+# ifdef BTR_CUR_HASH_ADAPT
+# ifdef UNIV_SEARCH_PERF_STAT
+ info->n_searches++;
+# endif
+ /* We do a dirty read of btr_search_enabled below,
+ and btr_search_guess_on_hash() will have to check it again. */
+ if (!btr_search_enabled);
+ else if (btr_search_guess_on_hash(index(), info, tuple, mode,
+ latch_mode, this, mtr))
+ {
+ /* Search using the hash index succeeded */
+ ut_ad(up_match != ULINT_UNDEFINED || mode != PAGE_CUR_GE);
+ ut_ad(up_match != ULINT_UNDEFINED || mode != PAGE_CUR_LE);
+ ut_ad(low_match != ULINT_UNDEFINED || mode != PAGE_CUR_LE);
+ ++btr_cur_n_sea;
-# ifdef UNIV_SEARCH_PERF_STAT
- info->n_searches++;
-# endif
- if (autoinc == 0
- && latch_mode <= BTR_MODIFY_LEAF
- && info->last_hash_succ
-# ifdef MYSQL_INDEX_DISABLE_AHI
- && !index->disable_ahi
+ return DB_SUCCESS;
+ }
+ else
+ ++btr_cur_n_non_sea;
# endif
- && !estimate
-# ifdef PAGE_CUR_LE_OR_EXTENDS
- && mode != PAGE_CUR_LE_OR_EXTENDS
-# endif /* PAGE_CUR_LE_OR_EXTENDS */
- && !dict_index_is_spatial(index)
- /* We do a dirty read of
- btr_search_enabled below, and btr_search_guess_on_hash()
- will have to check it again. */
- && btr_search_enabled
- && !modify_external
- && !(tuple->info_bits & REC_INFO_MIN_REC_FLAG)
- && btr_search_guess_on_hash(index, info, tuple, mode,
- latch_mode, cursor, mtr)) {
-
- /* Search using the hash index succeeded */
-
- ut_ad(cursor->up_match != ULINT_UNDEFINED
- || mode != PAGE_CUR_GE);
- ut_ad(cursor->up_match != ULINT_UNDEFINED
- || mode != PAGE_CUR_LE);
- ut_ad(cursor->low_match != ULINT_UNDEFINED
- || mode != PAGE_CUR_LE);
- btr_cur_n_sea++;
-
- DBUG_RETURN(err);
- }
-# endif /* BTR_CUR_HASH_ADAPT */
-#endif /* BTR_CUR_ADAPT */
- btr_cur_n_non_sea++;
-
- /* If the hash search did not succeed, do binary search down the
- tree */
-
- /* Store the position of the tree latch we push to mtr so that we
- know how to release it when we have latched leaf node(s) */
-
- ulint savepoint = mtr_set_savepoint(mtr);
-
- rw_lock_type_t upper_rw_latch;
-
- switch (latch_mode) {
- case BTR_MODIFY_TREE:
- /* Most of delete-intended operations are purging.
- Free blocks and read IO bandwidth should be prior
- for them, when the history list is glowing huge. */
- if (lock_intention == BTR_INTENTION_DELETE
- && trx_sys.rseg_history_len > BTR_CUR_FINE_HISTORY_LENGTH
- && buf_pool.n_pend_reads) {
-x_latch_index:
- mtr_x_lock_index(index, mtr);
- } else if (index->is_spatial()
- && lock_intention <= BTR_INTENTION_BOTH) {
- /* X lock the if there is possibility of
- pessimistic delete on spatial index. As we could
- lock upward for the tree */
- goto x_latch_index;
- } else {
- mtr_sx_lock_index(index, mtr);
- }
- upper_rw_latch = RW_X_LATCH;
- break;
- case BTR_CONT_MODIFY_TREE:
- case BTR_CONT_SEARCH_TREE:
- /* Do nothing */
- ut_ad(srv_read_only_mode
- || mtr->memo_contains_flagged(&index->lock,
- MTR_MEMO_X_LOCK
- | MTR_MEMO_SX_LOCK));
- if (dict_index_is_spatial(index)
- && latch_mode == BTR_CONT_MODIFY_TREE) {
- /* If we are about to locating parent page for split
- and/or merge operation for R-Tree index, X latch
- the parent */
- upper_rw_latch = RW_X_LATCH;
- } else {
- upper_rw_latch = RW_NO_LATCH;
- }
- break;
- default:
- if (!srv_read_only_mode) {
- if (s_latch_by_caller) {
- ut_ad(rw_lock_own(dict_index_get_lock(index),
- RW_LOCK_S));
- } else if (!modify_external) {
- /* BTR_SEARCH_TREE is intended to be used with
- BTR_ALREADY_S_LATCHED */
- ut_ad(latch_mode != BTR_SEARCH_TREE);
-
- mtr_s_lock_index(index, mtr);
- } else {
- /* BTR_MODIFY_EXTERNAL needs to be excluded */
- mtr_sx_lock_index(index, mtr);
- }
- upper_rw_latch = RW_S_LATCH;
- } else {
- upper_rw_latch = RW_NO_LATCH;
- }
- }
- const rw_lock_type_t root_leaf_rw_latch = btr_cur_latch_for_root_leaf(
- latch_mode);
-
- page_cursor = btr_cur_get_page_cur(cursor);
-
- const ulint zip_size = index->table->space->zip_size();
-
- /* Start with the root page. */
- page_id_t page_id(index->table->space_id, index->page);
-
- if (root_leaf_rw_latch == RW_X_LATCH) {
- node_ptr_max_size = btr_node_ptr_max_size(index);
- }
-
- up_match = 0;
- up_bytes = 0;
- low_match = 0;
- low_bytes = 0;
-
- height = ULINT_UNDEFINED;
-
- /* We use these modified search modes on non-leaf levels of the
- B-tree. These let us end up in the right B-tree leaf. In that leaf
- we use the original search mode. */
-
- switch (mode) {
- case PAGE_CUR_GE:
- page_mode = PAGE_CUR_L;
- break;
- case PAGE_CUR_G:
- page_mode = PAGE_CUR_LE;
- break;
- default:
-#ifdef PAGE_CUR_LE_OR_EXTENDS
- ut_ad(mode == PAGE_CUR_L || mode == PAGE_CUR_LE
- || RTREE_SEARCH_MODE(mode)
- || mode == PAGE_CUR_LE_OR_EXTENDS);
-#else /* PAGE_CUR_LE_OR_EXTENDS */
- ut_ad(mode == PAGE_CUR_L || mode == PAGE_CUR_LE
- || RTREE_SEARCH_MODE(mode));
-#endif /* PAGE_CUR_LE_OR_EXTENDS */
- page_mode = mode;
- break;
- }
-
- /* Loop and search until we arrive at the desired level */
- btr_latch_leaves_t latch_leaves = {{NULL, NULL, NULL}, {0, 0, 0}};
-
-search_loop:
- buf_mode = BUF_GET;
- rw_latch = RW_NO_LATCH;
- rtree_parent_modified = false;
-
- if (height != 0) {
- /* We are about to fetch the root or a non-leaf page. */
- if ((latch_mode != BTR_MODIFY_TREE || height == level)
- && !retrying_for_search_prev) {
- /* If doesn't have SX or X latch of index,
- each pages should be latched before reading. */
- if (height == ULINT_UNDEFINED
- && upper_rw_latch == RW_S_LATCH
- && (modify_external || autoinc)) {
- /* needs sx-latch of root page
- for fseg operation or for writing
- PAGE_ROOT_AUTO_INC */
- rw_latch = RW_SX_LATCH;
- } else {
- rw_latch = upper_rw_latch;
- }
- }
- } else if (latch_mode <= BTR_MODIFY_LEAF) {
- rw_latch = latch_mode;
-
- if (btr_op != BTR_NO_OP
- && ibuf_should_try(index, btr_op != BTR_INSERT_OP)) {
-
- /* Try to buffer the operation if the leaf
- page is not in the buffer pool. */
-
- buf_mode = btr_op == BTR_DELETE_OP
- ? BUF_GET_IF_IN_POOL_OR_WATCH
- : BUF_GET_IF_IN_POOL;
- }
- }
-
-retry_page_get:
- ut_ad(n_blocks < BTR_MAX_LEVELS);
- tree_savepoints[n_blocks] = mtr_set_savepoint(mtr);
- block = buf_page_get_gen(page_id, zip_size, rw_latch, guess,
- buf_mode, file, line, mtr, &err,
- height == 0 && !index->is_clust());
- tree_blocks[n_blocks] = block;
-
- /* Note that block==NULL signifies either an error or change
- buffering. */
-
- if (err != DB_SUCCESS) {
- ut_ad(block == NULL);
- if (err == DB_DECRYPTION_FAILED) {
- ib_push_warning((void *)NULL,
- DB_DECRYPTION_FAILED,
- "Table %s is encrypted but encryption service or"
- " used key_id is not available. "
- " Can't continue reading table.",
- index->table->name.m_name);
- index->table->file_unreadable = true;
- }
-
- goto func_exit;
- }
-
- if (block == NULL) {
- /* This must be a search to perform an insert/delete
- mark/ delete; try using the insert/delete buffer */
-
- ut_ad(height == 0);
- ut_ad(cursor->thr);
-
- switch (btr_op) {
- case BTR_INSERT_OP:
- case BTR_INSERT_IGNORE_UNIQUE_OP:
- ut_ad(buf_mode == BUF_GET_IF_IN_POOL);
- ut_ad(!dict_index_is_spatial(index));
-
- if (ibuf_insert(IBUF_OP_INSERT, tuple, index,
- page_id, zip_size, cursor->thr)) {
-
- cursor->flag = BTR_CUR_INSERT_TO_IBUF;
-
- goto func_exit;
- }
- break;
-
- case BTR_DELMARK_OP:
- ut_ad(buf_mode == BUF_GET_IF_IN_POOL);
- ut_ad(!dict_index_is_spatial(index));
-
- if (ibuf_insert(IBUF_OP_DELETE_MARK, tuple,
- index, page_id, zip_size,
- cursor->thr)) {
-
- cursor->flag = BTR_CUR_DEL_MARK_IBUF;
-
- goto func_exit;
- }
-
- break;
-
- case BTR_DELETE_OP:
- ut_ad(buf_mode == BUF_GET_IF_IN_POOL_OR_WATCH);
- ut_ad(!dict_index_is_spatial(index));
-
- if (!row_purge_poss_sec(cursor->purge_node,
- index, tuple)) {
-
- /* The record cannot be purged yet. */
- cursor->flag = BTR_CUR_DELETE_REF;
- } else if (ibuf_insert(IBUF_OP_DELETE, tuple,
- index, page_id, zip_size,
- cursor->thr)) {
-
- /* The purge was buffered. */
- cursor->flag = BTR_CUR_DELETE_IBUF;
- } else {
- /* The purge could not be buffered. */
- buf_pool.watch_unset(page_id);
- break;
- }
-
- buf_pool.watch_unset(page_id);
- goto func_exit;
-
- default:
- ut_error;
- }
-
- /* Insert to the insert/delete buffer did not succeed, we
- must read the page from disk. */
-
- buf_mode = BUF_GET;
-
- goto retry_page_get;
- }
-
- if (retrying_for_search_prev && height != 0) {
- /* also latch left sibling */
- uint32_t left_page_no;
- buf_block_t* get_block;
-
- ut_ad(rw_latch == RW_NO_LATCH);
-
- rw_latch = upper_rw_latch;
-
- rw_lock_s_lock(&block->lock);
- left_page_no = btr_page_get_prev(buf_block_get_frame(block));
- rw_lock_s_unlock(&block->lock);
-
- if (left_page_no != FIL_NULL) {
- ut_ad(prev_n_blocks < leftmost_from_level);
-
- prev_tree_savepoints[prev_n_blocks]
- = mtr_set_savepoint(mtr);
- get_block = buf_page_get_gen(
- page_id_t(page_id.space(), left_page_no),
- zip_size, rw_latch, NULL, buf_mode,
- file, line, mtr, &err);
- prev_tree_blocks[prev_n_blocks] = get_block;
- prev_n_blocks++;
-
- if (err != DB_SUCCESS) {
- if (err == DB_DECRYPTION_FAILED) {
- ib_push_warning((void *)NULL,
- DB_DECRYPTION_FAILED,
- "Table %s is encrypted but encryption service or"
- " used key_id is not available. "
- " Can't continue reading table.",
- index->table->name.m_name);
- index->table->file_unreadable = true;
- }
+#endif
- goto func_exit;
- }
+ /* If the hash search did not succeed, do binary search down the
+ tree */
- /* BTR_MODIFY_TREE doesn't update prev/next_page_no,
- without their parent page's lock. So, not needed to
- retry here, because we have the parent page's lock. */
- }
+ /* Store the position of the tree latch we push to mtr so that we
+ know how to release it when we have latched leaf node(s) */
- /* release RW_NO_LATCH page and lock with RW_S_LATCH */
- mtr_release_block_at_savepoint(
- mtr, tree_savepoints[n_blocks],
- tree_blocks[n_blocks]);
+ const ulint savepoint= mtr->get_savepoint();
- tree_savepoints[n_blocks] = mtr_set_savepoint(mtr);
- block = buf_page_get_gen(page_id, zip_size,
- rw_latch, NULL, buf_mode,
- file, line, mtr, &err);
- tree_blocks[n_blocks] = block;
+ ulint node_ptr_max_size= 0, compress_limit= 0;
+ rw_lock_type_t rw_latch= RW_S_LATCH;
- if (err != DB_SUCCESS) {
- if (err == DB_DECRYPTION_FAILED) {
- ib_push_warning((void *)NULL,
- DB_DECRYPTION_FAILED,
- "Table %s is encrypted but encryption service or"
- " used key_id is not available. "
- " Can't continue reading table.",
- index->table->name.m_name);
- index->table->file_unreadable = true;
- }
-
- goto func_exit;
- }
- }
+ switch (latch_mode) {
+ case BTR_MODIFY_TREE:
+ rw_latch= RW_X_LATCH;
+ node_ptr_max_size= btr_node_ptr_max_size(index());
+ if (latch_by_caller)
+ {
+ ut_ad(mtr->memo_contains_flagged(&index()->lock, MTR_MEMO_X_LOCK));
+ break;
+ }
+ if (lock_intention == BTR_INTENTION_DELETE)
+ {
+ compress_limit= BTR_CUR_PAGE_COMPRESS_LIMIT(index());
+ if (os_aio_pending_reads_approx() &&
+ trx_sys.history_size_approx() > BTR_CUR_FINE_HISTORY_LENGTH)
+ {
+ /* Most delete-intended operations are due to the purge of history.
+ Prioritize them when the history list is growing huge. */
+ mtr_x_lock_index(index(), mtr);
+ break;
+ }
+ }
+ mtr_sx_lock_index(index(), mtr);
+ break;
+#ifdef UNIV_DEBUG
+ case BTR_CONT_MODIFY_TREE:
+ ut_ad("invalid mode" == 0);
+ break;
+#endif
+ case BTR_MODIFY_ROOT_AND_LEAF:
+ rw_latch= RW_SX_LATCH;
+ /* fall through */
+ default:
+ if (!latch_by_caller)
+ mtr_s_lock_index(index(), mtr);
+ }
- page = buf_block_get_frame(block);
+ const ulint zip_size= index()->table->space->zip_size();
+
+ /* Start with the root page. */
+ page_id_t page_id(index()->table->space_id, index()->page);
+
+ const page_cur_mode_t page_mode= btr_cur_nonleaf_mode(mode);
+ ulint height= ULINT_UNDEFINED;
+ up_match= 0;
+ up_bytes= 0;
+ low_match= 0;
+ low_bytes= 0;
+ ulint buf_mode= BUF_GET;
+ search_loop:
+ dberr_t err;
+ auto block_savepoint= mtr->get_savepoint();
+ buf_block_t *block=
+ buf_page_get_gen(page_id, zip_size, rw_latch, guess, buf_mode, mtr,
+ &err, height == 0 && !index()->is_clust());
+ if (!block)
+ {
+ switch (err) {
+ case DB_DECRYPTION_FAILED:
+ btr_decryption_failed(*index());
+ /* fall through */
+ default:
+ func_exit:
+ if (UNIV_LIKELY_NULL(heap))
+ mem_heap_free(heap);
+ return err;
+ case DB_SUCCESS:
+ /* This must be a search to perform an insert, delete mark, or delete;
+ try using the change buffer */
+ ut_ad(height == 0);
+ ut_ad(thr);
+ break;
+ }
- if (height == ULINT_UNDEFINED
- && page_is_leaf(page)
- && rw_latch != RW_NO_LATCH
- && rw_latch != root_leaf_rw_latch) {
- /* The root page is also a leaf page (root_leaf).
- We should reacquire the page, because the root page
- is latched differently from leaf pages. */
- ut_ad(root_leaf_rw_latch != RW_NO_LATCH);
- ut_ad(rw_latch == RW_S_LATCH || rw_latch == RW_SX_LATCH);
- ut_ad(rw_latch == RW_S_LATCH || modify_external || autoinc);
- ut_ad(!autoinc || root_leaf_rw_latch == RW_X_LATCH);
+ switch (btr_op) {
+ default:
+ MY_ASSERT_UNREACHABLE();
+ break;
+ case BTR_INSERT_OP:
+ case BTR_INSERT_IGNORE_UNIQUE_OP:
+ ut_ad(buf_mode == BUF_GET_IF_IN_POOL);
+
+ if (ibuf_insert(IBUF_OP_INSERT, tuple, index(), page_id, zip_size, thr))
+ {
+ flag= BTR_CUR_INSERT_TO_IBUF;
+ goto func_exit;
+ }
+ break;
+
+ case BTR_DELMARK_OP:
+ ut_ad(buf_mode == BUF_GET_IF_IN_POOL);
+
+ if (ibuf_insert(IBUF_OP_DELETE_MARK, tuple,
+ index(), page_id, zip_size, thr))
+ {
+ flag = BTR_CUR_DEL_MARK_IBUF;
+ goto func_exit;
+ }
+
+ break;
+
+ case BTR_DELETE_OP:
+ ut_ad(buf_mode == BUF_GET_IF_IN_POOL_OR_WATCH);
+ auto& chain = buf_pool.page_hash.cell_get(page_id.fold());
+
+ if (!row_purge_poss_sec(purge_node, index(), tuple))
+ /* The record cannot be purged yet. */
+ flag= BTR_CUR_DELETE_REF;
+ else if (ibuf_insert(IBUF_OP_DELETE, tuple, index(),
+ page_id, zip_size, thr))
+ /* The purge was buffered. */
+ flag= BTR_CUR_DELETE_IBUF;
+ else
+ {
+ /* The purge could not be buffered. */
+ buf_pool.watch_unset(page_id, chain);
+ break;
+ }
+
+ buf_pool.watch_unset(page_id, chain);
+ goto func_exit;
+ }
- ut_ad(n_blocks == 0);
- mtr_release_block_at_savepoint(
- mtr, tree_savepoints[n_blocks],
- tree_blocks[n_blocks]);
+ /* Change buffering did not succeed, we must read the page. */
+ buf_mode= BUF_GET;
+ goto search_loop;
+ }
- upper_rw_latch = root_leaf_rw_latch;
- goto search_loop;
- }
+ if (!!page_is_comp(block->page.frame) != index()->table->not_redundant() ||
+ btr_page_get_index_id(block->page.frame) != index()->id ||
+ fil_page_get_type(block->page.frame) == FIL_PAGE_RTREE ||
+ !fil_page_index_page_check(block->page.frame))
+ {
+ corrupted:
+ ut_ad("corrupted" == 0); // FIXME: remove this
+ err= DB_CORRUPTION;
+ goto func_exit;
+ }
- if (rw_latch != RW_NO_LATCH) {
+ page_cur.block= block;
+ ut_ad(block == mtr->at_savepoint(block_savepoint));
#ifdef UNIV_ZIP_DEBUG
- const page_zip_des_t* page_zip
- = buf_block_get_page_zip(block);
- ut_a(!page_zip || page_zip_validate(page_zip, page, index));
+ if (rw_latch == RW_NO_LATCH);
+ else if (const page_zip_des_t *page_zip= buf_block_get_page_zip(block))
+ ut_a(page_zip_validate(page_zip, block->page.frame, index()));
#endif /* UNIV_ZIP_DEBUG */
+ const uint32_t page_level= btr_page_get_level(block->page.frame);
- buf_block_dbg_add_level(
- block, dict_index_is_ibuf(index)
- ? SYNC_IBUF_TREE_NODE : SYNC_TREE_NODE);
- }
-
- ut_ad(fil_page_index_page_check(page));
- ut_ad(index->id == btr_page_get_index_id(page));
-
- if (height == ULINT_UNDEFINED) {
- /* We are in the root node */
-
- height = btr_page_get_level(page);
- root_height = height;
- cursor->tree_height = root_height + 1;
-
- if (dict_index_is_spatial(index)) {
- ut_ad(cursor->rtr_info);
-
- /* If SSN in memory is not initialized, fetch
- it from root page */
- if (!rtr_get_current_ssn_id(index)) {
- /* FIXME: do this in dict_load_table_one() */
- index->set_ssn(page_get_ssn_id(page) + 1);
- }
-
- /* Save the MBR */
- cursor->rtr_info->thr = cursor->thr;
- rtr_get_mbr_from_tuple(tuple, &cursor->rtr_info->mbr);
- }
-
+ if (height == ULINT_UNDEFINED)
+ {
+ /* We are in the B-tree index root page. */
#ifdef BTR_CUR_ADAPT
- info->root_guess = block;
+ info->root_guess= block;
#endif
- }
-
- if (height == 0) {
- if (rw_latch == RW_NO_LATCH) {
- latch_leaves = btr_cur_latch_leaves(
- block, latch_mode, cursor, mtr);
- }
-
- switch (latch_mode) {
- case BTR_MODIFY_TREE:
- case BTR_CONT_MODIFY_TREE:
- case BTR_CONT_SEARCH_TREE:
- break;
- default:
- if (!s_latch_by_caller
- && !srv_read_only_mode
- && !modify_external) {
- /* Release the tree s-latch */
- /* NOTE: BTR_MODIFY_EXTERNAL
- needs to keep tree sx-latch */
- mtr_release_s_latch_at_savepoint(
- mtr, savepoint,
- dict_index_get_lock(index));
- }
-
- /* release upper blocks */
- if (retrying_for_search_prev) {
- ut_ad(!autoinc);
- for (;
- prev_n_releases < prev_n_blocks;
- prev_n_releases++) {
- mtr_release_block_at_savepoint(
- mtr,
- prev_tree_savepoints[
- prev_n_releases],
- prev_tree_blocks[
- prev_n_releases]);
- }
- }
-
- for (; n_releases < n_blocks; n_releases++) {
- if (n_releases == 0
- && (modify_external || autoinc)) {
- /* keep the root page latch */
- ut_ad(mtr->memo_contains_flagged(
- tree_blocks[n_releases],
- MTR_MEMO_PAGE_SX_FIX
- | MTR_MEMO_PAGE_X_FIX));
- continue;
- }
+ height= page_level;
+ tree_height= height + 1;
- mtr_release_block_at_savepoint(
- mtr, tree_savepoints[n_releases],
- tree_blocks[n_releases]);
- }
- }
-
- page_mode = mode;
- }
-
- if (dict_index_is_spatial(index)) {
- /* Remember the page search mode */
- search_mode = page_mode;
-
- /* Some adjustment on search mode, when the
- page search mode is PAGE_CUR_RTREE_LOCATE
- or PAGE_CUR_RTREE_INSERT, as we are searching
- with MBRs. When it is not the target level, we
- should search all sub-trees that "CONTAIN" the
- search range/MBR. When it is at the target
- level, the search becomes PAGE_CUR_LE */
- if (page_mode == PAGE_CUR_RTREE_LOCATE
- && level == height) {
- if (level == 0) {
- page_mode = PAGE_CUR_LE;
- } else {
- page_mode = PAGE_CUR_RTREE_GET_FATHER;
- }
- }
-
- if (page_mode == PAGE_CUR_RTREE_INSERT) {
- page_mode = (level == height)
- ? PAGE_CUR_LE
- : PAGE_CUR_RTREE_INSERT;
-
- ut_ad(!page_is_leaf(page) || page_mode == PAGE_CUR_LE);
- }
-
- /* "need_path" indicates if we need to tracking the parent
- pages, if it is not spatial comparison, then no need to
- track it */
- if (page_mode < PAGE_CUR_CONTAIN) {
- need_path = false;
- }
-
- up_match = 0;
- low_match = 0;
-
- if (latch_mode == BTR_MODIFY_TREE
- || latch_mode == BTR_CONT_MODIFY_TREE
- || latch_mode == BTR_CONT_SEARCH_TREE) {
- /* Tree are locked, no need for Page Lock to protect
- the "path" */
- cursor->rtr_info->need_page_lock = false;
- }
+ if (!height)
+ {
+ /* The root page is also a leaf page.
+ We may have to reacquire the page latch in a different mode. */
+ switch (rw_latch) {
+ case RW_S_LATCH:
+ if ((latch_mode & ~12) != RW_S_LATCH)
+ {
+ ut_ad(rw_lock_type_t(latch_mode & ~12) == RW_X_LATCH);
+ goto relatch_x;
}
+ if (latch_mode != BTR_MODIFY_PREV)
+ {
+ if (!latch_by_caller)
+ /* Release the tree s-latch */
+ mtr->rollback_to_savepoint(savepoint, savepoint + 1);
+ goto reached_latched_leaf;
+ }
+ /* fall through */
+ case RW_SX_LATCH:
+ ut_ad(rw_latch == RW_S_LATCH ||
+ latch_mode == BTR_MODIFY_ROOT_AND_LEAF);
+ relatch_x:
+ mtr->rollback_to_savepoint(block_savepoint);
+ height= ULINT_UNDEFINED;
+ rw_latch= RW_X_LATCH;
+ goto search_loop;
+ case RW_X_LATCH:
+ if (latch_mode == BTR_MODIFY_TREE)
+ goto reached_index_root_and_leaf;
+ goto reached_root_and_leaf;
+ case RW_NO_LATCH:
+ ut_ad(mtr->memo_contains_flagged(&index()->lock, MTR_MEMO_X_LOCK));
+ }
+ goto reached_leaf;
+ }
+ }
+ else if (UNIV_UNLIKELY(height != page_level))
+ goto corrupted;
+ else
+ switch (latch_mode) {
+ case BTR_MODIFY_TREE:
+ break;
+ case BTR_MODIFY_ROOT_AND_LEAF:
+ ut_ad((mtr->at_savepoint(block_savepoint - 1)->page.id().page_no() ==
+ index()->page) == (tree_height <= height + 2));
+ if (tree_height <= height + 2)
+ /* Retain the root page latch. */
+ break;
+ goto release_parent_page;
+ default:
+ if (rw_latch == RW_NO_LATCH)
+ {
+ ut_ad(!height);
+ break;
+ }
+ release_parent_page:
+ ut_ad(block_savepoint > savepoint);
+ mtr->rollback_to_savepoint(block_savepoint - 1, block_savepoint);
+ block_savepoint--;
+ }
- if (dict_index_is_spatial(index) && page_mode >= PAGE_CUR_CONTAIN) {
- ut_ad(need_path);
- found = rtr_cur_search_with_match(
- block, index, tuple, page_mode, page_cursor,
- cursor->rtr_info);
-
- /* Need to use BTR_MODIFY_TREE to do the MBR adjustment */
- if (search_mode == PAGE_CUR_RTREE_INSERT
- && cursor->rtr_info->mbr_adj) {
- if (latch_mode & BTR_MODIFY_LEAF) {
- /* Parent MBR needs updated, should retry
- with BTR_MODIFY_TREE */
- goto func_exit;
- } else if (latch_mode & BTR_MODIFY_TREE) {
- rtree_parent_modified = true;
- cursor->rtr_info->mbr_adj = false;
- mbr_adj = true;
- } else {
- ut_ad(0);
- }
- }
+ if (!height)
+ {
+ reached_leaf:
+ /* We reached the leaf level. */
+ ut_ad(block == mtr->at_savepoint(block_savepoint));
- if (found && page_mode == PAGE_CUR_RTREE_GET_FATHER) {
- cursor->low_match =
- DICT_INDEX_SPATIAL_NODEPTR_SIZE + 1;
- }
+ if (latch_mode == BTR_MODIFY_ROOT_AND_LEAF)
+ {
+ reached_root_and_leaf:
+ if (!latch_by_caller)
+ mtr->rollback_to_savepoint(savepoint, savepoint + 1);
+ reached_index_root_and_leaf:
+ ut_ad(rw_latch == RW_X_LATCH);
#ifdef BTR_CUR_HASH_ADAPT
- } else if (height == 0 && btr_search_enabled
- && !(tuple->info_bits & REC_INFO_MIN_REC_FLAG)
- && !dict_index_is_spatial(index)) {
- /* The adaptive hash index is only used when searching
- for leaf pages (height==0), but not in r-trees.
- We only need the byte prefix comparison for the purpose
- of updating the adaptive hash index. */
- page_cur_search_with_match_bytes(
- block, index, tuple, page_mode, &up_match, &up_bytes,
- &low_match, &low_bytes, page_cursor);
-#endif /* BTR_CUR_HASH_ADAPT */
- } else {
- /* Search for complete index fields. */
- up_bytes = low_bytes = 0;
- page_cur_search_with_match(
- block, index, tuple, page_mode, &up_match,
- &low_match, page_cursor,
- need_path ? cursor->rtr_info : NULL);
- }
-
- if (estimate) {
- btr_cur_add_path_info(cursor, height, root_height);
- }
-
- /* If this is the desired level, leave the loop */
-
- ut_ad(height == btr_page_get_level(page_cur_get_page(page_cursor)));
-
- /* Add Predicate lock if it is serializable isolation
- and only if it is in the search case */
- if (dict_index_is_spatial(index)
- && cursor->rtr_info->need_prdt_lock
- && mode != PAGE_CUR_RTREE_INSERT
- && mode != PAGE_CUR_RTREE_LOCATE
- && mode >= PAGE_CUR_CONTAIN) {
- trx_t* trx = thr_get_trx(cursor->thr);
- lock_prdt_t prdt;
-
- lock_mutex_enter();
- lock_init_prdt_from_mbr(
- &prdt, &cursor->rtr_info->mbr, mode,
- trx->lock.lock_heap);
- lock_mutex_exit();
-
- if (rw_latch == RW_NO_LATCH && height != 0) {
- rw_lock_s_lock(&(block->lock));
- }
-
- lock_prdt_lock(block, &prdt, index, LOCK_S,
- LOCK_PREDICATE, cursor->thr);
-
- if (rw_latch == RW_NO_LATCH && height != 0) {
- rw_lock_s_unlock(&(block->lock));
- }
- }
-
- if (level != height) {
-
- const rec_t* node_ptr;
- ut_ad(height > 0);
-
- height--;
- guess = NULL;
-
- node_ptr = page_cur_get_rec(page_cursor);
-
- offsets = rec_get_offsets(node_ptr, index, offsets, 0,
- ULINT_UNDEFINED, &heap);
-
- /* If the rec is the first or last in the page for
- pessimistic delete intention, it might cause node_ptr insert
- for the upper level. We should change the intention and retry.
- */
- if (latch_mode == BTR_MODIFY_TREE
- && btr_cur_need_opposite_intention(
- page, lock_intention, node_ptr)) {
-
-need_opposite_intention:
- ut_ad(upper_rw_latch == RW_X_LATCH);
-
- if (n_releases > 0) {
- /* release root block */
- mtr_release_block_at_savepoint(
- mtr, tree_savepoints[0],
- tree_blocks[0]);
- }
-
- /* release all blocks */
- for (; n_releases <= n_blocks; n_releases++) {
- mtr_release_block_at_savepoint(
- mtr, tree_savepoints[n_releases],
- tree_blocks[n_releases]);
- }
-
- lock_intention = BTR_INTENTION_BOTH;
-
- page_id.set_page_no(index->page);
- up_match = 0;
- low_match = 0;
- height = ULINT_UNDEFINED;
-
- n_blocks = 0;
- n_releases = 0;
-
- goto search_loop;
- }
-
- if (dict_index_is_spatial(index)) {
- if (page_rec_is_supremum(node_ptr)) {
- cursor->low_match = 0;
- cursor->up_match = 0;
- goto func_exit;
- }
-
- /* If we are doing insertion or record locating,
- remember the tree nodes we visited */
- if (page_mode == PAGE_CUR_RTREE_INSERT
- || (search_mode == PAGE_CUR_RTREE_LOCATE
- && (latch_mode != BTR_MODIFY_LEAF))) {
- bool add_latch = false;
-
- if (latch_mode == BTR_MODIFY_TREE
- && rw_latch == RW_NO_LATCH) {
- ut_ad(mtr->memo_contains_flagged(
- &index->lock, MTR_MEMO_X_LOCK
- | MTR_MEMO_SX_LOCK));
- rw_lock_s_lock(&block->lock);
- add_latch = true;
- }
-
- /* Store the parent cursor location */
-#ifdef UNIV_DEBUG
- ulint num_stored = rtr_store_parent_path(
- block, cursor, latch_mode,
- height + 1, mtr);
-#else
- rtr_store_parent_path(
- block, cursor, latch_mode,
- height + 1, mtr);
+ btr_search_drop_page_hash_index(block, true);
#endif
+ if (page_cur_search_with_match(tuple, mode, &up_match, &low_match,
+ &page_cur, nullptr))
+ goto corrupted;
+ ut_ad(up_match != ULINT_UNDEFINED || mode != PAGE_CUR_GE);
+ ut_ad(up_match != ULINT_UNDEFINED || mode != PAGE_CUR_LE);
+ ut_ad(low_match != ULINT_UNDEFINED || mode != PAGE_CUR_LE);
+ goto func_exit;
+ }
- if (page_mode == PAGE_CUR_RTREE_INSERT) {
- btr_pcur_t* r_cursor =
- rtr_get_parent_cursor(
- cursor, height + 1,
- true);
- /* If it is insertion, there should
- be only one parent for each level
- traverse */
-#ifdef UNIV_DEBUG
- ut_ad(num_stored == 1);
-#endif
-
- node_ptr = btr_pcur_get_rec(r_cursor);
-
- }
-
- if (add_latch) {
- rw_lock_s_unlock(&block->lock);
- }
-
- ut_ad(!page_rec_is_supremum(node_ptr));
- }
-
- ut_ad(page_mode == search_mode
- || (page_mode == PAGE_CUR_WITHIN
- && search_mode == PAGE_CUR_RTREE_LOCATE));
-
- page_mode = search_mode;
- }
-
- /* If the first or the last record of the page
- or the same key value to the first record or last record,
- the another page might be chosen when BTR_CONT_MODIFY_TREE.
- So, the parent page should not released to avoiding deadlock
- with blocking the another search with the same key value. */
- if (!detected_same_key_root
- && lock_intention == BTR_INTENTION_BOTH
- && !dict_index_is_unique(index)
- && latch_mode == BTR_MODIFY_TREE
- && (up_match >= rec_offs_n_fields(offsets) - 1
- || low_match >= rec_offs_n_fields(offsets) - 1)) {
- const rec_t* first_rec = page_rec_get_next_const(
- page_get_infimum_rec(page));
- ulint matched_fields;
-
- ut_ad(upper_rw_latch == RW_X_LATCH);
-
- if (node_ptr == first_rec
- || page_rec_is_last(node_ptr, page)) {
- detected_same_key_root = true;
- } else {
- matched_fields = 0;
-
- offsets2 = rec_get_offsets(
- first_rec, index, offsets2,
- 0, ULINT_UNDEFINED, &heap);
- cmp_rec_rec(node_ptr, first_rec,
- offsets, offsets2, index, false,
- &matched_fields);
-
- if (matched_fields
- >= rec_offs_n_fields(offsets) - 1) {
- detected_same_key_root = true;
- } else {
- const rec_t* last_rec;
-
- last_rec = page_rec_get_prev_const(
- page_get_supremum_rec(page));
-
- matched_fields = 0;
-
- offsets2 = rec_get_offsets(
- last_rec, index, offsets2,
- 0, ULINT_UNDEFINED, &heap);
- cmp_rec_rec(
- node_ptr, last_rec,
- offsets, offsets2, index,
- false, &matched_fields);
- if (matched_fields
- >= rec_offs_n_fields(offsets) - 1) {
- detected_same_key_root = true;
- }
- }
- }
- }
-
- /* If the page might cause modify_tree,
- we should not release the parent page's lock. */
- if (!detected_same_key_root
- && latch_mode == BTR_MODIFY_TREE
- && !btr_cur_will_modify_tree(
- index, page, lock_intention, node_ptr,
- node_ptr_max_size, zip_size, mtr)
- && !rtree_parent_modified) {
- ut_ad(upper_rw_latch == RW_X_LATCH);
- ut_ad(n_releases <= n_blocks);
-
- /* we can release upper blocks */
- for (; n_releases < n_blocks; n_releases++) {
- if (n_releases == 0) {
- /* we should not release root page
- to pin to same block. */
- continue;
- }
-
- /* release unused blocks to unpin */
- mtr_release_block_at_savepoint(
- mtr, tree_savepoints[n_releases],
- tree_blocks[n_releases]);
- }
- }
-
- if (height == level
- && latch_mode == BTR_MODIFY_TREE) {
- ut_ad(upper_rw_latch == RW_X_LATCH);
- /* we should sx-latch root page, if released already.
- It contains seg_header. */
- if (n_releases > 0) {
- mtr_block_sx_latch_at_savepoint(
- mtr, tree_savepoints[0],
- tree_blocks[0]);
- }
-
- /* x-latch the branch blocks not released yet. */
- for (ulint i = n_releases; i <= n_blocks; i++) {
- mtr_block_x_latch_at_savepoint(
- mtr, tree_savepoints[i],
- tree_blocks[i]);
- }
- }
-
- /* We should consider prev_page of parent page, if the node_ptr
- is the leftmost of the page. because BTR_SEARCH_PREV and
- BTR_MODIFY_PREV latches prev_page of the leaf page. */
- if ((latch_mode == BTR_SEARCH_PREV
- || latch_mode == BTR_MODIFY_PREV)
- && !retrying_for_search_prev) {
- /* block should be latched for consistent
- btr_page_get_prev() */
- ut_ad(mtr->memo_contains_flagged(
- block, MTR_MEMO_PAGE_S_FIX
- | MTR_MEMO_PAGE_X_FIX));
-
- if (page_has_prev(page)
- && page_rec_is_first(node_ptr, page)) {
-
- if (leftmost_from_level == 0) {
- leftmost_from_level = height + 1;
- }
- } else {
- leftmost_from_level = 0;
- }
-
- if (height == 0 && leftmost_from_level > 0) {
- /* should retry to get also prev_page
- from level==leftmost_from_level. */
- retrying_for_search_prev = true;
-
- prev_tree_blocks = static_cast<buf_block_t**>(
- ut_malloc_nokey(sizeof(buf_block_t*)
- * leftmost_from_level));
-
- prev_tree_savepoints = static_cast<ulint*>(
- ut_malloc_nokey(sizeof(ulint)
- * leftmost_from_level));
-
- /* back to the level (leftmost_from_level+1) */
- ulint idx = n_blocks
- - (leftmost_from_level - 1);
-
- page_id.set_page_no(
- tree_blocks[idx]->page.id().page_no());
-
- for (ulint i = n_blocks
- - (leftmost_from_level - 1);
- i <= n_blocks; i++) {
- mtr_release_block_at_savepoint(
- mtr, tree_savepoints[i],
- tree_blocks[i]);
- }
-
- n_blocks -= (leftmost_from_level - 1);
- height = leftmost_from_level;
- ut_ad(n_releases == 0);
-
- /* replay up_match, low_match */
- up_match = 0;
- low_match = 0;
- rtr_info_t* rtr_info = need_path
- ? cursor->rtr_info : NULL;
-
- for (ulint i = 0; i < n_blocks; i++) {
- page_cur_search_with_match(
- tree_blocks[i], index, tuple,
- page_mode, &up_match,
- &low_match, page_cursor,
- rtr_info);
- }
-
- goto search_loop;
- }
- }
-
- /* Go to the child node */
- page_id.set_page_no(
- btr_node_ptr_get_child_page_no(node_ptr, offsets));
-
- n_blocks++;
-
- if (UNIV_UNLIKELY(height == 0 && dict_index_is_ibuf(index))) {
- /* We're doing a search on an ibuf tree and we're one
- level above the leaf page. */
-
- ut_ad(level == 0);
-
- buf_mode = BUF_GET;
- rw_latch = RW_NO_LATCH;
- goto retry_page_get;
- }
-
- if (dict_index_is_spatial(index)
- && page_mode >= PAGE_CUR_CONTAIN
- && page_mode != PAGE_CUR_RTREE_INSERT) {
- ut_ad(need_path);
- rtr_node_path_t* path =
- cursor->rtr_info->path;
-
- if (!path->empty() && found) {
- ut_ad(path->back().page_no
- == page_id.page_no());
- path->pop_back();
-#ifdef UNIV_DEBUG
- if (page_mode == PAGE_CUR_RTREE_LOCATE
- && (latch_mode != BTR_MODIFY_LEAF)) {
- btr_pcur_t* cur
- = cursor->rtr_info->parent_path->back(
- ).cursor;
- rec_t* my_node_ptr
- = btr_pcur_get_rec(cur);
-
- offsets = rec_get_offsets(
- my_node_ptr, index, offsets,
- 0, ULINT_UNDEFINED, &heap);
-
- ulint my_page_no
- = btr_node_ptr_get_child_page_no(
- my_node_ptr, offsets);
-
- ut_ad(page_id.page_no() == my_page_no);
- }
-#endif
- }
- }
-
- goto search_loop;
- } else if (!dict_index_is_spatial(index)
- && latch_mode == BTR_MODIFY_TREE
- && lock_intention == BTR_INTENTION_INSERT
- && page_has_next(page)
- && page_rec_is_last(page_cur_get_rec(page_cursor), page)) {
-
- /* btr_insert_into_right_sibling() might cause
- deleting node_ptr at upper level */
-
- guess = NULL;
-
- if (height == 0) {
- /* release the leaf pages if latched */
- for (uint i = 0; i < 3; i++) {
- if (latch_leaves.blocks[i] != NULL) {
- mtr_release_block_at_savepoint(
- mtr, latch_leaves.savepoints[i],
- latch_leaves.blocks[i]);
- latch_leaves.blocks[i] = NULL;
- }
- }
- }
-
- goto need_opposite_intention;
- }
-
- if (level != 0) {
- ut_ad(!autoinc);
-
- if (upper_rw_latch == RW_NO_LATCH) {
- ut_ad(latch_mode == BTR_CONT_MODIFY_TREE
- || latch_mode == BTR_CONT_SEARCH_TREE);
- buf_block_t* child_block = btr_block_get(
- *index, page_id.page_no(),
- latch_mode == BTR_CONT_MODIFY_TREE
- ? RW_X_LATCH : RW_SX_LATCH, false, mtr);
- btr_assert_not_corrupted(child_block, index);
- } else {
- ut_ad(mtr->memo_contains_flagged(block,
- upper_rw_latch));
- btr_assert_not_corrupted(block, index);
-
- if (s_latch_by_caller) {
- ut_ad(latch_mode == BTR_SEARCH_TREE);
- /* to exclude modifying tree operations
- should sx-latch the index. */
- ut_ad(mtr->memo_contains(index->lock,
- MTR_MEMO_SX_LOCK));
- /* because has sx-latch of index,
- can release upper blocks. */
- for (; n_releases < n_blocks; n_releases++) {
- mtr_release_block_at_savepoint(
- mtr,
- tree_savepoints[n_releases],
- tree_blocks[n_releases]);
- }
- }
- }
-
- if (page_mode <= PAGE_CUR_LE) {
- cursor->low_match = low_match;
- cursor->up_match = up_match;
- }
- } else {
- cursor->low_match = low_match;
- cursor->low_bytes = low_bytes;
- cursor->up_match = up_match;
- cursor->up_bytes = up_bytes;
-
- if (autoinc) {
- page_set_autoinc(tree_blocks[0], autoinc, mtr, false);
- }
+ switch (latch_mode) {
+ case BTR_SEARCH_PREV:
+ case BTR_MODIFY_PREV:
+ static_assert(BTR_MODIFY_PREV & BTR_MODIFY_LEAF, "");
+ static_assert(BTR_SEARCH_PREV & BTR_SEARCH_LEAF, "");
+ ut_ad(!latch_by_caller);
+
+ if (rw_latch == RW_NO_LATCH)
+ {
+ /* latch also siblings from left to right */
+ rw_latch= rw_lock_type_t(latch_mode & (RW_X_LATCH | RW_S_LATCH));
+ if (page_has_prev(block->page.frame) &&
+ !btr_block_get(*index(), btr_page_get_prev(block->page.frame),
+ rw_latch, false, mtr, &err))
+ goto func_exit;
+ mtr->upgrade_buffer_fix(block_savepoint, rw_latch);
+ if (page_has_next(block->page.frame) &&
+ !btr_block_get(*index(), btr_page_get_next(block->page.frame),
+ rw_latch, false, mtr, &err))
+ goto func_exit;
+ }
+ goto release_tree;
+ case BTR_SEARCH_LEAF:
+ case BTR_MODIFY_LEAF:
+ if (rw_latch == RW_NO_LATCH)
+ {
+ ut_ad(index()->is_ibuf());
+ mtr->upgrade_buffer_fix(block_savepoint, rw_lock_type_t(latch_mode));
+ }
+ if (!latch_by_caller)
+ {
+release_tree:
+ /* Release the tree s-latch */
+ block_savepoint--;
+ mtr->rollback_to_savepoint(savepoint, savepoint + 1);
+ }
+ /* release upper blocks */
+ if (savepoint < block_savepoint)
+ mtr->rollback_to_savepoint(savepoint, block_savepoint);
+ break;
+ default:
+ ut_ad(latch_mode == BTR_MODIFY_TREE);
+ ut_ad(rw_latch == RW_NO_LATCH);
+ /* x-latch also siblings from left to right */
+ if (page_has_prev(block->page.frame) &&
+ !btr_block_get(*index(), btr_page_get_prev(block->page.frame),
+ RW_X_LATCH, false, mtr, &err))
+ goto func_exit;
+ mtr->upgrade_buffer_fix(block_savepoint, RW_X_LATCH);
+ if (page_has_next(block->page.frame) &&
+ !btr_block_get(*index(), btr_page_get_next(block->page.frame),
+ RW_X_LATCH, false, mtr, &err))
+ goto func_exit;
+ if (btr_cur_need_opposite_intention(block->page.frame, lock_intention,
+ node_ptr_max_size, compress_limit,
+ page_cur.rec))
+ goto need_opposite_intention;
+ }
+ reached_latched_leaf:
#ifdef BTR_CUR_HASH_ADAPT
- /* We do a dirty read of btr_search_enabled here. We
- will properly check btr_search_enabled again in
- btr_search_build_page_hash_index() before building a
- page hash index, while holding search latch. */
- if (!btr_search_enabled) {
-# ifdef MYSQL_INDEX_DISABLE_AHI
- } else if (index->disable_ahi) {
-# endif
- } else if (tuple->info_bits & REC_INFO_MIN_REC_FLAG) {
- ut_ad(index->is_instant());
- /* This may be a search tuple for
- btr_pcur_restore_position(). */
- ut_ad(tuple->is_metadata()
- || (tuple->is_metadata(tuple->info_bits
- ^ REC_STATUS_INSTANT)));
- } else if (rec_is_metadata(btr_cur_get_rec(cursor), *index)) {
- /* Only user records belong in the adaptive
- hash index. */
- } else {
- btr_search_info_update(index, cursor);
- }
+ if (btr_search_enabled && !(tuple->info_bits & REC_INFO_MIN_REC_FLAG))
+ {
+ if (page_cur_search_with_match_bytes(tuple, mode,
+ &up_match, &up_bytes,
+ &low_match, &low_bytes, &page_cur))
+ goto corrupted;
+ }
+ else
#endif /* BTR_CUR_HASH_ADAPT */
- ut_ad(cursor->up_match != ULINT_UNDEFINED
- || mode != PAGE_CUR_GE);
- ut_ad(cursor->up_match != ULINT_UNDEFINED
- || mode != PAGE_CUR_LE);
- ut_ad(cursor->low_match != ULINT_UNDEFINED
- || mode != PAGE_CUR_LE);
- }
-
- /* For spatial index, remember what blocks are still latched */
- if (dict_index_is_spatial(index)
- && (latch_mode == BTR_MODIFY_TREE
- || latch_mode == BTR_MODIFY_LEAF)) {
- for (ulint i = 0; i < n_releases; i++) {
- cursor->rtr_info->tree_blocks[i] = NULL;
- cursor->rtr_info->tree_savepoints[i] = 0;
- }
-
- for (ulint i = n_releases; i <= n_blocks; i++) {
- cursor->rtr_info->tree_blocks[i] = tree_blocks[i];
- cursor->rtr_info->tree_savepoints[i] = tree_savepoints[i];
- }
- }
-
-func_exit:
-
- if (UNIV_LIKELY_NULL(heap)) {
- mem_heap_free(heap);
- }
-
- if (retrying_for_search_prev) {
- ut_free(prev_tree_blocks);
- ut_free(prev_tree_savepoints);
- }
-
- if (mbr_adj) {
- /* remember that we will need to adjust parent MBR */
- cursor->rtr_info->mbr_adj = true;
- }
-
- DBUG_RETURN(err);
-}
-
-/*****************************************************************//**
-Opens a cursor at either end of an index. */
-dberr_t
-btr_cur_open_at_index_side_func(
-/*============================*/
- bool from_left, /*!< in: true if open to the low end,
- false if to the high end */
- dict_index_t* index, /*!< in: index */
- ulint latch_mode, /*!< in: latch mode */
- btr_cur_t* cursor, /*!< in/out: cursor */
- ulint level, /*!< in: level to search for
- (0=leaf). */
- const char* file, /*!< in: file name */
- unsigned line, /*!< in: line where called */
- mtr_t* mtr) /*!< in/out: mini-transaction */
-{
- page_cur_t* page_cursor;
- ulint node_ptr_max_size = srv_page_size / 2;
- ulint height;
- ulint root_height = 0; /* remove warning */
- rec_t* node_ptr;
- ulint estimate;
- btr_intention_t lock_intention;
- buf_block_t* tree_blocks[BTR_MAX_LEVELS];
- ulint tree_savepoints[BTR_MAX_LEVELS];
- ulint n_blocks = 0;
- ulint n_releases = 0;
- mem_heap_t* heap = NULL;
- rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
- rec_offs* offsets = offsets_;
- dberr_t err = DB_SUCCESS;
-
- rec_offs_init(offsets_);
-
- estimate = latch_mode & BTR_ESTIMATE;
- latch_mode &= ulint(~BTR_ESTIMATE);
-
- ut_ad(level != ULINT_UNDEFINED);
-
- bool s_latch_by_caller;
-
- s_latch_by_caller = latch_mode & BTR_ALREADY_S_LATCHED;
- latch_mode &= ulint(~BTR_ALREADY_S_LATCHED);
-
- lock_intention = btr_cur_get_and_clear_intention(&latch_mode);
-
- ut_ad(!(latch_mode & BTR_MODIFY_EXTERNAL));
-
- /* This function doesn't need to lock left page of the leaf page */
- if (latch_mode == BTR_SEARCH_PREV) {
- latch_mode = BTR_SEARCH_LEAF;
- } else if (latch_mode == BTR_MODIFY_PREV) {
- latch_mode = BTR_MODIFY_LEAF;
- }
-
- /* Store the position of the tree latch we push to mtr so that we
- know how to release it when we have latched the leaf node */
-
- ulint savepoint = mtr_set_savepoint(mtr);
-
- rw_lock_type_t upper_rw_latch;
-
- switch (latch_mode) {
- case BTR_CONT_MODIFY_TREE:
- case BTR_CONT_SEARCH_TREE:
- upper_rw_latch = RW_NO_LATCH;
- break;
- case BTR_MODIFY_TREE:
- /* Most of delete-intended operations are purging.
- Free blocks and read IO bandwidth should be prior
- for them, when the history list is glowing huge. */
- if (lock_intention == BTR_INTENTION_DELETE
- && trx_sys.rseg_history_len > BTR_CUR_FINE_HISTORY_LENGTH
- && buf_pool.n_pend_reads) {
- mtr_x_lock_index(index, mtr);
- } else {
- mtr_sx_lock_index(index, mtr);
- }
- upper_rw_latch = RW_X_LATCH;
- break;
- default:
- ut_ad(!s_latch_by_caller
- || mtr->memo_contains_flagged(&index->lock,
- MTR_MEMO_SX_LOCK
- | MTR_MEMO_S_LOCK));
- if (!srv_read_only_mode) {
- if (!s_latch_by_caller) {
- /* BTR_SEARCH_TREE is intended to be used with
- BTR_ALREADY_S_LATCHED */
- ut_ad(latch_mode != BTR_SEARCH_TREE);
-
- mtr_s_lock_index(index, mtr);
- }
- upper_rw_latch = RW_S_LATCH;
- } else {
- upper_rw_latch = RW_NO_LATCH;
- }
- }
-
- const rw_lock_type_t root_leaf_rw_latch = btr_cur_latch_for_root_leaf(
- latch_mode);
-
- page_cursor = btr_cur_get_page_cur(cursor);
- cursor->index = index;
+ if (page_cur_search_with_match(tuple, mode, &up_match, &low_match,
+ &page_cur, nullptr))
+ goto corrupted;
- page_id_t page_id(index->table->space_id, index->page);
- const ulint zip_size = index->table->space->zip_size();
+ ut_ad(up_match != ULINT_UNDEFINED || mode != PAGE_CUR_GE);
+ ut_ad(up_match != ULINT_UNDEFINED || mode != PAGE_CUR_LE);
+ ut_ad(low_match != ULINT_UNDEFINED || mode != PAGE_CUR_LE);
- if (root_leaf_rw_latch == RW_X_LATCH) {
- node_ptr_max_size = btr_node_ptr_max_size(index);
- }
-
- height = ULINT_UNDEFINED;
-
- for (;;) {
- ut_ad(n_blocks < BTR_MAX_LEVELS);
- tree_savepoints[n_blocks] = mtr_set_savepoint(mtr);
-
- const ulint rw_latch = height
- && (latch_mode != BTR_MODIFY_TREE || height == level)
- ? upper_rw_latch : RW_NO_LATCH;
- buf_block_t* block = buf_page_get_gen(page_id, zip_size,
- rw_latch, NULL, BUF_GET,
- file, line, mtr, &err,
- height == 0
- && !index->is_clust());
- ut_ad((block != NULL) == (err == DB_SUCCESS));
- tree_blocks[n_blocks] = block;
-
- if (err != DB_SUCCESS) {
- if (err == DB_DECRYPTION_FAILED) {
- ib_push_warning((void *)NULL,
- DB_DECRYPTION_FAILED,
- "Table %s is encrypted but encryption service or"
- " used key_id is not available. "
- " Can't continue reading table.",
- index->table->name.m_name);
- index->table->file_unreadable = true;
- }
-
- goto exit_loop;
- }
-
- const page_t* page = buf_block_get_frame(block);
-
- if (height == ULINT_UNDEFINED
- && page_is_leaf(page)
- && rw_latch != RW_NO_LATCH
- && rw_latch != root_leaf_rw_latch) {
- /* We should retry to get the page, because the root page
- is latched with different level as a leaf page. */
- ut_ad(root_leaf_rw_latch != RW_NO_LATCH);
- ut_ad(rw_latch == RW_S_LATCH);
-
- ut_ad(n_blocks == 0);
- mtr_release_block_at_savepoint(
- mtr, tree_savepoints[n_blocks],
- tree_blocks[n_blocks]);
-
- upper_rw_latch = root_leaf_rw_latch;
- continue;
- }
-
- ut_ad(fil_page_index_page_check(page));
- ut_ad(index->id == btr_page_get_index_id(page));
-
- if (height == ULINT_UNDEFINED) {
- /* We are in the root node */
-
- height = btr_page_get_level(page);
- root_height = height;
- ut_a(height >= level);
- } else {
- /* TODO: flag the index corrupted if this fails */
- ut_ad(height == btr_page_get_level(page));
- }
-
- if (height == 0) {
- if (rw_latch == RW_NO_LATCH) {
- btr_cur_latch_leaves(block, latch_mode,
- cursor, mtr);
- }
-
- /* In versions <= 3.23.52 we had forgotten to
- release the tree latch here. If in an index
- scan we had to scan far to find a record
- visible to the current transaction, that could
- starve others waiting for the tree latch. */
-
- switch (latch_mode) {
- case BTR_MODIFY_TREE:
- case BTR_CONT_MODIFY_TREE:
- case BTR_CONT_SEARCH_TREE:
- break;
- default:
- if (UNIV_UNLIKELY(srv_read_only_mode)) {
- break;
- }
- if (!s_latch_by_caller) {
- /* Release the tree s-latch */
- mtr_release_s_latch_at_savepoint(
- mtr, savepoint, &index->lock);
- }
-
- /* release upper blocks */
- for (; n_releases < n_blocks; n_releases++) {
- mtr_release_block_at_savepoint(
- mtr,
- tree_savepoints[n_releases],
- tree_blocks[n_releases]);
- }
- }
- } else if (height == level /* height != 0 */
- && UNIV_LIKELY(!srv_read_only_mode)) {
- /* We already have the block latched. */
- ut_ad(latch_mode == BTR_SEARCH_TREE);
- ut_ad(s_latch_by_caller);
- ut_ad(upper_rw_latch == RW_S_LATCH);
- ut_ad(mtr->memo_contains_flagged(block,
- MTR_MEMO_PAGE_S_FIX));
-
- if (s_latch_by_caller) {
- /* to exclude modifying tree operations
- should sx-latch the index. */
- ut_ad(mtr->memo_contains(index->lock,
- MTR_MEMO_SX_LOCK));
- /* because has sx-latch of index,
- can release upper blocks. */
- for (; n_releases < n_blocks; n_releases++) {
- mtr_release_block_at_savepoint(
- mtr,
- tree_savepoints[n_releases],
- tree_blocks[n_releases]);
- }
- }
- }
-
- if (from_left) {
- page_cur_set_before_first(block, page_cursor);
- } else {
- page_cur_set_after_last(block, page_cursor);
- }
-
- if (height == level) {
- if (estimate) {
- btr_cur_add_path_info(cursor, height,
- root_height);
- }
-
- break;
- }
-
- ut_ad(height > 0);
-
- if (from_left) {
- page_cur_move_to_next(page_cursor);
- } else {
- page_cur_move_to_prev(page_cursor);
- }
-
- if (estimate) {
- btr_cur_add_path_info(cursor, height, root_height);
- }
-
- height--;
-
- node_ptr = page_cur_get_rec(page_cursor);
- offsets = rec_get_offsets(node_ptr, cursor->index, offsets,
- 0, ULINT_UNDEFINED, &heap);
-
- /* If the rec is the first or last in the page for
- pessimistic delete intention, it might cause node_ptr insert
- for the upper level. We should change the intention and retry.
- */
- if (latch_mode == BTR_MODIFY_TREE
- && btr_cur_need_opposite_intention(
- page, lock_intention, node_ptr)) {
-
- ut_ad(upper_rw_latch == RW_X_LATCH);
- /* release all blocks */
- for (; n_releases <= n_blocks; n_releases++) {
- mtr_release_block_at_savepoint(
- mtr, tree_savepoints[n_releases],
- tree_blocks[n_releases]);
- }
-
- lock_intention = BTR_INTENTION_BOTH;
-
- page_id.set_page_no(dict_index_get_page(index));
-
- height = ULINT_UNDEFINED;
+#ifdef BTR_CUR_HASH_ADAPT
+ /* We do a dirty read of btr_search_enabled here. We will
+ properly check btr_search_enabled again in
+ btr_search_build_page_hash_index() before building a page hash
+ index, while holding search latch. */
+ if (!btr_search_enabled);
+ else if (tuple->info_bits & REC_INFO_MIN_REC_FLAG)
+ /* This may be a search tuple for btr_pcur_t::restore_position(). */
+ ut_ad(tuple->is_metadata() ||
+ (tuple->is_metadata(tuple->info_bits ^ REC_STATUS_INSTANT)));
+ else if (index()->table->is_temporary());
+ else if (!rec_is_metadata(page_cur.rec, *index()))
+ btr_search_info_update(index(), this);
+#endif /* BTR_CUR_HASH_ADAPT */
- n_blocks = 0;
- n_releases = 0;
+ goto func_exit;
+ }
- continue;
- }
+ guess= nullptr;
+ if (page_cur_search_with_match(tuple, page_mode, &up_match, &low_match,
+ &page_cur, nullptr))
+ goto corrupted;
+ offsets= rec_get_offsets(page_cur.rec, index(), offsets, 0, ULINT_UNDEFINED,
+ &heap);
- if (latch_mode == BTR_MODIFY_TREE
- && !btr_cur_will_modify_tree(
- cursor->index, page, lock_intention, node_ptr,
- node_ptr_max_size, zip_size, mtr)) {
- ut_ad(upper_rw_latch == RW_X_LATCH);
- ut_ad(n_releases <= n_blocks);
-
- /* we can release upper blocks */
- for (; n_releases < n_blocks; n_releases++) {
- if (n_releases == 0) {
- /* we should not release root page
- to pin to same block. */
- continue;
- }
+ ut_ad(block == mtr->at_savepoint(block_savepoint));
- /* release unused blocks to unpin */
- mtr_release_block_at_savepoint(
- mtr, tree_savepoints[n_releases],
- tree_blocks[n_releases]);
- }
- }
+ switch (latch_mode) {
+ default:
+ break;
+ case BTR_MODIFY_TREE:
+ if (btr_cur_need_opposite_intention(block->page.frame, lock_intention,
+ node_ptr_max_size, compress_limit,
+ page_cur.rec))
+ /* If the rec is the first or last in the page for pessimistic
+ delete intention, it might cause node_ptr insert for the upper
+ level. We should change the intention and retry. */
+ need_opposite_intention:
+ return pessimistic_search_leaf(tuple, mode, mtr);
+
+ if (detected_same_key_root || lock_intention != BTR_INTENTION_BOTH ||
+ index()->is_unique() ||
+ (up_match <= rec_offs_n_fields(offsets) &&
+ low_match <= rec_offs_n_fields(offsets)))
+ break;
+
+ /* If the cursor is positioned on the first or the last record of the
+ page, or on a record with the same key value as the first or the last
+ record, then another page might be chosen under BTR_CONT_MODIFY_TREE.
+ So, the parent page should not be released, to avoid a deadlock by
+ blocking another search with the same key value. */
+ const rec_t *first=
+ page_rec_get_next_const(page_get_infimum_rec(block->page.frame));
+ ulint matched_fields;
+
+ if (UNIV_UNLIKELY(!first))
+ goto corrupted;
+ if (page_cur.rec == first ||
+ page_rec_is_last(page_cur.rec, block->page.frame))
+ {
+ same_key_root:
+ detected_same_key_root= true;
+ break;
+ }
- if (height == level
- && latch_mode == BTR_MODIFY_TREE) {
- ut_ad(upper_rw_latch == RW_X_LATCH);
- /* we should sx-latch root page, if released already.
- It contains seg_header. */
- if (n_releases > 0) {
- mtr_block_sx_latch_at_savepoint(
- mtr, tree_savepoints[0],
- tree_blocks[0]);
- }
+ matched_fields= 0;
+ offsets2= rec_get_offsets(first, index(), offsets2, 0, ULINT_UNDEFINED,
+ &heap);
+ cmp_rec_rec(page_cur.rec, first, offsets, offsets2, index(), false,
+ &matched_fields);
+ if (matched_fields >= rec_offs_n_fields(offsets) - 1)
+ goto same_key_root;
+ if (const rec_t* last=
+ page_rec_get_prev_const(page_get_supremum_rec(block->page.frame)))
+ {
+ matched_fields= 0;
+ offsets2= rec_get_offsets(last, index(), offsets2, 0, ULINT_UNDEFINED,
+ &heap);
+ cmp_rec_rec(page_cur.rec, last, offsets, offsets2, index(), false,
+ &matched_fields);
+ if (matched_fields >= rec_offs_n_fields(offsets) - 1)
+ goto same_key_root;
+ }
+ else
+ goto corrupted;
- /* x-latch the branch blocks not released yet. */
- for (ulint i = n_releases; i <= n_blocks; i++) {
- mtr_block_x_latch_at_savepoint(
- mtr, tree_savepoints[i],
- tree_blocks[i]);
- }
- }
+ /* Release the non-root parent page unless it may need to be modified. */
+ if (tree_height > height + 1 &&
+ !btr_cur_will_modify_tree(index(), block->page.frame, lock_intention,
+ page_cur.rec, node_ptr_max_size,
+ zip_size, mtr))
+ {
+ mtr->rollback_to_savepoint(block_savepoint - 1, block_savepoint);
+ block_savepoint--;
+ }
+ }
- /* Go to the child node */
- page_id.set_page_no(
- btr_node_ptr_get_child_page_no(node_ptr, offsets));
+ /* Go to the child node */
+ page_id.set_page_no(btr_node_ptr_get_child_page_no(page_cur.rec, offsets));
- n_blocks++;
- }
+ if (!--height)
+ {
+ /* We are about to access the leaf level. */
+
+ switch (latch_mode) {
+ case BTR_MODIFY_ROOT_AND_LEAF:
+ rw_latch= RW_X_LATCH;
+ break;
+ case BTR_MODIFY_PREV: /* ibuf_insert() or btr_pcur_move_to_prev() */
+ case BTR_SEARCH_PREV: /* btr_pcur_move_to_prev() */
+ ut_ad(rw_latch == RW_S_LATCH || rw_latch == RW_X_LATCH);
+
+ if (page_has_prev(block->page.frame) &&
+ page_rec_is_first(page_cur.rec, block->page.frame))
+ {
+ ut_ad(block_savepoint + 1 == mtr->get_savepoint());
+ /* Latch the previous page if the node pointer is the leftmost
+ of the current page. */
+ buf_block_t *left= btr_block_get(*index(),
+ btr_page_get_prev(block->page.frame),
+ RW_NO_LATCH, false, mtr, &err);
+ if (UNIV_UNLIKELY(!left))
+ goto func_exit;
+ ut_ad(block_savepoint + 2 == mtr->get_savepoint());
+ if (UNIV_LIKELY(left->page.lock.s_lock_try()))
+ mtr->lock_register(block_savepoint + 1, MTR_MEMO_PAGE_S_FIX);
+ else
+ {
+ if (rw_latch == RW_S_LATCH)
+ block->page.lock.s_unlock();
+ else
+ block->page.lock.x_unlock();
+ mtr->upgrade_buffer_fix(block_savepoint + 1, RW_S_LATCH);
+ mtr->lock_register(block_savepoint, MTR_MEMO_BUF_FIX);
+ mtr->upgrade_buffer_fix(block_savepoint, RW_S_LATCH);
+ /* While our latch on the level-2 page prevents splits or
+ merges of this level-1 block, other threads may have
+ modified it due to splitting or merging some level-0 (leaf)
+ pages underneath it. Thus, we must search again. */
+ if (page_cur_search_with_match(tuple, page_mode,
+ &up_match, &low_match,
+ &page_cur, nullptr))
+ goto corrupted;
+ offsets= rec_get_offsets(page_cur.rec, index(), offsets, 0,
+ ULINT_UNDEFINED, &heap);
+ page_id.set_page_no(btr_node_ptr_get_child_page_no(page_cur.rec,
+ offsets));
+ }
+ }
+ goto leaf_with_no_latch;
+ case BTR_MODIFY_LEAF:
+ case BTR_SEARCH_LEAF:
+ if (index()->is_ibuf())
+ goto leaf_with_no_latch;
+ rw_latch= rw_lock_type_t(latch_mode);
+ if (btr_op != BTR_NO_OP &&
+ ibuf_should_try(index(), btr_op != BTR_INSERT_OP))
+ /* Try to buffer the operation if the leaf page
+ is not in the buffer pool. */
+ buf_mode= btr_op == BTR_DELETE_OP
+ ? BUF_GET_IF_IN_POOL_OR_WATCH
+ : BUF_GET_IF_IN_POOL;
+ break;
+ case BTR_MODIFY_TREE:
+ ut_ad(rw_latch == RW_X_LATCH);
+
+ if (lock_intention == BTR_INTENTION_INSERT &&
+ page_has_next(block->page.frame) &&
+ page_rec_is_last(page_cur.rec, block->page.frame))
+ {
+ /* btr_insert_into_right_sibling() might cause deleting node_ptr
+ at upper level */
+ mtr->rollback_to_savepoint(block_savepoint);
+ goto need_opposite_intention;
+ }
+ /* fall through */
+ default:
+ leaf_with_no_latch:
+ rw_latch= RW_NO_LATCH;
+ }
+ }
- exit_loop:
- if (heap) {
- mem_heap_free(heap);
- }
+ goto search_loop;
+}
- return err;
+ATTRIBUTE_COLD void mtr_t::index_lock_upgrade()
+{
+ auto &slot= m_memo[get_savepoint() - 1];
+ if (slot.type == MTR_MEMO_X_LOCK)
+ return;
+ ut_ad(slot.type == MTR_MEMO_SX_LOCK);
+ index_lock *lock= static_cast<index_lock*>(slot.object);
+ lock->u_x_upgrade(SRW_LOCK_CALL);
+ slot.type= MTR_MEMO_X_LOCK;
}
-/**********************************************************************//**
-Positions a cursor at a randomly chosen position within a B-tree.
-@return true if the index is available and we have put the cursor, false
-if the index is unavailable */
-bool
-btr_cur_open_at_rnd_pos_func(
-/*=========================*/
- dict_index_t* index, /*!< in: index */
- ulint latch_mode, /*!< in: BTR_SEARCH_LEAF, ... */
- btr_cur_t* cursor, /*!< in/out: B-tree cursor */
- const char* file, /*!< in: file name */
- unsigned line, /*!< in: line where called */
- mtr_t* mtr) /*!< in: mtr */
+ATTRIBUTE_COLD
+dberr_t btr_cur_t::pessimistic_search_leaf(const dtuple_t *tuple,
+ page_cur_mode_t mode, mtr_t *mtr)
{
- page_cur_t* page_cursor;
- ulint node_ptr_max_size = srv_page_size / 2;
- ulint height;
- rec_t* node_ptr;
- btr_intention_t lock_intention;
- buf_block_t* tree_blocks[BTR_MAX_LEVELS];
- ulint tree_savepoints[BTR_MAX_LEVELS];
- ulint n_blocks = 0;
- ulint n_releases = 0;
- mem_heap_t* heap = NULL;
- rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
- rec_offs* offsets = offsets_;
- rec_offs_init(offsets_);
+ ut_ad(index()->is_btree() || index()->is_ibuf());
+ ut_ad(!index()->is_ibuf() || ibuf_inside(mtr));
+
+ rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
+ rec_offs* offsets = offsets_;
+ rec_offs_init(offsets_);
+
+ ut_ad(flag == BTR_CUR_BINARY);
+ ut_ad(dict_index_check_search_tuple(index(), tuple));
+ ut_ad(dtuple_check_typed(tuple));
+ buf_block_t *block= mtr->at_savepoint(1);
+ ut_ad(block->page.id().page_no() == index()->page);
+ block->page.fix();
+ mtr->rollback_to_savepoint(1);
+ mtr->index_lock_upgrade();
+
+ const page_cur_mode_t page_mode{btr_cur_nonleaf_mode(mode)};
+
+ mtr->page_lock(block, RW_X_LATCH);
+
+ up_match= 0;
+ up_bytes= 0;
+ low_match= 0;
+ low_bytes= 0;
+ ulint height= btr_page_get_level(block->page.frame);
+ tree_height= height + 1;
+ mem_heap_t *heap= nullptr;
+
+ search_loop:
+ dberr_t err;
+ page_cur.block= block;
+
+ if (UNIV_UNLIKELY(!height))
+ {
+ if (page_cur_search_with_match(tuple, mode, &up_match, &low_match,
+ &page_cur, nullptr))
+ corrupted:
+ err= DB_CORRUPTION;
+ else
+ {
+ ut_ad(up_match != ULINT_UNDEFINED || mode != PAGE_CUR_GE);
+ ut_ad(up_match != ULINT_UNDEFINED || mode != PAGE_CUR_LE);
+ ut_ad(low_match != ULINT_UNDEFINED || mode != PAGE_CUR_LE);
- ut_ad(!index->is_spatial());
+#ifdef BTR_CUR_HASH_ADAPT
+ /* We do a dirty read of btr_search_enabled here. We will
+ properly check btr_search_enabled again in
+ btr_search_build_page_hash_index() before building a page hash
+ index, while holding search latch. */
+ if (!btr_search_enabled);
+ else if (tuple->info_bits & REC_INFO_MIN_REC_FLAG)
+ /* This may be a search tuple for btr_pcur_t::restore_position(). */
+ ut_ad(tuple->is_metadata() ||
+ (tuple->is_metadata(tuple->info_bits ^ REC_STATUS_INSTANT)));
+ else if (index()->table->is_temporary());
+ else if (!rec_is_metadata(page_cur.rec, *index()))
+ btr_search_info_update(index(), this);
+#endif /* BTR_CUR_HASH_ADAPT */
+ err= DB_SUCCESS;
+ }
- lock_intention = btr_cur_get_and_clear_intention(&latch_mode);
+ func_exit:
+ if (UNIV_LIKELY_NULL(heap))
+ mem_heap_free(heap);
+ return err;
+ }
- ut_ad(!(latch_mode & BTR_MODIFY_EXTERNAL));
+ if (page_cur_search_with_match(tuple, page_mode, &up_match, &low_match,
+ &page_cur, nullptr))
+ goto corrupted;
- ulint savepoint = mtr_set_savepoint(mtr);
+ page_id_t page_id{block->page.id()};
- rw_lock_type_t upper_rw_latch;
+ offsets= rec_get_offsets(page_cur.rec, index(), offsets, 0, ULINT_UNDEFINED,
+ &heap);
+ /* Go to the child node */
+ page_id.set_page_no(btr_node_ptr_get_child_page_no(page_cur.rec, offsets));
- switch (latch_mode) {
- case BTR_MODIFY_TREE:
- /* Most of delete-intended operations are purging.
- Free blocks and read IO bandwidth should be prior
- for them, when the history list is glowing huge. */
- if (lock_intention == BTR_INTENTION_DELETE
- && trx_sys.rseg_history_len > BTR_CUR_FINE_HISTORY_LENGTH
- && buf_pool.n_pend_reads) {
- mtr_x_lock_index(index, mtr);
- } else {
- mtr_sx_lock_index(index, mtr);
- }
- upper_rw_latch = RW_X_LATCH;
- break;
- case BTR_SEARCH_PREV:
- case BTR_MODIFY_PREV:
- /* This function doesn't support left uncle
- page lock for left leaf page lock, when
- needed. */
- case BTR_SEARCH_TREE:
- case BTR_CONT_MODIFY_TREE:
- case BTR_CONT_SEARCH_TREE:
- ut_ad(0);
- /* fall through */
- default:
- if (!srv_read_only_mode) {
- mtr_s_lock_index(index, mtr);
- upper_rw_latch = RW_S_LATCH;
- } else {
- upper_rw_latch = RW_NO_LATCH;
- }
- }
+ const auto block_savepoint= mtr->get_savepoint();
+ block=
+ buf_page_get_gen(page_id, block->zip_size(), RW_NO_LATCH, nullptr, BUF_GET,
+ mtr, &err, !--height && !index()->is_clust());
- DBUG_EXECUTE_IF("test_index_is_unavailable",
- return(false););
+ if (!block)
+ {
+ if (err == DB_DECRYPTION_FAILED)
+ btr_decryption_failed(*index());
+ goto func_exit;
+ }
- if (index->page == FIL_NULL) {
- /* Since we don't hold index lock until just now, the index
- could be modified by others, for example, if this is a
- statistics updater for referenced table, it could be marked
- as unavailable by 'DROP TABLE' in the mean time, since
- we don't hold lock for statistics updater */
- return(false);
- }
+ if (!!page_is_comp(block->page.frame) != index()->table->not_redundant() ||
+ btr_page_get_index_id(block->page.frame) != index()->id ||
+ fil_page_get_type(block->page.frame) == FIL_PAGE_RTREE ||
+ !fil_page_index_page_check(block->page.frame))
+ goto corrupted;
- const rw_lock_type_t root_leaf_rw_latch = btr_cur_latch_for_root_leaf(
- latch_mode);
+ if (height != btr_page_get_level(block->page.frame))
+ goto corrupted;
- page_cursor = btr_cur_get_page_cur(cursor);
- cursor->index = index;
+ if (page_has_prev(block->page.frame) &&
+ !btr_block_get(*index(), btr_page_get_prev(block->page.frame),
+ RW_X_LATCH, false, mtr, &err))
+ goto func_exit;
+ mtr->upgrade_buffer_fix(block_savepoint, RW_X_LATCH);
+#ifdef UNIV_ZIP_DEBUG
+ const page_zip_des_t *page_zip= buf_block_get_page_zip(block);
+ ut_a(!page_zip || page_zip_validate(page_zip, block->page.frame, index()));
+#endif /* UNIV_ZIP_DEBUG */
+ if (page_has_next(block->page.frame) &&
+ !btr_block_get(*index(), btr_page_get_next(block->page.frame),
+ RW_X_LATCH, false, mtr, &err))
+ goto func_exit;
+ goto search_loop;
+}
- page_id_t page_id(index->table->space_id, index->page);
- const ulint zip_size = index->table->space->zip_size();
- dberr_t err = DB_SUCCESS;
+/********************************************************************//**
+Searches an index tree and positions a tree cursor on a given non-leaf level.
+NOTE: n_fields_cmp in tuple must be set so that it cannot be compared
+to node pointer page number fields on the upper levels of the tree!
+cursor->up_match and cursor->low_match both will have sensible values.
+Cursor is left at the place where an insert of the
+search tuple should be performed in the B-tree. InnoDB does an insert
+immediately after the cursor. Thus, the cursor may end up on a user record,
+or on a page infimum record.
+@param level the tree level of search
+@param tuple data tuple; NOTE: n_fields_cmp in tuple must be set so that
+ it cannot get compared to the node ptr page number field!
+@param latch RW_S_LATCH or RW_X_LATCH
+@param cursor tree cursor; the cursor page is s- or x-latched, but see also
+ above!
+@param mtr mini-transaction
+@return DB_SUCCESS on success or error code otherwise */
+TRANSACTIONAL_TARGET
+dberr_t btr_cur_search_to_nth_level(ulint level,
+ const dtuple_t *tuple,
+ rw_lock_type_t rw_latch,
+ btr_cur_t *cursor, mtr_t *mtr)
+{
+ dict_index_t *const index= cursor->index();
+
+ ut_ad(index->is_btree() || index->is_ibuf());
+ mem_heap_t *heap= nullptr;
+ rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
+ rec_offs *offsets= offsets_;
+ rec_offs_init(offsets_);
+ ut_ad(level);
+ ut_ad(dict_index_check_search_tuple(index, tuple));
+ ut_ad(index->is_ibuf() ? ibuf_inside(mtr) : index->is_btree());
+ ut_ad(dtuple_check_typed(tuple));
+ ut_ad(index->page != FIL_NULL);
+
+ MEM_UNDEFINED(&cursor->up_bytes, sizeof cursor->up_bytes);
+ MEM_UNDEFINED(&cursor->low_bytes, sizeof cursor->low_bytes);
+ cursor->up_match= 0;
+ cursor->low_match= 0;
+ cursor->flag= BTR_CUR_BINARY;
- if (root_leaf_rw_latch == RW_X_LATCH) {
- node_ptr_max_size = btr_node_ptr_max_size(index);
- }
+#ifndef BTR_CUR_ADAPT
+ buf_block_t *block= nullptr;
+#else
+ btr_search_t *info= btr_search_get_info(index);
+ buf_block_t *block= info->root_guess;
+#endif /* BTR_CUR_ADAPT */
- height = ULINT_UNDEFINED;
+ ut_ad(mtr->memo_contains_flagged(&index->lock,
+ MTR_MEMO_X_LOCK | MTR_MEMO_SX_LOCK));
- for (;;) {
- page_t* page;
+ const ulint zip_size= index->table->space->zip_size();
- ut_ad(n_blocks < BTR_MAX_LEVELS);
- tree_savepoints[n_blocks] = mtr_set_savepoint(mtr);
+ /* Start with the root page. */
+ page_id_t page_id(index->table->space_id, index->page);
+ ulint height= ULINT_UNDEFINED;
- const rw_lock_type_t rw_latch = height
- && latch_mode != BTR_MODIFY_TREE
- ? upper_rw_latch : RW_NO_LATCH;
- buf_block_t* block = buf_page_get_gen(page_id, zip_size,
- rw_latch, NULL, BUF_GET,
- file, line, mtr, &err,
- height == 0
- && !index->is_clust());
- tree_blocks[n_blocks] = block;
+search_loop:
+ dberr_t err= DB_SUCCESS;
+ if (buf_block_t *b=
+ mtr->get_already_latched(page_id, mtr_memo_type_t(rw_latch)))
+ block= b;
+ else if (!(block= buf_page_get_gen(page_id, zip_size, rw_latch,
+ block, BUF_GET, mtr, &err)))
+ {
+ if (err == DB_DECRYPTION_FAILED)
+ btr_decryption_failed(*index);
+ goto func_exit;
+ }
- ut_ad((block != NULL) == (err == DB_SUCCESS));
+#ifdef UNIV_ZIP_DEBUG
+ if (const page_zip_des_t *page_zip= buf_block_get_page_zip(block))
+ ut_a(page_zip_validate(page_zip, block->page.frame, index));
+#endif /* UNIV_ZIP_DEBUG */
- if (err != DB_SUCCESS) {
- if (err == DB_DECRYPTION_FAILED) {
- ib_push_warning((void *)NULL,
- DB_DECRYPTION_FAILED,
- "Table %s is encrypted but encryption service or"
- " used key_id is not available. "
- " Can't continue reading table.",
- index->table->name.m_name);
- index->table->file_unreadable = true;
- }
+ if (!!page_is_comp(block->page.frame) != index->table->not_redundant() ||
+ btr_page_get_index_id(block->page.frame) != index->id ||
+ fil_page_get_type(block->page.frame) == FIL_PAGE_RTREE ||
+ !fil_page_index_page_check(block->page.frame))
+ {
+ corrupted:
+ err= DB_CORRUPTION;
+ func_exit:
+ if (UNIV_LIKELY_NULL(heap))
+ mem_heap_free(heap);
+ return err;
+ }
- break;
- }
+ const uint32_t page_level= btr_page_get_level(block->page.frame);
- page = buf_block_get_frame(block);
+ if (height == ULINT_UNDEFINED)
+ {
+ /* We are in the root node */
+ height= page_level;
+ if (!height)
+ goto corrupted;
+ cursor->tree_height= height + 1;
+ }
+ else if (height != ulint{page_level})
+ goto corrupted;
+
+ cursor->page_cur.block= block;
+
+ /* Search for complete index fields. */
+ if (page_cur_search_with_match(tuple, PAGE_CUR_LE, &cursor->up_match,
+ &cursor->low_match, &cursor->page_cur,
+ nullptr))
+ goto corrupted;
+
+ /* If this is the desired level, leave the loop */
+ if (level == height)
+ goto func_exit;
+
+ ut_ad(height > level);
+ height--;
+
+ offsets = rec_get_offsets(cursor->page_cur.rec, index, offsets, 0,
+ ULINT_UNDEFINED, &heap);
+ /* Go to the child node */
+ page_id.set_page_no(btr_node_ptr_get_child_page_no(cursor->page_cur.rec,
+ offsets));
+ block= nullptr;
+ goto search_loop;
+}
- if (height == ULINT_UNDEFINED
- && page_is_leaf(page)
- && rw_latch != RW_NO_LATCH
- && rw_latch != root_leaf_rw_latch) {
- /* We should retry to get the page, because the root page
- is latched with different level as a leaf page. */
- ut_ad(root_leaf_rw_latch != RW_NO_LATCH);
- ut_ad(rw_latch == RW_S_LATCH);
-
- ut_ad(n_blocks == 0);
- mtr_release_block_at_savepoint(
- mtr, tree_savepoints[n_blocks],
- tree_blocks[n_blocks]);
-
- upper_rw_latch = root_leaf_rw_latch;
- continue;
- }
+dberr_t btr_cur_t::open_leaf(bool first, dict_index_t *index,
+ btr_latch_mode latch_mode, mtr_t *mtr)
+{
+ ulint n_blocks= 0;
+ mem_heap_t *heap= nullptr;
+ rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
+ rec_offs *offsets= offsets_;
+ dberr_t err;
- ut_ad(fil_page_index_page_check(page));
- ut_ad(index->id == btr_page_get_index_id(page));
+ rec_offs_init(offsets_);
- if (height == ULINT_UNDEFINED) {
- /* We are in the root node */
+ const bool latch_by_caller= latch_mode & BTR_ALREADY_S_LATCHED;
+ latch_mode= btr_latch_mode(latch_mode & ~BTR_ALREADY_S_LATCHED);
- height = btr_page_get_level(page);
- }
+ btr_intention_t lock_intention= btr_cur_get_and_clear_intention(&latch_mode);
- if (height == 0) {
- if (rw_latch == RW_NO_LATCH
- || srv_read_only_mode) {
- btr_cur_latch_leaves(block, latch_mode, cursor,
- mtr);
- }
+ /* Store the position of the tree latch we push to mtr so that we
+ know how to release it when we have latched the leaf node */
- /* btr_cur_open_at_index_side_func() and
- btr_cur_search_to_nth_level() release
- tree s-latch here.*/
- switch (latch_mode) {
- case BTR_MODIFY_TREE:
- case BTR_CONT_MODIFY_TREE:
- case BTR_CONT_SEARCH_TREE:
- break;
- default:
- /* Release the tree s-latch */
- if (!srv_read_only_mode) {
- mtr_release_s_latch_at_savepoint(
- mtr, savepoint,
- dict_index_get_lock(index));
- }
+ auto savepoint= mtr->get_savepoint();
- /* release upper blocks */
- for (; n_releases < n_blocks; n_releases++) {
- mtr_release_block_at_savepoint(
- mtr,
- tree_savepoints[n_releases],
- tree_blocks[n_releases]);
- }
- }
- }
+ rw_lock_type_t upper_rw_latch= RW_X_LATCH;
+ ulint node_ptr_max_size= 0, compress_limit= 0;
- page_cur_open_on_rnd_user_rec(block, page_cursor);
+ if (latch_mode == BTR_MODIFY_TREE)
+ {
+ node_ptr_max_size= btr_node_ptr_max_size(index);
+ /* Most of delete-intended operations are purging. Free blocks
+ and read IO bandwidth should be prioritized for them, when the
+ history list is growing huge. */
+ savepoint++;
+ if (lock_intention == BTR_INTENTION_DELETE)
+ {
+ compress_limit= BTR_CUR_PAGE_COMPRESS_LIMIT(index);
+
+ if (os_aio_pending_reads_approx() &&
+ trx_sys.history_size_approx() > BTR_CUR_FINE_HISTORY_LENGTH)
+ {
+ mtr_x_lock_index(index, mtr);
+ goto index_locked;
+ }
+ }
+ mtr_sx_lock_index(index, mtr);
+ }
+ else
+ {
+ static_assert(int{BTR_CONT_MODIFY_TREE} == (12 | BTR_MODIFY_LEAF), "");
+ ut_ad(!(latch_mode & 8));
+ /* This function doesn't need to lock the left page of the leaf page */
+ static_assert(int{BTR_SEARCH_PREV} == (4 | BTR_SEARCH_LEAF), "");
+ static_assert(int{BTR_MODIFY_PREV} == (4 | BTR_MODIFY_LEAF), "");
+ latch_mode= btr_latch_mode(latch_mode & ~4);
+ ut_ad(!latch_by_caller ||
+ mtr->memo_contains_flagged(&index->lock,
+ MTR_MEMO_SX_LOCK | MTR_MEMO_S_LOCK));
+ upper_rw_latch= RW_S_LATCH;
+ if (!latch_by_caller)
+ {
+ savepoint++;
+ mtr_s_lock_index(index, mtr);
+ }
+ }
- if (height == 0) {
+index_locked:
+ ut_ad(savepoint == mtr->get_savepoint());
- break;
- }
+ const rw_lock_type_t root_leaf_rw_latch=
+ rw_lock_type_t(latch_mode & (RW_S_LATCH | RW_X_LATCH));
- ut_ad(height > 0);
+ page_cur.index = index;
- height--;
+ uint32_t page= index->page;
+ const auto zip_size= index->table->space->zip_size();
- node_ptr = page_cur_get_rec(page_cursor);
- offsets = rec_get_offsets(node_ptr, cursor->index, offsets,
- 0, ULINT_UNDEFINED, &heap);
+ for (ulint height= ULINT_UNDEFINED;;)
+ {
+ ut_ad(n_blocks < BTR_MAX_LEVELS);
+ ut_ad(savepoint + n_blocks == mtr->get_savepoint());
- /* If the rec is the first or last in the page for
- pessimistic delete intention, it might cause node_ptr insert
- for the upper level. We should change the intention and retry.
- */
- if (latch_mode == BTR_MODIFY_TREE
- && btr_cur_need_opposite_intention(
- page, lock_intention, node_ptr)) {
+ const rw_lock_type_t rw_latch= height && latch_mode != BTR_MODIFY_TREE
+ ? upper_rw_latch
+ : RW_NO_LATCH;
+ buf_block_t* block=
+ btr_block_get(*index, page, rw_latch, !height && !index->is_clust(), mtr,
+ &err);
- ut_ad(upper_rw_latch == RW_X_LATCH);
- /* release all blocks */
- for (; n_releases <= n_blocks; n_releases++) {
- mtr_release_block_at_savepoint(
- mtr, tree_savepoints[n_releases],
- tree_blocks[n_releases]);
- }
+ ut_ad(!block == (err != DB_SUCCESS));
- lock_intention = BTR_INTENTION_BOTH;
+ if (!block)
+ {
+ if (err == DB_DECRYPTION_FAILED)
+ btr_decryption_failed(*index);
+ break;
+ }
- page_id.set_page_no(dict_index_get_page(index));
+ if (first)
+ page_cur_set_before_first(block, &page_cur);
+ else
+ page_cur_set_after_last(block, &page_cur);
- height = ULINT_UNDEFINED;
+ const uint32_t l= btr_page_get_level(block->page.frame);
- n_blocks = 0;
- n_releases = 0;
+ if (height == ULINT_UNDEFINED)
+ {
+ /* We are in the root node */
+ height= l;
+ if (height);
+ else if (upper_rw_latch != root_leaf_rw_latch)
+ {
+ /* We should retry to get the page, because the root page
+ is latched with a different level than a leaf page. */
+ ut_ad(n_blocks == 0);
+ ut_ad(root_leaf_rw_latch != RW_NO_LATCH);
+ upper_rw_latch= root_leaf_rw_latch;
+ mtr->rollback_to_savepoint(savepoint);
+ height= ULINT_UNDEFINED;
+ continue;
+ }
+ else
+ {
+ reached_leaf:
+ const auto leaf_savepoint= mtr->get_savepoint();
+ ut_ad(leaf_savepoint);
+ ut_ad(block == mtr->at_savepoint(leaf_savepoint - 1));
+
+ if (latch_mode == BTR_MODIFY_TREE)
+ {
+ ut_ad(rw_latch == RW_NO_LATCH);
+ /* x-latch also siblings from left to right */
+ if (page_has_prev(block->page.frame) &&
+ !btr_block_get(*index, btr_page_get_prev(block->page.frame),
+ RW_X_LATCH, false, mtr, &err))
+ break;
+ mtr->upgrade_buffer_fix(leaf_savepoint - 1, RW_X_LATCH);
+ if (page_has_next(block->page.frame) &&
+ !btr_block_get(*index, btr_page_get_next(block->page.frame),
+ RW_X_LATCH, false, mtr, &err))
+ break;
+
+ if (!index->lock.have_x() &&
+ btr_cur_need_opposite_intention(block->page.frame,
+ lock_intention,
+ node_ptr_max_size,
+ compress_limit, page_cur.rec))
+ goto need_opposite_intention;
+ }
+ else
+ {
+ if (rw_latch == RW_NO_LATCH)
+ mtr->upgrade_buffer_fix(leaf_savepoint - 1,
+ rw_lock_type_t(latch_mode &
+ (RW_X_LATCH | RW_S_LATCH)));
+ if (latch_mode != BTR_CONT_MODIFY_TREE)
+ {
+ ut_ad(latch_mode == BTR_MODIFY_LEAF ||
+ latch_mode == BTR_SEARCH_LEAF);
+ /* Release index->lock if needed, and the non-leaf pages. */
+ mtr->rollback_to_savepoint(savepoint - !latch_by_caller,
+ leaf_savepoint - 1);
+ }
+ }
+ break;
+ }
+ }
+ else if (UNIV_UNLIKELY(height != l))
+ {
+ corrupted:
+ err= DB_CORRUPTION;
+ break;
+ }
- continue;
- }
+ if (!height)
+ goto reached_leaf;
- if (latch_mode == BTR_MODIFY_TREE
- && !btr_cur_will_modify_tree(
- cursor->index, page, lock_intention, node_ptr,
- node_ptr_max_size, zip_size, mtr)) {
- ut_ad(upper_rw_latch == RW_X_LATCH);
- ut_ad(n_releases <= n_blocks);
-
- /* we can release upper blocks */
- for (; n_releases < n_blocks; n_releases++) {
- if (n_releases == 0) {
- /* we should not release root page
- to pin to same block. */
- continue;
- }
+ height--;
- /* release unused blocks to unpin */
- mtr_release_block_at_savepoint(
- mtr, tree_savepoints[n_releases],
- tree_blocks[n_releases]);
- }
- }
+ if (first
+ ? !page_cur_move_to_next(&page_cur)
+ : !page_cur_move_to_prev(&page_cur))
+ goto corrupted;
- if (height == 0
- && latch_mode == BTR_MODIFY_TREE) {
- ut_ad(upper_rw_latch == RW_X_LATCH);
- /* we should sx-latch root page, if released already.
- It contains seg_header. */
- if (n_releases > 0) {
- mtr_block_sx_latch_at_savepoint(
- mtr, tree_savepoints[0],
- tree_blocks[0]);
- }
+ offsets= rec_get_offsets(page_cur.rec, index, offsets, 0, ULINT_UNDEFINED,
+ &heap);
- /* x-latch the branch blocks not released yet. */
- for (ulint i = n_releases; i <= n_blocks; i++) {
- mtr_block_x_latch_at_savepoint(
- mtr, tree_savepoints[i],
- tree_blocks[i]);
- }
- }
+ ut_ad(latch_mode != BTR_MODIFY_TREE || upper_rw_latch == RW_X_LATCH);
- /* Go to the child node */
- page_id.set_page_no(
- btr_node_ptr_get_child_page_no(node_ptr, offsets));
+ if (latch_mode != BTR_MODIFY_TREE);
+ else if (btr_cur_need_opposite_intention(block->page.frame, lock_intention,
+ node_ptr_max_size, compress_limit,
+ page_cur.rec))
+ {
+ need_opposite_intention:
+ /* If the rec is the first or last in the page for pessimistic
+ delete intention, it might cause node_ptr insert for the upper
+ level. We should change the intention and retry. */
+
+ mtr->rollback_to_savepoint(savepoint);
+ mtr->index_lock_upgrade();
+ /* X-latch all pages from now on */
+ latch_mode= BTR_CONT_MODIFY_TREE;
+ page= index->page;
+ height= ULINT_UNDEFINED;
+ n_blocks= 0;
+ continue;
+ }
+ else
+ {
+ if (!btr_cur_will_modify_tree(index, block->page.frame,
+ lock_intention, page_cur.rec,
+ node_ptr_max_size, zip_size, mtr))
+ {
+ ut_ad(n_blocks);
+ /* release buffer-fixes on pages that will not be modified
+ (except the root) */
+ if (n_blocks > 1)
+ {
+ mtr->rollback_to_savepoint(savepoint + 1, savepoint + n_blocks - 1);
+ n_blocks= 1;
+ }
+ }
+
+ if (!height)
+ {
+ if (page == index->page)
+ mtr->upgrade_buffer_fix(savepoint, RW_X_LATCH);
+ else
+ {
+ /* The U-latch protects BTR_SEG_HEAP, BTR_SEG_TOP. */
+ mtr->upgrade_buffer_fix(savepoint, RW_SX_LATCH);
+
+ /* Upgrade buffer-fix to exclusive latches on all remaining pages. */
+ for (ulint i= 1; i <= n_blocks; i++)
+ mtr->upgrade_buffer_fix(savepoint + i, RW_X_LATCH);
+ }
+ }
+ }
- n_blocks++;
- }
+ /* Go to the child node */
+ page= btr_node_ptr_get_child_page_no(page_cur.rec, offsets);
+ n_blocks++;
+ }
- if (UNIV_LIKELY_NULL(heap)) {
- mem_heap_free(heap);
- }
+ if (UNIV_LIKELY_NULL(heap))
+ mem_heap_free(heap);
- return err == DB_SUCCESS;
+ return err;
}
/*==================== B-TREE INSERT =========================*/
@@ -3182,26 +2093,25 @@ btr_cur_insert_if_possible(
page_cursor = btr_cur_get_page_cur(cursor);
/* Now, try the insert */
- rec = page_cur_tuple_insert(page_cursor, tuple, cursor->index,
- offsets, heap, n_ext, mtr);
+ rec = page_cur_tuple_insert(page_cursor, tuple, offsets, heap, n_ext,
+ mtr);
/* If the record did not fit, reorganize.
For compressed pages, page_cur_tuple_insert()
attempted this already. */
if (!rec && !page_cur_get_page_zip(page_cursor)
- && btr_page_reorganize(page_cursor, cursor->index, mtr)) {
- rec = page_cur_tuple_insert(
- page_cursor, tuple, cursor->index,
- offsets, heap, n_ext, mtr);
+ && btr_page_reorganize(page_cursor, mtr) == DB_SUCCESS) {
+ rec = page_cur_tuple_insert(page_cursor, tuple, offsets, heap,
+ n_ext, mtr);
}
- ut_ad(!rec || rec_offs_validate(rec, cursor->index, *offsets));
+ ut_ad(!rec || rec_offs_validate(rec, page_cursor->index, *offsets));
return(rec);
}
/*************************************************************//**
For an insert, checks the locks and does the undo logging if desired.
-@return DB_SUCCESS, DB_WAIT_LOCK, DB_FAIL, or error number */
+@return DB_SUCCESS, DB_LOCK_WAIT, DB_FAIL, or error number */
UNIV_INLINE MY_ATTRIBUTE((warn_unused_result, nonnull(2,3,5,6)))
dberr_t
btr_cur_ins_lock_and_undo(
@@ -3217,20 +2127,22 @@ btr_cur_ins_lock_and_undo(
should inherit LOCK_GAP type locks from the
successor record */
{
- dict_index_t* index;
- dberr_t err = DB_SUCCESS;
- rec_t* rec;
- roll_ptr_t roll_ptr;
+ if (!(~flags | (BTR_NO_UNDO_LOG_FLAG | BTR_KEEP_SYS_FLAG))) {
+ return DB_SUCCESS;
+ }
/* Check if we have to wait for a lock: enqueue an explicit lock
request if yes */
- rec = btr_cur_get_rec(cursor);
- index = cursor->index;
+ rec_t* rec = btr_cur_get_rec(cursor);
+ dict_index_t* index = cursor->index();
ut_ad(!dict_index_is_online_ddl(index)
|| dict_index_is_clust(index)
|| (flags & BTR_CREATE_FLAG));
+ ut_ad((flags & BTR_NO_UNDO_LOG_FLAG)
+ || !index->table->skip_alter_undo);
+
ut_ad(mtr->is_named_space(index->table->space));
/* Check if there is predicate or GAP lock preventing the insertion */
@@ -3245,14 +2157,18 @@ btr_cur_ins_lock_and_undo(
/* Use on stack MBR variable to test if a lock is
needed. If so, the predicate (MBR) will be allocated
from lock heap in lock_prdt_insert_check_and_lock() */
- lock_init_prdt_from_mbr(
- &prdt, &mbr, 0, NULL);
+ lock_init_prdt_from_mbr(&prdt, &mbr, 0, nullptr);
- err = lock_prdt_insert_check_and_lock(
- flags, rec, btr_cur_get_block(cursor),
- index, thr, mtr, &prdt);
+ if (dberr_t err = lock_prdt_insert_check_and_lock(
+ rec, btr_cur_get_block(cursor),
+ index, thr, mtr, &prdt)) {
+ return err;
+ }
*inherit = false;
} else {
+ ut_ad(!dict_index_is_online_ddl(index)
+ || index->is_primary()
+ || (flags & BTR_CREATE_FLAG));
#ifdef WITH_WSREP
trx_t* trx= thr_get_trx(thr);
/* If transaction scanning an unique secondary
@@ -3268,45 +2184,48 @@ btr_cur_ins_lock_and_undo(
if ((type & (DICT_CLUSTERED | DICT_UNIQUE)) == DICT_UNIQUE
&& trx->is_wsrep()
&& wsrep_thd_is_BF(trx->mysql_thd, false)) {
- trx->wsrep_UK_scan= true;
+ trx->wsrep = 3;
}
#endif /* WITH_WSREP */
- err = lock_rec_insert_check_and_lock(
- flags, rec, btr_cur_get_block(cursor),
- index, thr, mtr, inherit);
-#ifdef WITH_WSREP
- trx->wsrep_UK_scan= false;
-#endif /* WITH_WSREP */
+ if (dberr_t err = lock_rec_insert_check_and_lock(
+ rec, btr_cur_get_block(cursor),
+ index, thr, mtr, inherit)) {
+ return err;
+ }
}
}
- if (err != DB_SUCCESS
- || !(~flags | (BTR_NO_UNDO_LOG_FLAG | BTR_KEEP_SYS_FLAG))
- || !dict_index_is_clust(index) || dict_index_is_ibuf(index)) {
-
- return(err);
+ if (!index->is_primary() || !page_is_leaf(page_align(rec))) {
+ return DB_SUCCESS;
}
- if (flags & BTR_NO_UNDO_LOG_FLAG) {
- roll_ptr = roll_ptr_t(1) << ROLL_PTR_INSERT_FLAG_POS;
- if (!(flags & BTR_KEEP_SYS_FLAG)) {
-upd_sys:
- dfield_t* r = dtuple_get_nth_field(
- entry, index->db_roll_ptr());
- ut_ad(r->len == DATA_ROLL_PTR_LEN);
- trx_write_roll_ptr(static_cast<byte*>(r->data),
- roll_ptr);
+ constexpr roll_ptr_t dummy_roll_ptr = roll_ptr_t{1}
+ << ROLL_PTR_INSERT_FLAG_POS;
+ roll_ptr_t roll_ptr = dummy_roll_ptr;
+
+ if (!(flags & BTR_NO_UNDO_LOG_FLAG)) {
+ if (dberr_t err = trx_undo_report_row_operation(
+ thr, index, entry, NULL, 0, NULL, NULL,
+ &roll_ptr)) {
+ return err;
}
- } else {
- err = trx_undo_report_row_operation(thr, index, entry,
- NULL, 0, NULL, NULL,
- &roll_ptr);
- if (err == DB_SUCCESS) {
- goto upd_sys;
+
+ if (roll_ptr != dummy_roll_ptr) {
+ dfield_t* r = dtuple_get_nth_field(entry,
+ index->db_trx_id());
+ trx_write_trx_id(static_cast<byte*>(r->data),
+ thr_get_trx(thr)->id);
}
}
- return(err);
+ if (!(flags & BTR_KEEP_SYS_FLAG)) {
+ dfield_t* r = dtuple_get_nth_field(
+ entry, index->db_roll_ptr());
+ ut_ad(r->len == DATA_ROLL_PTR_LEN);
+ trx_write_roll_ptr(static_cast<byte*>(r->data), roll_ptr);
+ }
+
+ return DB_SUCCESS;
}
/**
@@ -3316,12 +2235,12 @@ Prefetch siblings of the leaf for the pessimistic operation.
static void btr_cur_prefetch_siblings(const buf_block_t *block,
const dict_index_t *index)
{
- ut_ad(page_is_leaf(block->frame));
+ ut_ad(page_is_leaf(block->page.frame));
if (index->is_ibuf())
return;
- const page_t *page= block->frame;
+ const page_t *page= block->page.frame;
uint32_t prev= mach_read_from_4(my_assume_aligned<4>(page + FIL_PAGE_PREV));
uint32_t next= mach_read_from_4(my_assume_aligned<4>(page + FIL_PAGE_NEXT));
@@ -3343,7 +2262,7 @@ It is assumed that mtr holds an x-latch on the page. The operation does
not succeed if there is too little space on the page. If there is just
one record on the page, the insert will always succeed; this is to
prevent trying to split a page with just one record.
-@return DB_SUCCESS, DB_WAIT_LOCK, DB_FAIL, or error number */
+@return DB_SUCCESS, DB_LOCK_WAIT, DB_FAIL, or error number */
dberr_t
btr_cur_optimistic_insert(
/*======================*/
@@ -3388,7 +2307,7 @@ btr_cur_optimistic_insert(
block = btr_cur_get_block(cursor);
page = buf_block_get_frame(block);
- index = cursor->index;
+ index = cursor->index();
ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX));
ut_ad(!dict_index_is_online_ddl(index)
@@ -3437,8 +2356,7 @@ convert_big_rec:
return(DB_TOO_BIG_RECORD);
}
- LIMIT_OPTIMISTIC_INSERT_DEBUG(page_get_n_recs(page),
- goto fail);
+ LIMIT_OPTIMISTIC_INSERT_DEBUG(page_get_n_recs(page), goto fail);
if (block->page.zip.data && leaf
&& (page_get_data_size(page) + rec_size
@@ -3452,7 +2370,7 @@ fail:
/* prefetch siblings of the leaf for the pessimistic
operation, if the page is leaf. */
- if (page_is_leaf(page)) {
+ if (leaf) {
btr_cur_prefetch_siblings(block, index);
}
fail_err:
@@ -3504,8 +2422,8 @@ fail_err:
<< ib::hex(thr ? thr->graph->trx->id : 0)
<< ' ' << rec_printer(entry).str());
DBUG_EXECUTE_IF("do_page_reorganize",
- if (n_recs)
- btr_page_reorganize(page_cursor, index, mtr););
+ ut_a(!n_recs || btr_page_reorganize(page_cursor, mtr)
+ == DB_SUCCESS););
/* Now, try the insert */
{
@@ -3521,7 +2439,7 @@ fail_err:
#ifdef UNIV_DEBUG
if (!(flags & BTR_CREATE_FLAG)
- && index->is_primary() && page_is_leaf(page)) {
+ && leaf && index->is_primary()) {
const dfield_t* trx_id = dtuple_get_nth_field(
entry, dict_col_get_clust_pos(
dict_table_get_sys_col(index->table,
@@ -3537,7 +2455,8 @@ fail_err:
DATA_TRX_ID_LEN));
} else {
ut_ad(thr->graph->trx->id);
- ut_ad(thr->graph->trx->id
+ ut_ad(thr->graph->trx->bulk_insert
+ || thr->graph->trx->id
== trx_read_trx_id(
static_cast<const byte*>(
trx_id->data))
@@ -3546,9 +2465,8 @@ fail_err:
}
#endif
- *rec = page_cur_tuple_insert(
- page_cursor, entry, index, offsets, heap,
- n_ext, mtr);
+ *rec = page_cur_tuple_insert(page_cursor, entry, offsets, heap,
+ n_ext, mtr);
reorg = page_cursor_rec != page_cur_get_rec(page_cursor);
}
@@ -3567,39 +2485,29 @@ fail_err:
goto fail;
} else {
ut_ad(!reorg);
+ reorg = true;
/* If the record did not fit, reorganize */
- if (!btr_page_reorganize(page_cursor, index, mtr)) {
- ut_ad(0);
- goto fail;
- }
-
- ut_ad(page_get_max_insert_size(page, 1) == max_size);
-
- reorg = TRUE;
-
- *rec = page_cur_tuple_insert(page_cursor, entry, index,
- offsets, heap, n_ext, mtr);
-
- if (UNIV_UNLIKELY(!*rec)) {
- ib::fatal() << "Cannot insert tuple " << *entry
- << "into index " << index->name
- << " of table " << index->table->name
- << ". Max size: " << max_size;
+ err = btr_page_reorganize(page_cursor, mtr);
+ if (err != DB_SUCCESS
+ || page_get_max_insert_size(page, 1) != max_size
+ || !(*rec = page_cur_tuple_insert(page_cursor, entry,
+ offsets, heap, n_ext,
+ mtr))) {
+ err = DB_CORRUPTION;
+ goto fail_err;
}
}
#ifdef BTR_CUR_HASH_ADAPT
if (!leaf) {
-# ifdef MYSQL_INDEX_DISABLE_AHI
- } else if (index->disable_ahi) {
-# endif
} else if (entry->info_bits & REC_INFO_MIN_REC_FLAG) {
ut_ad(entry->is_metadata());
ut_ad(index->is_instant());
ut_ad(flags == BTR_NO_LOCKING_FLAG);
+ } else if (index->table->is_temporary()) {
} else {
- rw_lock_t* ahi_latch = btr_search_sys.get_latch(*index);
+ srw_spin_lock* ahi_latch = btr_search_sys.get_latch(*index);
if (!reorg && cursor->flag == BTR_CUR_HASH) {
btr_search_update_hash_node_on_insert(
cursor, ahi_latch);
@@ -3679,11 +2587,9 @@ btr_cur_pessimistic_insert(
| BTR_NO_UNDO_LOG_FLAG)) */
mtr_t* mtr) /*!< in/out: mini-transaction */
{
- dict_index_t* index = cursor->index;
+ dict_index_t* index = cursor->index();
big_rec_t* big_rec_vec = NULL;
- dberr_t err;
bool inherit = false;
- bool success;
uint32_t n_reserved = 0;
ut_ad(dtuple_check_typed(entry));
@@ -3703,27 +2609,24 @@ btr_cur_pessimistic_insert(
/* Check locks and write to undo log, if specified */
- err = btr_cur_ins_lock_and_undo(flags, cursor, entry,
- thr, mtr, &inherit);
+ dberr_t err = btr_cur_ins_lock_and_undo(flags, cursor, entry,
+ thr, mtr, &inherit);
if (err != DB_SUCCESS) {
-
return(err);
}
- if (!(flags & BTR_NO_UNDO_LOG_FLAG)) {
- /* First reserve enough free space for the file segments
- of the index tree, so that the insert will not fail because
- of lack of space */
-
- uint32_t n_extents = uint32_t(cursor->tree_height / 16 + 3);
+ /* First reserve enough free space for the file segments of
+ the index tree, so that the insert will not fail because of
+ lack of space */
- success = fsp_reserve_free_extents(&n_reserved,
- index->table->space,
- n_extents, FSP_NORMAL, mtr);
- if (!success) {
- return(DB_OUT_OF_FILE_SPACE);
- }
+ if (!index->is_ibuf()
+ && (err = fsp_reserve_free_extents(&n_reserved, index->table->space,
+ uint32_t(cursor->tree_height / 16
+ + 3),
+ FSP_NORMAL, mtr))
+ != DB_SUCCESS) {
+ return err;
}
if (page_zip_rec_needs_ext(rec_get_converted_size(index, entry, n_ext),
@@ -3754,19 +2657,14 @@ btr_cur_pessimistic_insert(
}
}
- if (dict_index_get_page(index)
- == btr_cur_get_block(cursor)->page.id().page_no()) {
+ *rec = index->page == btr_cur_get_block(cursor)->page.id().page_no()
+ ? btr_root_raise_and_insert(flags, cursor, offsets, heap,
+ entry, n_ext, mtr, &err)
+ : btr_page_split_and_insert(flags, cursor, offsets, heap,
+ entry, n_ext, mtr, &err);
- /* The page is the root page */
- *rec = btr_root_raise_and_insert(
- flags, cursor, offsets, heap, entry, n_ext, mtr);
- } else {
- *rec = btr_page_split_and_insert(
- flags, cursor, offsets, heap, entry, n_ext, mtr);
- }
-
- if (*rec == NULL && os_has_said_disk_full) {
- return(DB_OUT_OF_FILE_SPACE);
+ if (!*rec) {
+ goto func_exit;
}
ut_ad(page_rec_get_next(btr_cur_get_rec(cursor)) == *rec
@@ -3800,14 +2698,12 @@ btr_cur_pessimistic_insert(
ut_ad(!big_rec_vec);
} else {
#ifdef BTR_CUR_HASH_ADAPT
-# ifdef MYSQL_INDEX_DISABLE_AHI
- if (index->disable_ahi); else
-# endif
if (entry->info_bits & REC_INFO_MIN_REC_FLAG) {
ut_ad(entry->is_metadata());
ut_ad(index->is_instant());
ut_ad(flags & BTR_NO_LOCKING_FLAG);
ut_ad(!(flags & BTR_CREATE_FLAG));
+ } else if (index->table->is_temporary()) {
} else {
btr_search_update_hash_on_insert(
cursor, btr_search_sys.get_latch(*index));
@@ -3819,17 +2715,19 @@ btr_cur_pessimistic_insert(
}
}
+ err = DB_SUCCESS;
+func_exit:
index->table->space->release_free_extents(n_reserved);
*big_rec = big_rec_vec;
- return(DB_SUCCESS);
+ return err;
}
/*==================== B-TREE UPDATE =========================*/
/*************************************************************//**
For an update, checks the locks and does the undo logging.
-@return DB_SUCCESS, DB_WAIT_LOCK, or error number */
+@return DB_SUCCESS, DB_LOCK_WAIT, or error number */
UNIV_INLINE MY_ATTRIBUTE((warn_unused_result))
dberr_t
btr_cur_upd_lock_and_undo(
@@ -3852,7 +2750,7 @@ btr_cur_upd_lock_and_undo(
ut_ad((thr != NULL) || (flags & BTR_NO_LOCKING_FLAG));
rec = btr_cur_get_rec(cursor);
- index = cursor->index;
+ index = cursor->index();
ut_ad(rec_offs_validate(rec, index, offsets));
ut_ad(mtr->is_named_space(index->table->space));
@@ -3873,7 +2771,7 @@ btr_cur_upd_lock_and_undo(
if (!(flags & BTR_NO_LOCKING_FLAG)) {
err = lock_clust_rec_modify_check_and_lock(
- flags, btr_cur_get_block(cursor), rec, index,
+ btr_cur_get_block(cursor), rec, index,
offsets, thr);
if (err != DB_SUCCESS) {
return(err);
@@ -3908,6 +2806,7 @@ static void btr_cur_write_sys(
trx_write_roll_ptr(static_cast<byte*>(r->data), roll_ptr);
}
+MY_ATTRIBUTE((warn_unused_result))
/** Update DB_TRX_ID, DB_ROLL_PTR in a clustered index record.
@param[in,out] block clustered index leaf page
@param[in,out] rec clustered index record
@@ -3915,11 +2814,12 @@ static void btr_cur_write_sys(
@param[in] offsets rec_get_offsets(rec, index)
@param[in] trx transaction
@param[in] roll_ptr DB_ROLL_PTR value
-@param[in,out] mtr mini-transaction */
-static void btr_cur_upd_rec_sys(buf_block_t *block, rec_t *rec,
- dict_index_t *index, const rec_offs *offsets,
- const trx_t *trx, roll_ptr_t roll_ptr,
- mtr_t *mtr)
+@param[in,out] mtr mini-transaction
+@return error code */
+static dberr_t btr_cur_upd_rec_sys(buf_block_t *block, rec_t *rec,
+ dict_index_t *index, const rec_offs *offsets,
+ const trx_t *trx, roll_ptr_t roll_ptr,
+ mtr_t *mtr)
{
ut_ad(index->is_primary());
ut_ad(rec_offs_validate(rec, index, offsets));
@@ -3928,7 +2828,7 @@ static void btr_cur_upd_rec_sys(buf_block_t *block, rec_t *rec,
{
page_zip_write_trx_id_and_roll_ptr(block, rec, offsets, index->db_trx_id(),
trx->id, roll_ptr, mtr);
- return;
+ return DB_SUCCESS;
}
ulint offset= index->trx_id_offset;
@@ -3958,8 +2858,8 @@ static void btr_cur_upd_rec_sys(buf_block_t *block, rec_t *rec,
if (UNIV_LIKELY(index->trx_id_offset))
{
const rec_t *prev= page_rec_get_prev_const(rec);
- if (UNIV_UNLIKELY(prev == rec))
- ut_ad(0);
+ if (UNIV_UNLIKELY(!prev || prev == rec))
+ return DB_CORRUPTION;
else if (page_rec_is_infimum(prev));
else
for (src= prev + offset; d < DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN; d++)
@@ -3997,6 +2897,8 @@ static void btr_cur_upd_rec_sys(buf_block_t *block, rec_t *rec,
if (UNIV_LIKELY(len)) /* extra safety, to avoid corrupting the log */
mtr->memcpy<mtr_t::MAYBE_NOP>(*block, dest, sys + d, len);
+
+ return DB_SUCCESS;
}
/*************************************************************//**
@@ -4016,7 +2918,6 @@ btr_cur_update_alloc_zip_func(
/*==========================*/
page_zip_des_t* page_zip,/*!< in/out: compressed page */
page_cur_t* cursor, /*!< in/out: B-tree page cursor */
- dict_index_t* index, /*!< in: the index corresponding to cursor */
#ifdef UNIV_DEBUG
rec_offs* offsets,/*!< in/out: offsets of the cursor record */
#endif /* UNIV_DEBUG */
@@ -4025,6 +2926,7 @@ btr_cur_update_alloc_zip_func(
false=update-in-place */
mtr_t* mtr) /*!< in/out: mini-transaction */
{
+ dict_index_t* index = cursor->index;
/* Have a local copy of the variables as these can change
dynamically. */
@@ -4051,32 +2953,26 @@ btr_cur_update_alloc_zip_func(
return(false);
}
- if (!btr_page_reorganize(cursor, index, mtr)) {
- goto out_of_space;
- }
+ if (btr_page_reorganize(cursor, mtr) == DB_SUCCESS) {
+ rec_offs_make_valid(page_cur_get_rec(cursor), index,
+ page_is_leaf(page), offsets);
- rec_offs_make_valid(page_cur_get_rec(cursor), index,
- page_is_leaf(page), offsets);
+ /* After recompressing a page, we must make sure that the free
+ bits in the insert buffer bitmap will not exceed the free
+ space on the page. Because this function will not attempt
+ recompression unless page_zip_available() fails above, it is
+ safe to reset the free bits if page_zip_available() fails
+ again, below. The free bits can safely be reset in a separate
+ mini-transaction. If page_zip_available() succeeds below, we
+ can be sure that the btr_page_reorganize() above did not reduce
+ the free space available on the page. */
- /* After recompressing a page, we must make sure that the free
- bits in the insert buffer bitmap will not exceed the free
- space on the page. Because this function will not attempt
- recompression unless page_zip_available() fails above, it is
- safe to reset the free bits if page_zip_available() fails
- again, below. The free bits can safely be reset in a separate
- mini-transaction. If page_zip_available() succeeds below, we
- can be sure that the btr_page_reorganize() above did not reduce
- the free space available on the page. */
-
- if (page_zip_available(page_zip, dict_index_is_clust(index),
- length, create)) {
- return(true);
+ if (page_zip_available(page_zip, dict_index_is_clust(index),
+ length, create)) {
+ return true;
+ }
}
-out_of_space:
- ut_ad(rec_offs_validate(page_cur_get_rec(cursor), index, offsets));
-
- /* Out of space: reset the free bits. */
if (!dict_index_is_clust(index)
&& !index->table->is_temporary()
&& page_is_leaf(page)) {
@@ -4264,7 +3160,7 @@ ATTRIBUTE_COLD static
rec_t *btr_cur_update_in_place_zip_check(btr_cur_t *cur, rec_offs *offsets,
const upd_t& update, mtr_t *mtr)
{
- dict_index_t *index= cur->index;
+ dict_index_t *index= cur->index();
ut_ad(!index->table->is_temporary());
switch (update.n_fields) {
@@ -4290,7 +3186,6 @@ rec_t *btr_cur_update_in_place_zip_check(btr_cur_t *cur, rec_offs *offsets,
default:
if (!btr_cur_update_alloc_zip(btr_cur_get_page_zip(cur),
btr_cur_get_page_cur(cur),
- index,
offsets, rec_offs_size(offsets),
false, mtr))
return nullptr;
@@ -4330,9 +3225,10 @@ btr_cur_update_in_place(
roll_ptr_t roll_ptr = 0;
ulint was_delete_marked;
- ut_ad(page_is_leaf(cursor->page_cur.block->frame));
+ ut_ad(page_is_leaf(cursor->page_cur.block->page.frame));
rec = btr_cur_get_rec(cursor);
- index = cursor->index;
+ index = cursor->index();
+ ut_ad(!index->is_ibuf());
ut_ad(rec_offs_validate(rec, index, offsets));
ut_ad(!!page_rec_is_comp(rec) == dict_table_is_comp(index->table));
ut_ad(trx_id > 0 || (flags & BTR_KEEP_SYS_FLAG)
@@ -4376,8 +3272,11 @@ btr_cur_update_in_place(
}
if (!(flags & BTR_KEEP_SYS_FLAG)) {
- btr_cur_upd_rec_sys(block, rec, index, offsets,
- thr_get_trx(thr), roll_ptr, mtr);
+ err = btr_cur_upd_rec_sys(block, rec, index, offsets,
+ thr_get_trx(thr), roll_ptr, mtr);
+ if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
+ goto func_exit;
+ }
}
was_delete_marked = rec_get_deleted_flag(
@@ -4390,7 +3289,7 @@ btr_cur_update_in_place(
#ifdef BTR_CUR_HASH_ADAPT
{
- rw_lock_t* ahi_latch = block->index
+ srw_spin_lock* ahi_latch = block->index
? btr_search_sys.get_latch(*index) : NULL;
if (ahi_latch) {
/* TO DO: Can we skip this if none of the fields
@@ -4410,7 +3309,7 @@ btr_cur_update_in_place(
btr_search_update_hash_on_delete(cursor);
}
- rw_lock_x_lock(ahi_latch);
+ ahi_latch->wr_lock(SRW_LOCK_CALL);
}
assert_block_ahi_valid(block);
@@ -4421,7 +3320,7 @@ btr_cur_update_in_place(
#ifdef BTR_CUR_HASH_ADAPT
if (ahi_latch) {
- rw_lock_x_unlock(ahi_latch);
+ ahi_latch->wr_unlock();
}
}
#endif /* BTR_CUR_HASH_ADAPT */
@@ -4493,16 +3392,20 @@ static void btr_cur_trim_alter_metadata(dtuple_t* entry,
page_id_t(index->table->space->id,
mach_read_from_4(ptr + BTR_EXTERN_PAGE_NO)),
0, RW_S_LATCH, &mtr);
- buf_block_dbg_add_level(block, SYNC_EXTERN_STORAGE);
- ut_ad(fil_page_get_type(block->frame) == FIL_PAGE_TYPE_BLOB);
- ut_ad(mach_read_from_4(&block->frame[FIL_PAGE_DATA
- + BTR_BLOB_HDR_NEXT_PAGE_NO])
+ if (!block) {
+ ut_ad("corruption" == 0);
+ mtr.commit();
+ return;
+ }
+ ut_ad(fil_page_get_type(block->page.frame) == FIL_PAGE_TYPE_BLOB);
+ ut_ad(mach_read_from_4(&block->page.frame
+ [FIL_PAGE_DATA + BTR_BLOB_HDR_NEXT_PAGE_NO])
== FIL_NULL);
- ut_ad(mach_read_from_4(&block->frame[FIL_PAGE_DATA
- + BTR_BLOB_HDR_PART_LEN])
+ ut_ad(mach_read_from_4(&block->page.frame
+ [FIL_PAGE_DATA + BTR_BLOB_HDR_PART_LEN])
== mach_read_from_4(ptr + BTR_EXTERN_LEN + 4));
n_fields = mach_read_from_4(
- &block->frame[FIL_PAGE_DATA + BTR_BLOB_HDR_SIZE])
+ &block->page.frame[FIL_PAGE_DATA + BTR_BLOB_HDR_SIZE])
+ index->first_user_field();
/* Rollback should not increase the number of fields. */
ut_ad(n_fields <= index->n_fields);
@@ -4626,7 +3529,8 @@ btr_cur_optimistic_update(
block = btr_cur_get_block(cursor);
page = buf_block_get_frame(block);
rec = btr_cur_get_rec(cursor);
- index = cursor->index;
+ index = cursor->index();
+ ut_ad(index->has_locking());
ut_ad(trx_id > 0 || (flags & BTR_KEEP_SYS_FLAG)
|| index->table->is_temporary());
ut_ad(!!page_rec_is_comp(rec) == dict_table_is_comp(index->table));
@@ -4729,7 +3633,7 @@ any_extern:
}
if (!btr_cur_update_alloc_zip(
- page_zip, page_cursor, index, *offsets,
+ page_zip, page_cursor, *offsets,
new_rec_size, true, mtr)) {
return(DB_ZIP_OVERFLOW);
}
@@ -4742,7 +3646,6 @@ any_extern:
(!dict_table_is_comp(index->table)
&& new_rec_size >= REDUNDANT_REC_MAX_DATA_SIZE)) {
err = DB_OVERFLOW;
-
goto func_exit;
}
@@ -4810,7 +3713,7 @@ any_extern:
/* Ok, we may do the replacement. Store on the page infimum the
explicit locks on rec, before deleting rec (see the comment in
btr_cur_pessimistic_update). */
- if (!dict_table_is_locking_disabled(index->table)) {
+ if (index->has_locking()) {
lock_rec_store_on_page_infimum(block, rec);
}
@@ -4825,32 +3728,42 @@ any_extern:
btr_search_update_hash_on_delete(cursor);
}
- page_cur_delete_rec(page_cursor, index, *offsets, mtr);
+ page_cur_delete_rec(page_cursor, *offsets, mtr);
- page_cur_move_to_prev(page_cursor);
+ if (!page_cur_move_to_prev(page_cursor)) {
+ return DB_CORRUPTION;
+ }
if (!(flags & BTR_KEEP_SYS_FLAG)) {
btr_cur_write_sys(new_entry, index, trx_id, roll_ptr);
}
- /* There are no externally stored columns in new_entry */
- rec = btr_cur_insert_if_possible(
- cursor, new_entry, offsets, heap, 0/*n_ext*/, mtr);
- ut_a(rec); /* <- We calculated above the insert would fit */
+ rec = btr_cur_insert_if_possible(cursor, new_entry, offsets, heap,
+ 0/*n_ext*/, mtr);
+ if (UNIV_UNLIKELY(!rec)) {
+ goto corrupted;
+ }
if (UNIV_UNLIKELY(update->is_metadata())) {
/* We must empty the PAGE_FREE list, because if this
was a rollback, the shortened metadata record
would have too many fields, and we would be unable to
know the size of the freed record. */
- btr_page_reorganize(page_cursor, index, mtr);
- } else if (!dict_table_is_locking_disabled(index->table)) {
+ err = btr_page_reorganize(page_cursor, mtr);
+ if (err != DB_SUCCESS) {
+ goto func_exit;
+ }
+ } else {
/* Restore the old explicit lock state on the record */
- lock_rec_restore_from_page_infimum(block, rec, block);
+ lock_rec_restore_from_page_infimum(*block, rec,
+ block->page.id());
}
- page_cur_move_to_next(page_cursor);
ut_ad(err == DB_SUCCESS);
+ if (!page_cur_move_to_next(page_cursor)) {
+corrupted:
+ err = DB_CORRUPTION;
+ }
func_exit:
if (!(flags & BTR_KEEP_IBUF_BITMAP)
@@ -4880,7 +3793,7 @@ updated record. In the split it may have inherited locks from the successor
of the updated record, which is not correct. This function restores the
right locks for the new supremum. */
static
-void
+dberr_t
btr_cur_pess_upd_restore_supremum(
/*==============================*/
buf_block_t* block, /*!< in: buffer block of rec */
@@ -4888,34 +3801,38 @@ btr_cur_pess_upd_restore_supremum(
mtr_t* mtr) /*!< in: mtr */
{
page_t* page;
- buf_block_t* prev_block;
page = buf_block_get_frame(block);
if (page_rec_get_next(page_get_infimum_rec(page)) != rec) {
/* Updated record is not the first user record on its page */
-
- return;
+ return DB_SUCCESS;
}
const uint32_t prev_page_no = btr_page_get_prev(page);
- const page_id_t page_id(block->page.id().space(), prev_page_no);
-
- ut_ad(prev_page_no != FIL_NULL);
- prev_block = buf_page_get_with_no_latch(page_id, block->zip_size(),
- mtr);
-#ifdef UNIV_BTR_DEBUG
- ut_a(btr_page_get_next(prev_block->frame)
- == block->page.id().page_no());
-#endif /* UNIV_BTR_DEBUG */
+ const page_id_t block_id{block->page.id()};
+ const page_id_t prev_id(block_id.space(), prev_page_no);
+ dberr_t err;
+ buf_block_t* prev_block
+ = buf_page_get_gen(prev_id, 0, RW_NO_LATCH, nullptr,
+ BUF_PEEK_IF_IN_POOL, mtr, &err);
+ /* Since we already held an x-latch on prev_block, it must
+ be available and not be corrupted unless the buffer pool got
+ corrupted somehow. */
+ if (UNIV_UNLIKELY(!prev_block)) {
+ return err;
+ }
+ ut_ad(!memcmp_aligned<4>(prev_block->page.frame + FIL_PAGE_NEXT,
+ block->page.frame + FIL_PAGE_OFFSET, 4));
/* We must already have an x-latch on prev_block! */
ut_ad(mtr->memo_contains_flagged(prev_block, MTR_MEMO_PAGE_X_FIX));
- lock_rec_reset_and_inherit_gap_locks(prev_block, block,
+ lock_rec_reset_and_inherit_gap_locks(*prev_block, block_id,
PAGE_HEAP_NO_SUPREMUM,
page_rec_get_heap_no(rec));
+ return DB_SUCCESS;
}
/*************************************************************//**
@@ -4971,13 +3888,15 @@ btr_cur_pessimistic_update(
block = btr_cur_get_block(cursor);
page_zip = buf_block_get_page_zip(block);
- index = cursor->index;
+ index = cursor->index();
+ ut_ad(index->has_locking());
ut_ad(mtr->memo_contains_flagged(&index->lock, MTR_MEMO_X_LOCK |
MTR_MEMO_SX_LOCK));
ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX));
#ifdef UNIV_ZIP_DEBUG
- ut_a(!page_zip || page_zip_validate(page_zip, block->frame, index));
+ ut_a(!page_zip
+ || page_zip_validate(page_zip, block->page.frame, index));
#endif /* UNIV_ZIP_DEBUG */
ut_ad(!page_zip || !index->table->is_temporary());
/* The insert buffer tree should never be updated in place. */
@@ -5010,7 +3929,7 @@ btr_cur_pessimistic_update(
if (page_zip
&& optim_err != DB_ZIP_OVERFLOW
&& !dict_index_is_clust(index)
- && page_is_leaf(block->frame)) {
+ && page_is_leaf(block->page.frame)) {
ut_ad(!index->table->is_temporary());
ibuf_update_free_bits_zip(block, mtr);
}
@@ -5057,7 +3976,7 @@ btr_cur_pessimistic_update(
/* We have to set appropriate extern storage bits in the new
record to be inserted: we have to remember which fields were such */
- ut_ad(!page_is_comp(block->frame) || !rec_get_node_ptr_flag(rec));
+ ut_ad(!page_is_comp(block->page.frame) || !rec_get_node_ptr_flag(rec));
ut_ad(rec_offs_validate(rec, index, *offsets));
if ((flags & BTR_NO_UNDO_LOG_FLAG)
@@ -5083,7 +4002,7 @@ btr_cur_pessimistic_update(
if (page_zip_rec_needs_ext(
rec_get_converted_size(index, new_entry, n_ext),
- page_is_comp(block->frame),
+ page_is_comp(block->page.frame),
dict_index_get_n_fields(index),
block->zip_size())
|| (UNIV_UNLIKELY(update->is_alter_metadata())
@@ -5099,7 +4018,7 @@ btr_cur_pessimistic_update(
BTR_KEEP_IBUF_BITMAP. */
#ifdef UNIV_ZIP_DEBUG
ut_a(!page_zip
- || page_zip_validate(page_zip, block->frame,
+ || page_zip_validate(page_zip, block->page.frame,
index));
#endif /* UNIV_ZIP_DEBUG */
index->table->space->release_free_extents(n_reserved);
@@ -5107,7 +4026,7 @@ btr_cur_pessimistic_update(
goto err_exit;
}
- ut_ad(page_is_leaf(block->frame));
+ ut_ad(page_is_leaf(block->page.frame));
ut_ad(dict_index_is_clust(index));
if (UNIV_UNLIKELY(!(flags & BTR_KEEP_POS_FLAG))) {
ut_ad(page_zip != NULL);
@@ -5127,18 +4046,17 @@ btr_cur_pessimistic_update(
}
if (optim_err == DB_OVERFLOW) {
-
/* First reserve enough free space for the file segments
of the index tree, so that the update will not fail because
of lack of space */
- uint32_t n_extents = uint32_t(cursor->tree_height / 16 + 3);
-
- if (!fsp_reserve_free_extents(
- &n_reserved, index->table->space, n_extents,
- flags & BTR_NO_UNDO_LOG_FLAG
- ? FSP_CLEANING : FSP_NORMAL,
- mtr)) {
+ err = fsp_reserve_free_extents(
+ &n_reserved, index->table->space,
+ uint32_t(cursor->tree_height / 16 + 3),
+ flags & BTR_NO_UNDO_LOG_FLAG
+ ? FSP_CLEANING : FSP_NORMAL,
+ mtr);
+ if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
err = DB_OUT_OF_FILE_SPACE;
goto err_exit;
}
@@ -5149,8 +4067,9 @@ btr_cur_pessimistic_update(
}
const ulint max_ins_size = page_zip
- ? 0 : page_get_max_insert_size_after_reorganize(block->frame,
- 1);
+ ? 0
+ : page_get_max_insert_size_after_reorganize(block->page.frame,
+ 1);
if (UNIV_UNLIKELY(is_metadata)) {
ut_ad(new_entry->is_metadata());
@@ -5172,19 +4091,21 @@ btr_cur_pessimistic_update(
in the lock system delete the lock structs set on the
root page even if the root page carries just node
pointers. */
- if (!dict_table_is_locking_disabled(index->table)) {
- lock_rec_store_on_page_infimum(block, rec);
- }
+ lock_rec_store_on_page_infimum(block, rec);
}
#ifdef UNIV_ZIP_DEBUG
- ut_a(!page_zip || page_zip_validate(page_zip, block->frame, index));
+ ut_a(!page_zip
+ || page_zip_validate(page_zip, block->page.frame, index));
#endif /* UNIV_ZIP_DEBUG */
page_cursor = btr_cur_get_page_cur(cursor);
- page_cur_delete_rec(page_cursor, index, *offsets, mtr);
+ page_cur_delete_rec(page_cursor, *offsets, mtr);
- page_cur_move_to_prev(page_cursor);
+ if (!page_cur_move_to_prev(page_cursor)) {
+ err = DB_CORRUPTION;
+ goto return_after_reservations;
+ }
rec = btr_cur_insert_if_possible(cursor, new_entry,
offsets, offsets_heap, n_ext, mtr);
@@ -5197,7 +4118,10 @@ btr_cur_pessimistic_update(
was a rollback, the shortened metadata record
would have too many fields, and we would be unable to
know the size of the freed record. */
- btr_page_reorganize(page_cursor, index, mtr);
+ err = btr_page_reorganize(page_cursor, mtr);
+ if (err != DB_SUCCESS) {
+ goto return_after_reservations;
+ }
rec = page_cursor->rec;
rec_offs_make_valid(rec, index, true, *offsets);
if (page_cursor->block->page.id().page_no()
@@ -5205,9 +4129,10 @@ btr_cur_pessimistic_update(
btr_set_instant(page_cursor->block, *index,
mtr);
}
- } else if (!dict_table_is_locking_disabled(index->table)) {
+ } else {
lock_rec_restore_from_page_infimum(
- btr_cur_get_block(cursor), rec, block);
+ *btr_cur_get_block(cursor), rec,
+ block->page.id());
}
if (!rec_get_deleted_flag(rec, rec_offs_comp(*offsets))
@@ -5223,7 +4148,7 @@ btr_cur_pessimistic_update(
}
bool adjust = big_rec_vec && (flags & BTR_KEEP_POS_FLAG);
- ut_ad(!adjust || page_is_leaf(block->frame));
+ ut_ad(!adjust || page_is_leaf(block->page.frame));
if (btr_cur_compress_if_useful(cursor, adjust, mtr)) {
if (adjust) {
@@ -5231,7 +4156,7 @@ btr_cur_pessimistic_update(
true, *offsets);
}
} else if (!dict_index_is_clust(index)
- && page_is_leaf(block->frame)) {
+ && page_is_leaf(block->page.frame)) {
/* Update the free bits in the insert buffer.
This is the same block which was skipped by
BTR_KEEP_IBUF_BITMAP. */
@@ -5244,17 +4169,15 @@ btr_cur_pessimistic_update(
}
}
- if (!srv_read_only_mode
- && !big_rec_vec
- && page_is_leaf(block->frame)
+#if 0 // FIXME: this used to be a no-op, and will cause trouble if enabled
+ if (!big_rec_vec
+ && page_is_leaf(block->page.frame)
&& !dict_index_is_online_ddl(index)) {
-
- mtr_memo_release(mtr, dict_index_get_lock(index),
- MTR_MEMO_X_LOCK | MTR_MEMO_SX_LOCK);
-
+ mtr->release(index->lock);
/* NOTE: We cannot release root block latch here, because it
has segment header and already modified in most of cases.*/
}
+#endif
err = DB_SUCCESS;
goto return_after_reservations;
@@ -5271,19 +4194,19 @@ btr_cur_pessimistic_update(
BTR_KEEP_IBUF_BITMAP. */
if (!dict_index_is_clust(index)
&& !index->table->is_temporary()
- && page_is_leaf(block->frame)) {
+ && page_is_leaf(block->page.frame)) {
ibuf_reset_free_bits(block);
}
}
if (big_rec_vec != NULL) {
- ut_ad(page_is_leaf(block->frame));
+ ut_ad(page_is_leaf(block->page.frame));
ut_ad(dict_index_is_clust(index));
ut_ad(flags & BTR_KEEP_POS_FLAG);
/* btr_page_split_and_insert() in
btr_cur_pessimistic_insert() invokes
- mtr_memo_release(mtr, index->lock, MTR_MEMO_SX_LOCK).
+ mtr->release(index->lock).
We must keep the index->lock when we created a
big_rec, so that row_upd_clust_rec() can store the
big_rec in the same mini-transaction. */
@@ -5308,10 +4231,10 @@ btr_cur_pessimistic_update(
cursor, offsets, offsets_heap,
new_entry, &rec,
&dummy_big_rec, n_ext, NULL, mtr);
- ut_a(rec);
ut_a(err == DB_SUCCESS);
+ ut_a(rec);
ut_a(dummy_big_rec == NULL);
- ut_ad(rec_offs_validate(rec, cursor->index, *offsets));
+ ut_ad(rec_offs_validate(rec, cursor->index(), *offsets));
page_cursor->rec = rec;
/* Multiple transactions cannot simultaneously operate on the
@@ -5332,8 +4255,8 @@ btr_cur_pessimistic_update(
/* The new inserted record owns its possible externally
stored fields */
#ifdef UNIV_ZIP_DEBUG
- ut_a(!page_zip || page_zip_validate(page_zip, block->frame,
- index));
+ ut_a(!page_zip
+ || page_zip_validate(page_zip, block->page.frame, index));
#endif /* UNIV_ZIP_DEBUG */
btr_cur_unmark_extern_fields(btr_cur_get_block(cursor), rec,
index, *offsets, mtr);
@@ -5348,11 +4271,14 @@ btr_cur_pessimistic_update(
was a rollback, the shortened metadata record
would have too many fields, and we would be unable to
know the size of the freed record. */
- btr_page_reorganize(page_cursor, index, mtr);
+ err = btr_page_reorganize(page_cursor, mtr);
+ if (err != DB_SUCCESS) {
+ goto return_after_reservations;
+ }
rec = page_cursor->rec;
- } else if (!dict_table_is_locking_disabled(index->table)) {
+ } else {
lock_rec_restore_from_page_infimum(
- btr_cur_get_block(cursor), rec, block);
+ *btr_cur_get_block(cursor), rec, block->page.id());
}
/* If necessary, restore also the correct lock state for a new,
@@ -5360,14 +4286,15 @@ btr_cur_pessimistic_update(
record was nonexistent, the supremum might have inherited its locks
from a wrong record. */
- if (!was_first && !dict_table_is_locking_disabled(index->table)) {
- btr_cur_pess_upd_restore_supremum(btr_cur_get_block(cursor),
- rec, mtr);
+ if (!was_first) {
+ err = btr_cur_pess_upd_restore_supremum(
+ btr_cur_get_block(cursor), rec, mtr);
}
return_after_reservations:
#ifdef UNIV_ZIP_DEBUG
- ut_a(!page_zip || page_zip_validate(btr_cur_get_page_zip(cursor),
+ ut_a(err ||
+ !page_zip || page_zip_validate(btr_cur_get_page_zip(cursor),
btr_cur_get_page(cursor), index));
#endif /* UNIV_ZIP_DEBUG */
@@ -5452,14 +4379,6 @@ btr_cur_del_mark_set_clust_rec(
return(DB_SUCCESS);
}
- err = lock_clust_rec_modify_check_and_lock(BTR_NO_LOCKING_FLAG, block,
- rec, index, offsets, thr);
-
- if (err != DB_SUCCESS) {
-
- return(err);
- }
-
err = trx_undo_report_row_operation(thr, index,
entry, NULL, 0, rec, offsets,
&roll_ptr);
@@ -5479,15 +4398,11 @@ btr_cur_del_mark_set_clust_rec(
DBUG_LOG("ib_cur",
"delete-mark clust " << index->table->name
<< " (" << index->id << ") by "
- << ib::hex(trx_get_id_for_print(trx)) << ": "
+ << ib::hex(trx->id) << ": "
<< rec_printer(rec, offsets).str());
- if (dict_index_is_online_ddl(index)) {
- row_log_table_delete(rec, index, offsets, NULL);
- }
-
- btr_cur_upd_rec_sys(block, rec, index, offsets, trx, roll_ptr, mtr);
- return(err);
+ return btr_cur_upd_rec_sys(block, rec, index, offsets, trx, roll_ptr,
+ mtr);
}
/*==================== B-TREE RECORD REMOVE =========================*/
@@ -5498,23 +4413,23 @@ that mtr holds an x-latch on the tree and on the cursor page. To avoid
deadlocks, mtr must also own x-latches to brothers of page, if those
brothers exist. NOTE: it is assumed that the caller has reserved enough
free extents so that the compression will always succeed if done!
-@return TRUE if compression occurred */
-ibool
+@return whether compression occurred */
+bool
btr_cur_compress_if_useful(
/*=======================*/
btr_cur_t* cursor, /*!< in/out: cursor on the page to compress;
cursor does not stay valid if !adjust and
compression occurs */
- ibool adjust, /*!< in: TRUE if should adjust the
- cursor position even if compression occurs */
+ bool adjust, /*!< in: whether the cursor position should be
+ adjusted even when compression occurs */
mtr_t* mtr) /*!< in/out: mini-transaction */
{
- ut_ad(mtr->memo_contains_flagged(&cursor->index->lock,
+ ut_ad(mtr->memo_contains_flagged(&cursor->index()->lock,
MTR_MEMO_X_LOCK | MTR_MEMO_SX_LOCK));
ut_ad(mtr->memo_contains_flagged(btr_cur_get_block(cursor),
MTR_MEMO_PAGE_X_FIX));
- if (cursor->index->is_spatial()) {
+ if (cursor->index()->is_spatial()) {
const trx_t* trx = cursor->rtr_info->thr
? thr_get_trx(cursor->rtr_info->thr)
: NULL;
@@ -5526,25 +4441,24 @@ btr_cur_compress_if_useful(
}
}
- return(btr_cur_compress_recommendation(cursor, mtr)
- && btr_compress(cursor, adjust, mtr));
+ return btr_cur_compress_recommendation(cursor, mtr)
+ && btr_compress(cursor, adjust, mtr) == DB_SUCCESS;
}
/*******************************************************//**
Removes the record on which the tree cursor is positioned on a leaf page.
It is assumed that the mtr has an x-latch on the page where the cursor is
positioned, but no latch on the whole tree.
-@return TRUE if success, i.e., the page did not become too empty */
-ibool
-btr_cur_optimistic_delete_func(
-/*===========================*/
+@return error code
+@retval DB_FAIL if the page would become too empty */
+dberr_t
+btr_cur_optimistic_delete(
+/*======================*/
btr_cur_t* cursor, /*!< in: cursor on leaf page, on the record to
delete; cursor stays valid: if deletion
succeeds, on function exit it points to the
successor of the deleted record */
-#ifdef UNIV_DEBUG
ulint flags, /*!< in: BTR_CREATE_FLAG or 0 */
-#endif /* UNIV_DEBUG */
mtr_t* mtr) /*!< in: mtr; if this function returns
TRUE on a leaf page of a secondary
index, the mtr must be committed
@@ -5560,50 +4474,56 @@ btr_cur_optimistic_delete_func(
ut_ad(flags == 0 || flags == BTR_CREATE_FLAG);
ut_ad(mtr->memo_contains_flagged(btr_cur_get_block(cursor),
MTR_MEMO_PAGE_X_FIX));
- ut_ad(mtr->is_named_space(cursor->index->table->space));
- ut_ad(!cursor->index->is_dummy);
+ ut_ad(mtr->is_named_space(cursor->index()->table->space));
+ ut_ad(!cursor->index()->is_dummy);
/* This is intended only for leaf page deletions */
block = btr_cur_get_block(cursor);
- ut_ad(block->page.id().space() == cursor->index->table->space->id);
+ ut_ad(block->page.id().space() == cursor->index()->table->space->id);
ut_ad(page_is_leaf(buf_block_get_frame(block)));
- ut_ad(!dict_index_is_online_ddl(cursor->index)
- || dict_index_is_clust(cursor->index)
+ ut_ad(!dict_index_is_online_ddl(cursor->index())
+ || cursor->index()->is_clust()
|| (flags & BTR_CREATE_FLAG));
rec = btr_cur_get_rec(cursor);
- offsets = rec_get_offsets(rec, cursor->index, offsets,
- cursor->index->n_core_fields,
+ offsets = rec_get_offsets(rec, cursor->index(), offsets,
+ cursor->index()->n_core_fields,
ULINT_UNDEFINED, &heap);
- const ibool no_compress_needed = !rec_offs_any_extern(offsets)
- && btr_cur_can_delete_without_compress(
- cursor, rec_offs_size(offsets), mtr);
-
- if (!no_compress_needed) {
+ dberr_t err = DB_SUCCESS;
+ if (rec_offs_any_extern(offsets)
+ || !btr_cur_can_delete_without_compress(cursor,
+ rec_offs_size(offsets),
+ mtr)) {
/* prefetch siblings of the leaf for the pessimistic
operation. */
- btr_cur_prefetch_siblings(block, cursor->index);
+ btr_cur_prefetch_siblings(block, cursor->index());
+ err = DB_FAIL;
goto func_exit;
}
- if (UNIV_UNLIKELY(block->page.id().page_no() == cursor->index->page
- && page_get_n_recs(block->frame) == 1
- + (cursor->index->is_instant()
- && !rec_is_metadata(rec, *cursor->index))
- && !cursor->index->must_avoid_clear_instant_add())) {
+ if (UNIV_UNLIKELY(block->page.id().page_no() == cursor->index()->page
+ && page_get_n_recs(block->page.frame) == 1
+ + (cursor->index()->is_instant()
+ && !rec_is_metadata(rec, *cursor->index()))
+ && !cursor->index()
+ ->must_avoid_clear_instant_add())) {
/* The whole index (and table) becomes logically empty.
Empty the whole page. That is, if we are deleting the
only user record, also delete the metadata record
if one exists for instant ADD COLUMN (not generic ALTER TABLE).
If we are deleting the metadata record and the
table becomes empty, clean up the whole page. */
- dict_index_t* index = cursor->index;
+ dict_index_t* index = cursor->index();
const rec_t* first_rec = page_rec_get_next_const(
- page_get_infimum_rec(block->frame));
+ page_get_infimum_rec(block->page.frame));
+ if (UNIV_UNLIKELY(!first_rec)) {
+ err = DB_CORRUPTION;
+ goto func_exit;
+ }
ut_ad(!index->is_instant()
|| rec_is_metadata(first_rec, *index));
const bool is_metadata = rec_is_metadata(rec, *index);
@@ -5616,7 +4536,7 @@ btr_cur_optimistic_delete_func(
|| (first_rec != rec
&& rec_is_add_metadata(first_rec, *index));
if (UNIV_LIKELY(empty_table)) {
- if (UNIV_LIKELY(!is_metadata)) {
+ if (UNIV_LIKELY(!is_metadata && !flags)) {
lock_update_delete(block, rec);
}
btr_page_empty(block, buf_block_get_page_zip(block),
@@ -5625,6 +4545,7 @@ btr_cur_optimistic_delete_func(
/* MDEV-17383: free metadata BLOBs! */
index->clear_instant_alter();
}
+
page_cur_set_after_last(block,
btr_cur_get_page_cur(cursor));
goto func_exit;
@@ -5641,32 +4562,36 @@ btr_cur_optimistic_delete_func(
If this is a recovered transaction, then
index->is_instant() will hold until the
insert into SYS_COLUMNS is rolled back. */
- ut_ad(cursor->index->table->supports_instant());
- ut_ad(cursor->index->is_primary());
+ ut_ad(cursor->index()->table->supports_instant());
+ ut_ad(cursor->index()->is_primary());
ut_ad(!page_zip);
page_cur_delete_rec(btr_cur_get_page_cur(cursor),
- cursor->index, offsets, mtr);
+ offsets, mtr);
/* We must empty the PAGE_FREE list, because
after rollback, this deleted metadata record
would have too many fields, and we would be
unable to know the size of the freed record. */
- btr_page_reorganize(btr_cur_get_page_cur(cursor),
- cursor->index, mtr);
+ err = btr_page_reorganize(btr_cur_get_page_cur(cursor),
+ mtr);
goto func_exit;
} else {
- lock_update_delete(block, rec);
+ if (!flags) {
+ lock_update_delete(block, rec);
+ }
btr_search_update_hash_on_delete(cursor);
}
if (page_zip) {
#ifdef UNIV_ZIP_DEBUG
- ut_a(page_zip_validate(page_zip, page, cursor->index));
+ ut_a(page_zip_validate(page_zip, page,
+ cursor->index()));
#endif /* UNIV_ZIP_DEBUG */
page_cur_delete_rec(btr_cur_get_page_cur(cursor),
- cursor->index, offsets, mtr);
+ offsets, mtr);
#ifdef UNIV_ZIP_DEBUG
- ut_a(page_zip_validate(page_zip, page, cursor->index));
+ ut_a(page_zip_validate(page_zip, page,
+ cursor->index()));
#endif /* UNIV_ZIP_DEBUG */
/* On compressed pages, the IBUF_BITMAP_FREE
@@ -5680,14 +4605,14 @@ btr_cur_optimistic_delete_func(
page, 1);
page_cur_delete_rec(btr_cur_get_page_cur(cursor),
- cursor->index, offsets, mtr);
+ offsets, mtr);
/* The change buffer does not handle inserts
into non-leaf pages, into clustered indexes,
or into the change buffer. */
- if (!dict_index_is_clust(cursor->index)
- && !cursor->index->table->is_temporary()
- && !dict_index_is_ibuf(cursor->index)) {
+ if (!cursor->index()->is_clust()
+ && !cursor->index()->table->is_temporary()
+ && !dict_index_is_ibuf(cursor->index())) {
ibuf_update_free_bits_low(block, max_ins, mtr);
}
}
@@ -5698,7 +4623,7 @@ func_exit:
mem_heap_free(heap);
}
- return(no_compress_needed);
+ return err;
}
/*************************************************************//**
@@ -5736,7 +4661,6 @@ btr_cur_pessimistic_delete(
dict_index_t* index;
rec_t* rec;
uint32_t n_reserved = 0;
- bool success;
ibool ret = FALSE;
mem_heap_t* heap;
rec_offs* offsets;
@@ -5766,13 +4690,11 @@ btr_cur_pessimistic_delete(
uint32_t n_extents = uint32_t(cursor->tree_height / 32 + 1);
- success = fsp_reserve_free_extents(&n_reserved,
- index->table->space,
- n_extents,
- FSP_CLEANING, mtr);
- if (!success) {
- *err = DB_OUT_OF_FILE_SPACE;
-
+ *err = fsp_reserve_free_extents(&n_reserved,
+ index->table->space,
+ n_extents,
+ FSP_CLEANING, mtr);
+ if (UNIV_UNLIKELY(*err != DB_SUCCESS)) {
return(FALSE);
}
}
@@ -5833,6 +4755,10 @@ btr_cur_pessimistic_delete(
const rec_t* first_rec = page_rec_get_next_const(
page_get_infimum_rec(page));
+ if (UNIV_UNLIKELY(!first_rec)) {
+ *err = DB_CORRUPTION;
+ goto err_exit;
+ }
ut_ad(!index->is_instant()
|| rec_is_metadata(first_rec, *index));
if (is_metadata || !index->is_instant()
@@ -5843,6 +4769,7 @@ btr_cur_pessimistic_delete(
/* MDEV-17383: free metadata BLOBs! */
index->clear_instant_alter();
}
+
page_cur_set_after_last(
block,
btr_cur_get_page_cur(cursor));
@@ -5855,15 +4782,15 @@ btr_cur_pessimistic_delete(
btr_search_update_hash_on_delete(cursor);
} else {
page_cur_delete_rec(btr_cur_get_page_cur(cursor),
- index, offsets, mtr);
+ offsets, mtr);
/* We must empty the PAGE_FREE list, because
after rollback, this deleted metadata record
would carry too many fields, and we would be
unable to know the size of the freed record. */
- btr_page_reorganize(btr_cur_get_page_cur(cursor),
- index, mtr);
+ *err = btr_page_reorganize(btr_cur_get_page_cur(cursor),
+ mtr);
ut_ad(!ret);
- goto return_after_reservations;
+ goto err_exit;
}
} else if (UNIV_UNLIKELY(page_rec_is_first(rec, page))) {
if (page_rec_is_last(rec, page)) {
@@ -5878,7 +4805,15 @@ discard_page:
goto return_after_reservations;
}
- next_rec = page_rec_get_next(rec);
+ if (UNIV_UNLIKELY(!(next_rec = page_rec_get_next(rec)))) {
+ ut_ad(!ret);
+ *err = DB_CORRUPTION;
+ goto err_exit;
+ }
+
+ btr_cur_t cursor;
+ cursor.page_cur.index = index;
+ cursor.page_cur.block = block;
if (!page_has_prev(page)) {
/* If we delete the leftmost node pointer on a
@@ -5891,22 +4826,19 @@ discard_page:
we need to update parent page. */
rtr_mbr_t father_mbr;
rec_t* father_rec;
- btr_cur_t father_cursor;
rec_offs* offsets;
ulint len;
- rtr_page_get_father_block(NULL, heap, index,
- block, mtr, NULL,
- &father_cursor);
- offsets = rec_get_offsets(
- btr_cur_get_rec(&father_cursor), index, NULL,
- 0, ULINT_UNDEFINED, &heap);
+ rtr_page_get_father_block(NULL, heap, mtr, NULL,
+ &cursor);
+ father_rec = btr_cur_get_rec(&cursor);
+ offsets = rec_get_offsets(father_rec, index, NULL,
+ 0, ULINT_UNDEFINED, &heap);
- father_rec = btr_cur_get_rec(&father_cursor);
rtr_read_mbr(rec_get_nth_field(
father_rec, offsets, 0, &len), &father_mbr);
- rtr_update_mbr_field(&father_cursor, offsets, NULL,
+ rtr_update_mbr_field(&cursor, offsets, NULL,
page, &father_mbr, next_rec, mtr);
ut_d(parent_latched = true);
} else {
@@ -5914,23 +4846,36 @@ discard_page:
on a page, we have to change the parent node pointer
so that it is equal to the new leftmost node pointer
on the page */
- btr_cur_t cursor;
- btr_page_get_father(index, block, mtr, &cursor);
- btr_cur_node_ptr_delete(&cursor, mtr);
+ ret = btr_page_get_father(mtr, &cursor);
+ if (!ret) {
+ *err = DB_CORRUPTION;
+ goto err_exit;
+ }
+ *err = btr_cur_node_ptr_delete(&cursor, mtr);
+ if (*err != DB_SUCCESS) {
+got_err:
+ ret = FALSE;
+ goto err_exit;
+ }
+
const ulint level = btr_page_get_level(page);
// FIXME: reuse the node_ptr from above
dtuple_t* node_ptr = dict_index_build_node_ptr(
index, next_rec, block->page.id().page_no(),
heap, level);
- btr_insert_on_non_leaf_level(
+ *err = btr_insert_on_non_leaf_level(
flags, index, level + 1, node_ptr, mtr);
+ if (*err != DB_SUCCESS) {
+ ret = FALSE;
+ goto got_err;
+ }
ut_d(parent_latched = true);
}
}
- /* SPATIAL INDEX never use SX locks; we can allow page merges
+ /* SPATIAL INDEX never uses U locks; we can allow page merges
while holding X lock on the spatial index tree.
Do not allow merges of non-leaf B-tree pages unless it is
safe to do so. */
@@ -5941,7 +4886,7 @@ discard_page:
index, page, BTR_INTENTION_DELETE, rec,
btr_node_ptr_max_size(index),
block->zip_size(), mtr);
- page_cur_delete_rec(btr_cur_get_page_cur(cursor), index,
+ page_cur_delete_rec(btr_cur_get_page_cur(cursor),
offsets, mtr);
if (min_mark_next_rec) {
@@ -5971,19 +4916,17 @@ discard_page:
return_after_reservations:
*err = DB_SUCCESS;
-
+err_exit:
mem_heap_free(heap);
- if (!srv_read_only_mode
- && page_is_leaf(page)
+#if 0 // FIXME: this used to be a no-op, and will cause trouble if enabled
+ if (page_is_leaf(page)
&& !dict_index_is_online_ddl(index)) {
-
- mtr_memo_release(mtr, dict_index_get_lock(index),
- MTR_MEMO_X_LOCK | MTR_MEMO_SX_LOCK);
-
+ mtr->release(index->lock);
/* NOTE: We cannot release root block latch here, because it
has segment header and already modified in most of cases.*/
}
+#endif
index->table->space->release_free_extents(n_reserved);
return(ret);
@@ -5992,7 +4935,7 @@ return_after_reservations:
/** Delete the node pointer in a parent page.
@param[in,out] parent cursor pointing to parent record
@param[in,out] mtr mini-transaction */
-void btr_cur_node_ptr_delete(btr_cur_t* parent, mtr_t* mtr)
+dberr_t btr_cur_node_ptr_delete(btr_cur_t* parent, mtr_t* mtr)
{
ut_ad(mtr->memo_contains_flagged(btr_cur_get_block(parent),
MTR_MEMO_PAGE_X_FIX));
@@ -6000,948 +4943,661 @@ void btr_cur_node_ptr_delete(btr_cur_t* parent, mtr_t* mtr)
ibool compressed = btr_cur_pessimistic_delete(&err, TRUE, parent,
BTR_CREATE_FLAG, false,
mtr);
- ut_a(err == DB_SUCCESS);
- if (!compressed) {
+ if (err == DB_SUCCESS && !compressed) {
btr_cur_compress_if_useful(parent, FALSE, mtr);
}
-}
-
-/*******************************************************************//**
-Adds path information to the cursor for the current page, for which
-the binary search has been performed. */
-static
-void
-btr_cur_add_path_info(
-/*==================*/
- btr_cur_t* cursor, /*!< in: cursor positioned on a page */
- ulint height, /*!< in: height of the page in tree;
- 0 means leaf node */
- ulint root_height) /*!< in: root node height in tree */
-{
- btr_path_t* slot;
-
- ut_a(cursor->path_arr);
-
- if (root_height >= BTR_PATH_ARRAY_N_SLOTS - 1) {
- /* Do nothing; return empty path */
-
- slot = cursor->path_arr;
- slot->nth_rec = ULINT_UNDEFINED;
- return;
- }
-
- if (height == 0) {
- /* Mark end of slots for path */
- slot = cursor->path_arr + root_height + 1;
- slot->nth_rec = ULINT_UNDEFINED;
- }
-
- slot = cursor->path_arr + (root_height - height);
-
- const buf_block_t* block = btr_cur_get_block(cursor);
-
- slot->nth_rec = page_rec_get_n_recs_before(btr_cur_get_rec(cursor));
- slot->n_recs = page_get_n_recs(block->frame);
- slot->page_no = block->page.id().page_no();
- slot->page_level = btr_page_get_level(block->frame);
+ return err;
}
-/*******************************************************************//**
-Estimate the number of rows between slot1 and slot2 for any level on a
-B-tree. This function starts from slot1->page and reads a few pages to
-the right, counting their records. If we reach slot2->page quickly then
-we know exactly how many records there are between slot1 and slot2 and
-we set is_n_rows_exact to TRUE. If we cannot reach slot2->page quickly
-then we calculate the average number of records in the pages scanned
-so far and assume that all pages that we did not scan up to slot2->page
-contain the same number of records, then we multiply that average to
-the number of pages between slot1->page and slot2->page (which is
-n_rows_on_prev_level). In this case we set is_n_rows_exact to FALSE.
-@return number of rows, not including the borders (exact or estimated) */
-static
-ha_rows
-btr_estimate_n_rows_in_range_on_level(
-/*==================================*/
- dict_index_t* index, /*!< in: index */
- btr_path_t* slot1, /*!< in: left border */
- btr_path_t* slot2, /*!< in: right border */
- ha_rows n_rows_on_prev_level, /*!< in: number of rows
- on the previous level for the
- same descend paths; used to
- determine the number of pages
- on this level */
- bool* is_n_rows_exact) /*!< out: TRUE if the returned
- value is exact i.e. not an
- estimation */
+/** Represents the cursor for the number-of-rows estimation. The
+content is used for level-by-level diving and estimating the number of rows
+on each level. */
+class btr_est_cur_t
{
- ha_rows n_rows = 0;
- uint n_pages_read = 0;
- ulint level;
-
- /* Assume by default that we will scan all pages between
- slot1->page_no and slot2->page_no. */
- *is_n_rows_exact = true;
-
- /* Add records from slot1->page_no which are to the right of
- the record which serves as a left border of the range, if any
- (we don't include the record itself in this count). */
- if (slot1->nth_rec <= slot1->n_recs) {
- n_rows += slot1->n_recs - slot1->nth_rec;
- }
-
- /* Add records from slot2->page_no which are to the left of
- the record which servers as a right border of the range, if any
- (we don't include the record itself in this count). */
- if (slot2->nth_rec > 1) {
- n_rows += slot2->nth_rec - 1;
- }
-
- /* Count the records in the pages between slot1->page_no and
- slot2->page_no (non inclusive), if any. */
-
- /* Do not read more than this number of pages in order not to hurt
- performance with this code which is just an estimation. If we read
- this many pages before reaching slot2->page_no then we estimate the
- average from the pages scanned so far. */
-# define N_PAGES_READ_LIMIT 10
-
- const fil_space_t* space = index->table->space;
- page_id_t page_id(space->id, slot1->page_no);
- const ulint zip_size = space->zip_size();
-
- level = slot1->page_level;
+ /* Assume a page like:
+ records: (inf, a, b, c, d, sup)
+ index of the record: 0, 1, 2, 3, 4, 5
+ */
+
+ /** Index of the record where the page cursor stopped on this level
+ (index in alphabetical order). In the above example, if the search stopped on
+ record 'c', then nth_rec will be 3. */
+ ulint m_nth_rec;
+
+ /** Number of the records on the page, not counting inf and sup.
+ In the above example n_recs will be 4. */
+ ulint m_n_recs;
+
+ /** Search tuple */
+ const dtuple_t &m_tuple;
+ /** Cursor search mode */
+ page_cur_mode_t m_mode;
+ /** Page cursor which is used for search */
+ page_cur_t m_page_cur;
+ /** Page id of the page to get on level down, can differ from
+ m_block->page.id at the moment when the child's page id is already found, but
+ the child's block has not fetched yet */
+ page_id_t m_page_id;
+ /** Current block */
+ buf_block_t *m_block;
+ /** mtr savepoint of the current block */
+ ulint m_savepoint;
+ /** Page search mode, can differ from m_mode for non-leaf pages, see c-tor
+ comments for details */
+ page_cur_mode_t m_page_mode;
+
+ /** Matched fields and bytes which are used for on-page search, see
+ btr_cur_t::(up|low)_(match|bytes) comments for details */
+ ulint m_up_match= 0;
+ ulint m_up_bytes= 0;
+ ulint m_low_match= 0;
+ ulint m_low_bytes= 0;
+
+public:
+ btr_est_cur_t(dict_index_t *index, const dtuple_t &tuple,
+ page_cur_mode_t mode)
+ : m_tuple(tuple), m_mode(mode),
+ m_page_id(index->table->space_id, index->page), m_block(nullptr)
+ {
- do {
- mtr_t mtr;
- page_t* page;
- buf_block_t* block;
- dberr_t err=DB_SUCCESS;
+ ut_ad(dict_index_check_search_tuple(index, &tuple));
+ ut_ad(dtuple_check_typed(&tuple));
+
+ m_page_cur.index = index;
+ /* We use these modified search modes on non-leaf levels of the B-tree.
+ These let us end up in the right B-tree leaf. In that leaf we use the
+ original search mode. */
+ switch (mode) {
+ case PAGE_CUR_GE:
+ m_page_mode= PAGE_CUR_L;
+ break;
+ case PAGE_CUR_G:
+ m_page_mode= PAGE_CUR_LE;
+ break;
+ default:
+#ifdef PAGE_CUR_LE_OR_EXTENDS
+ ut_ad(mode == PAGE_CUR_L || mode == PAGE_CUR_LE ||
+ mode == PAGE_CUR_LE_OR_EXTENDS);
+#else /* PAGE_CUR_LE_OR_EXTENDS */
+ ut_ad(mode == PAGE_CUR_L || mode == PAGE_CUR_LE);
+#endif /* PAGE_CUR_LE_OR_EXTENDS */
+ m_page_mode= mode;
+ break;
+ }
+ }
- mtr_start(&mtr);
+ /** Retrieve block with m_page_id, releasing the previously acquired block
+ if necessary. If this is a left border block cursor and both left and right
+ border blocks have the same parent, don't unlatch the parent, as it must be
+ latched to get the right block, and will be unlatched after the right block
+ is fetched.
+ @param level distance from the leaf page level; ULINT_UNDEFINED when
+ fetching the root page
+ @param mtr mtr
+ @param right_parent right border block parent, nullptr if the function
+ is called for the right block itself
+ @return true on success or false otherwise. */
+ bool fetch_child(ulint level, mtr_t &mtr, const buf_block_t *right_parent)
+ {
+ buf_block_t *parent_block= m_block;
+ ulint parent_savepoint= m_savepoint;
- /* Fetch the page. Because we are not holding the
- index->lock, the tree may have changed and we may be
- attempting to read a page that is no longer part of
- the B-tree. We pass BUF_GET_POSSIBLY_FREED in order to
- silence a debug assertion about this. */
- block = buf_page_get_gen(page_id, zip_size, RW_S_LATCH,
- NULL, BUF_GET_POSSIBLY_FREED,
- __FILE__, __LINE__, &mtr, &err);
+ m_block= btr_block_get(*index(), m_page_id.page_no(), RW_S_LATCH, !level,
+ &mtr, nullptr);
+ if (!m_block)
+ return false;
- ut_ad((block != NULL) == (err == DB_SUCCESS));
+ if (parent_block && parent_block != right_parent)
+ mtr.rollback_to_savepoint(parent_savepoint, parent_savepoint + 1);
- if (!block) {
- if (err == DB_DECRYPTION_FAILED) {
- ib_push_warning((void *)NULL,
- DB_DECRYPTION_FAILED,
- "Table %s is encrypted but encryption service or"
- " used key_id is not available. "
- " Can't continue reading table.",
- index->table->name.m_name);
- index->table->file_unreadable = true;
- }
+ m_savepoint= mtr.get_savepoint() - 1;
- mtr_commit(&mtr);
- goto inexact;
- }
+ return level == ULINT_UNDEFINED ||
+ btr_page_get_level(m_block->page.frame) == level;
+ }
- page = buf_block_get_frame(block);
+ /** Sets page mode for leaves */
+ void set_page_mode_for_leaves() { m_page_mode= m_mode; }
- /* It is possible that the tree has been reorganized in the
- meantime and this is a different page. If this happens the
- calculated estimate will be bogus, which is not fatal as
- this is only an estimate. We are sure that a page with
- page_no exists because InnoDB never frees pages, only
- reuses them. */
- if (!fil_page_index_page_check(page)
- || btr_page_get_index_id(page) != index->id
- || btr_page_get_level(page) != level) {
-
- /* The page got reused for something else */
- mtr_commit(&mtr);
- goto inexact;
- }
+ /** Searches on the current page. If there is no border in m_tuple, then
+ just move the cursor to the leftmost or rightmost record.
+ @param level current level on tree.
+ @param root_height root height
+ @param left true if this is left border, false otherwise.
+ @return true on success, false otherwise. */
+ bool search_on_page(ulint level, ulint root_height, bool left)
+ {
+ if (level != btr_page_get_level(m_block->page.frame))
+ return false;
- /* It is possible but highly unlikely that the page was
- originally written by an old version of InnoDB that did
- not initialize FIL_PAGE_TYPE on other than B-tree pages.
- For example, this could be an almost-empty BLOB page
- that happens to contain the magic values in the fields
- that we checked above. */
+ m_n_recs= page_get_n_recs(m_block->page.frame);
- n_pages_read++;
+ if (dtuple_get_n_fields(&m_tuple) > 0)
+ {
+ m_up_bytes= m_low_bytes= 0;
+ m_page_cur.block= m_block;
+ if (page_cur_search_with_match(&m_tuple, m_page_mode,
+ &m_up_match, &m_low_match, &m_page_cur,
+ nullptr))
+ return false;
+ m_nth_rec= page_rec_get_n_recs_before(page_cur_get_rec(&m_page_cur));
+ }
+ else if (left)
+ {
+ page_cur_set_before_first(m_block, &m_page_cur);
+ if (level)
+ {
+ if (!page_cur_move_to_next(&m_page_cur))
+ return false;
+ m_nth_rec= 1;
+ }
+ else
+ m_nth_rec= 0;
+ }
+ else
+ {
+ m_nth_rec= m_n_recs;
+ if (!level)
+ {
+ page_cur_set_after_last(m_block, &m_page_cur);
+ ++m_nth_rec;
+ }
+ else
+ {
+ m_page_cur.block= m_block;
+ m_page_cur.rec= page_rec_get_nth(m_block->page.frame, m_nth_rec);
+ }
+ }
- if (page_id.page_no() != slot1->page_no) {
- /* Do not count the records on slot1->page_no,
- we already counted them before this loop. */
- n_rows += page_get_n_recs(page);
- }
+ return true;
+ }
- page_id.set_page_no(btr_page_get_next(page));
+ /** Gets page id of the current record child.
+ @param offsets offsets array.
+ @param heap heap for offsets array */
+ void get_child(rec_offs **offsets, mem_heap_t **heap)
+ {
+ const rec_t *node_ptr= page_cur_get_rec(&m_page_cur);
- mtr_commit(&mtr);
+ /* FIXME: get the child page number directly without computing offsets */
+ *offsets= rec_get_offsets(node_ptr, index(), *offsets, 0, ULINT_UNDEFINED,
+ heap);
- if (n_pages_read == N_PAGES_READ_LIMIT
- || page_id.page_no() == FIL_NULL) {
- /* Either we read too many pages or
- we reached the end of the level without passing
- through slot2->page_no, the tree must have changed
- in the meantime */
- goto inexact;
- }
+ /* Go to the child node */
+ m_page_id.set_page_no(btr_node_ptr_get_child_page_no(node_ptr, *offsets));
+ }
- } while (page_id.page_no() != slot2->page_no);
+ /** @return true if left border should be counted */
+ bool should_count_the_left_border() const
+ {
+ if (dtuple_get_n_fields(&m_tuple) > 0)
+ {
+ ut_ad(!page_rec_is_infimum(page_cur_get_rec(&m_page_cur)));
+ return !page_rec_is_supremum(page_cur_get_rec(&m_page_cur));
+ }
+ ut_ad(page_rec_is_infimum(page_cur_get_rec(&m_page_cur)));
+ return false;
+ }
- return(n_rows);
+ /** @return true if right border should be counted */
+ bool should_count_the_right_border() const
+ {
+ if (dtuple_get_n_fields(&m_tuple) > 0)
+ {
+ const rec_t *rec= page_cur_get_rec(&m_page_cur);
+ ut_ad(!(m_mode == PAGE_CUR_L && page_rec_is_supremum(rec)));
+
+ return (m_mode == PAGE_CUR_LE /* if the range is '<=' */
+ /* and the record was found */
+ && m_low_match >= dtuple_get_n_fields(&m_tuple)) ||
+ (m_mode == PAGE_CUR_L /* or if the range is '<' */
+ /* and there are any records to match the criteria, i.e. if the
+ minimum record on the tree is 5 and x < 7 is specified then the
+ cursor will be positioned at 5 and we should count the border,
+ but if x < 2 is specified, then the cursor will be positioned at
+ 'inf' and we should not count the border */
+ && !page_rec_is_infimum(rec));
+ /* Notice that for "WHERE col <= 'foo'" the server passes to
+ ha_innobase::records_in_range(): min_key=NULL (left-unbounded) which is
+ expected max_key='foo' flag=HA_READ_AFTER_KEY (PAGE_CUR_G), which is
+ unexpected - one would expect flag=HA_READ_KEY_OR_PREV (PAGE_CUR_LE). In
+ this case the cursor will be positioned on the first record to the right
+ of the requested one (can also be positioned on the 'sup') and we should
+ not count the right border. */
+ }
+ ut_ad(page_rec_is_supremum(page_cur_get_rec(&m_page_cur)));
-inexact:
+ /* The range specified is without a right border, just 'x > 123'
+ or 'x >= 123' and search_on_page() positioned the cursor on the
+ supremum record on the rightmost page, which must not be counted. */
+ return false;
+ }
- *is_n_rows_exact = false;
+ /** @return index */
+ const dict_index_t *index() const { return m_page_cur.index; }
- /* We did interrupt before reaching slot2->page */
+ /** @return current block */
+ const buf_block_t *block() const { return m_block; }
- if (n_pages_read > 0) {
- /* The number of pages on this level is
- n_rows_on_prev_level, multiply it by the
- average number of recs per page so far */
- n_rows = n_rows_on_prev_level * n_rows / n_pages_read;
- } else {
- /* The tree changed before we could even
- start with slot1->page_no */
- n_rows = 10;
- }
+ /** @return current page id */
+ page_id_t page_id() const { return m_page_id; }
- return(n_rows);
-}
+ /** Copies block pointer and savepoint from another btr_est_cur_t in the case
+ where both left and right border cursors point to the same block.
+ @param o reference to the other btr_est_cur_t object. */
+ void set_block(const btr_est_cur_t &o)
+ {
+ m_block= o.m_block;
+ m_savepoint= o.m_savepoint;
+ }
-/** If the tree gets changed too much between the two dives for the left
-and right boundary then btr_estimate_n_rows_in_range_low() will retry
-that many times before giving up and returning the value stored in
-rows_in_range_arbitrary_ret_val. */
-static const unsigned rows_in_range_max_retries = 4;
+ /** @return current record number. */
+ ulint nth_rec() const { return m_nth_rec; }
-/** We pretend that a range has that many records if the tree keeps changing
-for rows_in_range_max_retries retries while we try to estimate the records
-in a given range. */
-static const ha_rows rows_in_range_arbitrary_ret_val = 10;
+ /** @return number of records in the current page. */
+ ulint n_recs() const { return m_n_recs; }
+};
-/** Estimates the number of rows in a given index range.
-@param[in] index index
-@param[in] tuple1 range start
-@param[in] tuple2 range end
-@param[in] nth_attempt if the tree gets modified too much while
-we are trying to analyze it, then we will retry (this function will call
-itself, incrementing this parameter)
-@return estimated number of rows; if after rows_in_range_max_retries
-retries the tree keeps changing, then we will just return
-rows_in_range_arbitrary_ret_val as a result (if
-nth_attempt >= rows_in_range_max_retries and the tree is modified between
-the two dives). */
-static
-ha_rows
-btr_estimate_n_rows_in_range_low(
- dict_index_t* index,
- btr_pos_t* tuple1,
- btr_pos_t* tuple2,
- unsigned nth_attempt)
+/** Estimate the number of rows between the left record of the path and the
+right one (non-inclusive) for a certain level of the B-tree. This function
+starts from the page next to the left page and reads a few pages to the right,
+counting their records. If we reach the right page quickly then we know exactly
+how many records there are between left and right records and we set
+is_n_rows_exact to true. After some page is latched, the previous page is
+unlatched. If we cannot reach the right page quickly then we calculate the
+average number of records in the pages scanned so far and assume that all pages
+that we did not scan up to the right page contain the same number of records,
+then we multiply that average to the number of pages between right and left
+records (which is n_rows_on_prev_level). In this case we set is_n_rows_exact to
+false.
+@param level current level.
+@param left_cur the cursor of the left page.
+@param right_page_no right page number.
+@param n_rows_on_prev_level number of rows on the previous level.
+@param[out] is_n_rows_exact true if exact rows number is returned.
+@param[in,out] mtr mini-transaction.
+@return number of rows, not including the borders (exact or estimated). */
+static ha_rows btr_estimate_n_rows_in_range_on_level(
+ ulint level, btr_est_cur_t &left_cur, uint32_t right_page_no,
+ ha_rows n_rows_on_prev_level, bool &is_n_rows_exact, mtr_t &mtr)
{
- btr_path_t path1[BTR_PATH_ARRAY_N_SLOTS];
- btr_path_t path2[BTR_PATH_ARRAY_N_SLOTS];
- btr_cur_t cursor;
- btr_path_t* slot1;
- btr_path_t* slot2;
- bool diverged;
- bool diverged_lot;
- ulint divergence_level;
- ha_rows n_rows;
- bool is_n_rows_exact;
- ulint i;
- mtr_t mtr;
- ha_rows table_n_rows;
- page_cur_mode_t mode2= tuple2->mode;
-
- table_n_rows = dict_table_get_n_rows(index->table);
-
- /* Below we dive to the two records specified by tuple1 and tuple2 and
- we remember the entire dive paths from the tree root. The place where
- the tuple1 path ends on the leaf level we call "left border" of our
- interval and the place where the tuple2 path ends on the leaf level -
- "right border". We take care to either include or exclude the interval
- boundaries depending on whether <, <=, > or >= was specified. For
- example if "5 < x AND x <= 10" then we should not include the left
- boundary, but should include the right one. */
-
- mtr_start(&mtr);
-
- cursor.path_arr = path1;
-
- bool should_count_the_left_border;
-
- if (dtuple_get_n_fields(tuple1->tuple) > 0) {
-
- btr_cur_search_to_nth_level(index, 0, tuple1->tuple,
- tuple1->mode,
- BTR_SEARCH_LEAF | BTR_ESTIMATE,
- &cursor, __FILE__, __LINE__, &mtr);
-
- ut_ad(!page_rec_is_infimum(btr_cur_get_rec(&cursor)));
-
- /* We should count the border if there are any records to
- match the criteria, i.e. if the maximum record on the tree is
- 5 and x > 3 is specified then the cursor will be positioned at
- 5 and we should count the border, but if x > 7 is specified,
- then the cursor will be positioned at 'sup' on the rightmost
- leaf page in the tree and we should not count the border. */
- should_count_the_left_border
- = !page_rec_is_supremum(btr_cur_get_rec(&cursor));
- } else {
- dberr_t err = DB_SUCCESS;
-
- err = btr_cur_open_at_index_side(true, index,
- BTR_SEARCH_LEAF | BTR_ESTIMATE,
- &cursor, 0, &mtr);
-
- if (err != DB_SUCCESS) {
- ib::warn() << " Error code: " << err
- << " btr_estimate_n_rows_in_range_low "
- << " called from file: "
- << __FILE__ << " line: " << __LINE__
- << " table: " << index->table->name
- << " index: " << index->name;
- }
-
- ut_ad(page_rec_is_infimum(btr_cur_get_rec(&cursor)));
-
- /* The range specified is wihout a left border, just
- 'x < 123' or 'x <= 123' and btr_cur_open_at_index_side()
- positioned the cursor on the infimum record on the leftmost
- page, which must not be counted. */
- should_count_the_left_border = false;
- }
-
- tuple1->page_id= cursor.page_cur.block->page.id();
-
- mtr_commit(&mtr);
-
- if (!index->is_readable()) {
- return 0;
- }
-
- mtr_start(&mtr);
-
- cursor.path_arr = path2;
-
- bool should_count_the_right_border;
-
- if (dtuple_get_n_fields(tuple2->tuple) > 0) {
-
- btr_cur_search_to_nth_level(index, 0, tuple2->tuple,
- mode2,
- BTR_SEARCH_LEAF | BTR_ESTIMATE,
- &cursor, __FILE__, __LINE__, &mtr);
-
- const rec_t* rec = btr_cur_get_rec(&cursor);
-
- ut_ad(!(mode2 == PAGE_CUR_L && page_rec_is_supremum(rec)));
-
- should_count_the_right_border
- = (mode2 == PAGE_CUR_LE /* if the range is '<=' */
- /* and the record was found */
- && cursor.low_match >= dtuple_get_n_fields(tuple2->tuple))
- || (mode2 == PAGE_CUR_L /* or if the range is '<' */
- /* and there are any records to match the criteria,
- i.e. if the minimum record on the tree is 5 and
- x < 7 is specified then the cursor will be
- positioned at 5 and we should count the border, but
- if x < 2 is specified, then the cursor will be
- positioned at 'inf' and we should not count the
- border */
- && !page_rec_is_infimum(rec));
- /* Notice that for "WHERE col <= 'foo'" MySQL passes to
- ha_innobase::records_in_range():
- min_key=NULL (left-unbounded) which is expected
- max_key='foo' flag=HA_READ_AFTER_KEY (PAGE_CUR_G), which is
- unexpected - one would expect
- flag=HA_READ_KEY_OR_PREV (PAGE_CUR_LE). In this case the
- cursor will be positioned on the first record to the right of
- the requested one (can also be positioned on the 'sup') and
- we should not count the right border. */
- } else {
- dberr_t err = DB_SUCCESS;
-
- err = btr_cur_open_at_index_side(false, index,
- BTR_SEARCH_LEAF | BTR_ESTIMATE,
- &cursor, 0, &mtr);
-
- if (err != DB_SUCCESS) {
- ib::warn() << " Error code: " << err
- << " btr_estimate_n_rows_in_range_low "
- << " called from file: "
- << __FILE__ << " line: " << __LINE__
- << " table: " << index->table->name
- << " index: " << index->name;
- }
-
- ut_ad(page_rec_is_supremum(btr_cur_get_rec(&cursor)));
-
- /* The range specified is wihout a right border, just
- 'x > 123' or 'x >= 123' and btr_cur_open_at_index_side()
- positioned the cursor on the supremum record on the rightmost
- page, which must not be counted. */
- should_count_the_right_border = false;
- }
-
- tuple2->page_id= cursor.page_cur.block->page.id();
-
- mtr_commit(&mtr);
-
- /* We have the path information for the range in path1 and path2 */
-
- n_rows = 0;
- is_n_rows_exact = true;
-
- /* This becomes true when the two paths do not pass through the
- same pages anymore. */
- diverged = false;
-
- /* This becomes true when the paths are not the same or adjacent
- any more. This means that they pass through the same or
- neighboring-on-the-same-level pages only. */
- diverged_lot = false;
-
- /* This is the level where paths diverged a lot. */
- divergence_level = 1000000;
-
- for (i = 0; ; i++) {
- ut_ad(i < BTR_PATH_ARRAY_N_SLOTS);
-
- slot1 = path1 + i;
- slot2 = path2 + i;
-
- if (slot1->nth_rec == ULINT_UNDEFINED
- || slot2->nth_rec == ULINT_UNDEFINED) {
-
- /* Here none of the borders were counted. For example,
- if on the leaf level we descended to:
- (inf, a, b, c, d, e, f, sup)
- ^ ^
- path1 path2
- then n_rows will be 2 (c and d). */
-
- if (is_n_rows_exact) {
- /* Only fiddle to adjust this off-by-one
- if the number is exact, otherwise we do
- much grosser adjustments below. */
-
- btr_path_t* last1 = &path1[i - 1];
- btr_path_t* last2 = &path2[i - 1];
-
- /* If both paths end up on the same record on
- the leaf level. */
- if (last1->page_no == last2->page_no
- && last1->nth_rec == last2->nth_rec) {
-
- /* n_rows can be > 0 here if the paths
- were first different and then converged
- to the same record on the leaf level.
- For example:
- SELECT ... LIKE 'wait/synch/rwlock%'
- mode1=PAGE_CUR_GE,
- tuple1="wait/synch/rwlock"
- path1[0]={nth_rec=58, n_recs=58,
- page_no=3, page_level=1}
- path1[1]={nth_rec=56, n_recs=55,
- page_no=119, page_level=0}
-
- mode2=PAGE_CUR_G
- tuple2="wait/synch/rwlock"
- path2[0]={nth_rec=57, n_recs=57,
- page_no=3, page_level=1}
- path2[1]={nth_rec=56, n_recs=55,
- page_no=119, page_level=0} */
-
- /* If the range is such that we should
- count both borders, then avoid
- counting that record twice - once as a
- left border and once as a right
- border. */
- if (should_count_the_left_border
- && should_count_the_right_border) {
-
- n_rows = 1;
- } else {
- /* Some of the borders should
- not be counted, e.g. [3,3). */
- n_rows = 0;
- }
- } else {
- if (should_count_the_left_border) {
- n_rows++;
- }
-
- if (should_count_the_right_border) {
- n_rows++;
- }
- }
- }
-
- if (i > divergence_level + 1 && !is_n_rows_exact) {
- /* In trees whose height is > 1 our algorithm
- tends to underestimate: multiply the estimate
- by 2: */
-
- n_rows = n_rows * 2;
- }
-
- DBUG_EXECUTE_IF("bug14007649", return(n_rows););
-
- /* Do not estimate the number of rows in the range
- to over 1 / 2 of the estimated rows in the whole
- table */
+ ha_rows n_rows= 0;
+ uint n_pages_read= 0;
+ /* Do not read more than this number of pages in order not to hurt
+ performance with this code which is just an estimation. If we read this many
+ pages before reaching right_page_no, then we estimate the average from the
+ pages scanned so far. */
+ static constexpr uint n_pages_read_limit= 9;
+ ulint savepoint= 0;
+ buf_block_t *block= nullptr;
+ const dict_index_t *index= left_cur.index();
+
+ /* Assume by default that we will scan all pages between left and right(non
+ inclusive) pages */
+ is_n_rows_exact= true;
+
+ /* Add records from the left page which are to the right of the record which
+ serves as a left border of the range, if any (we don't include the record
+ itself in this count). */
+ if (left_cur.nth_rec() <= left_cur.n_recs())
+ {
+ n_rows+= left_cur.n_recs() - left_cur.nth_rec();
+ }
- if (n_rows > table_n_rows / 2 && !is_n_rows_exact) {
+ /* Count the records in the pages between left and right (non inclusive)
+ pages */
- n_rows = table_n_rows / 2;
+ const fil_space_t *space= index->table->space;
+ page_id_t page_id(space->id,
+ btr_page_get_next(buf_block_get_frame(left_cur.block())));
- /* If there are just 0 or 1 rows in the table,
- then we estimate all rows are in the range */
+ if (page_id.page_no() == FIL_NULL)
+ goto inexact;
- if (n_rows == 0) {
- n_rows = table_n_rows;
- }
- }
+ do
+ {
+ page_t *page;
+ buf_block_t *prev_block= block;
+ ulint prev_savepoint= savepoint;
- return(n_rows);
- }
+ savepoint= mtr.get_savepoint();
- if (!diverged && slot1->nth_rec != slot2->nth_rec) {
+ /* Fetch the page. */
+ block= btr_block_get(*index, page_id.page_no(), RW_S_LATCH, !level, &mtr,
+ nullptr);
- /* If both slots do not point to the same page,
- this means that the tree must have changed between
- the dive for slot1 and the dive for slot2 at the
- beginning of this function. */
- if (slot1->page_no != slot2->page_no
- || slot1->page_level != slot2->page_level) {
+ if (prev_block)
+ {
+ mtr.rollback_to_savepoint(prev_savepoint, prev_savepoint + 1);
+ if (block)
+ savepoint--;
+ }
- /* If the tree keeps changing even after a
- few attempts, then just return some arbitrary
- number. */
- if (nth_attempt >= rows_in_range_max_retries) {
- return(rows_in_range_arbitrary_ret_val);
- }
+ if (!block || btr_page_get_level(buf_block_get_frame(block)) != level)
+ goto inexact;
- return btr_estimate_n_rows_in_range_low(
- index, tuple1, tuple2,
- nth_attempt + 1);
- }
+ page= buf_block_get_frame(block);
- diverged = true;
-
- if (slot1->nth_rec < slot2->nth_rec) {
- /* We do not count the borders (nor the left
- nor the right one), thus "- 1". */
- n_rows = slot2->nth_rec - slot1->nth_rec - 1;
-
- if (n_rows > 0) {
- /* There is at least one row between
- the two borders pointed to by slot1
- and slot2, so on the level below the
- slots will point to non-adjacent
- pages. */
- diverged_lot = true;
- divergence_level = i;
- }
- } else {
- /* It is possible that
- slot1->nth_rec >= slot2->nth_rec
- if, for example, we have a single page
- tree which contains (inf, 5, 6, supr)
- and we select where x > 20 and x < 30;
- in this case slot1->nth_rec will point
- to the supr record and slot2->nth_rec
- will point to 6. */
- n_rows = 0;
- should_count_the_left_border = false;
- should_count_the_right_border = false;
- }
+ /* It is possible but highly unlikely that the page was originally written
+ by an old version of InnoDB that did not initialize FIL_PAGE_TYPE on other
+ than B-tree pages. For example, this could be an almost-empty BLOB page
+ that happens to contain the magic values in the fields
+ that we checked above. */
- } else if (diverged && !diverged_lot) {
+ n_pages_read++;
- if (slot1->nth_rec < slot1->n_recs
- || slot2->nth_rec > 1) {
+ n_rows+= page_get_n_recs(page);
- diverged_lot = true;
- divergence_level = i;
+ page_id.set_page_no(btr_page_get_next(page));
- n_rows = 0;
+ if (n_pages_read == n_pages_read_limit)
+ {
+ /* We read too many pages or we reached the end of the level
+ without passing through right_page_no. */
+ goto inexact;
+ }
- if (slot1->nth_rec < slot1->n_recs) {
- n_rows += slot1->n_recs
- - slot1->nth_rec;
- }
+ } while (page_id.page_no() != right_page_no);
- if (slot2->nth_rec > 1) {
- n_rows += slot2->nth_rec - 1;
- }
- }
- } else if (diverged_lot) {
+ if (block)
+ {
+ ut_ad(block == mtr.at_savepoint(savepoint));
+ mtr.rollback_to_savepoint(savepoint, savepoint + 1);
+ }
- n_rows = btr_estimate_n_rows_in_range_on_level(
- index, slot1, slot2, n_rows,
- &is_n_rows_exact);
- }
- }
-}
+ return (n_rows);
-/** Estimates the number of rows in a given index range.
-@param[in] index index
-@param[in] tuple1 range start, may also be empty tuple
-@param[in] mode1 search mode for range start
-@param[in] tuple2 range end, may also be empty tuple
-@param[in] mode2 search mode for range end
-@return estimated number of rows */
-ha_rows
-btr_estimate_n_rows_in_range(
- dict_index_t* index,
- btr_pos_t *tuple1,
- btr_pos_t *tuple2)
-{
- return btr_estimate_n_rows_in_range_low(
- index, tuple1, tuple2, 1);
-}
+inexact:
-/*******************************************************************//**
-Record the number of non_null key values in a given index for
-each n-column prefix of the index where 1 <= n <= dict_index_get_n_unique(index).
-The estimates are eventually stored in the array:
-index->stat_n_non_null_key_vals[], which is indexed from 0 to n-1. */
-static
-void
-btr_record_not_null_field_in_rec(
-/*=============================*/
- ulint n_unique, /*!< in: dict_index_get_n_unique(index),
- number of columns uniquely determine
- an index entry */
- const rec_offs* offsets, /*!< in: rec_get_offsets(rec, index),
- its size could be for all fields or
- that of "n_unique" */
- ib_uint64_t* n_not_null) /*!< in/out: array to record number of
- not null rows for n-column prefix */
-{
- ulint i;
+ if (block)
+ {
+ ut_ad(block == mtr.at_savepoint(savepoint));
+ mtr.rollback_to_savepoint(savepoint, savepoint + 1);
+ }
- ut_ad(rec_offs_n_fields(offsets) >= n_unique);
+ is_n_rows_exact= false;
- if (n_not_null == NULL) {
- return;
- }
+ /* We did interrupt before reaching right page */
- for (i = 0; i < n_unique; i++) {
- if (rec_offs_nth_sql_null(offsets, i)) {
- break;
- }
+ if (n_pages_read > 0)
+ {
+ /* The number of pages on this level is
+ n_rows_on_prev_level, multiply it by the
+ average number of recs per page so far */
+ n_rows= n_rows_on_prev_level * n_rows / n_pages_read;
+ }
+ else
+ {
+ n_rows= 10;
+ }
- n_not_null[i]++;
- }
+ return (n_rows);
}
-/** Estimates the number of different key values in a given index, for
-each n-column prefix of the index where 1 <= n <= dict_index_get_n_unique(index).
-The estimates are stored in the array index->stat_n_diff_key_vals[] (indexed
-0..n_uniq-1) and the number of pages that were sampled is saved in
-result.n_sample_sizes[].
-If innodb_stats_method is nulls_ignored, we also record the number of
-non-null values for each prefix and stored the estimates in
-array result.n_non_null_key_vals.
-@param[in] index index
-@return vector with statistics information
-empty vector if the index is unavailable. */
-std::vector<index_field_stats_t>
-btr_estimate_number_of_different_key_vals(dict_index_t* index)
+/** Estimates the number of rows in a given index range. Do search in the left
+page, then if there are pages between left and right ones, read a few pages to
+the right, if the right page is reached, count the exact number of rows without
+fetching the right page, the right page will be fetched in the caller of this
+function and the amount of its rows will be added. If the right page is not
+reached, count the estimated(see btr_estimate_n_rows_in_range_on_level() for
+details) rows number, and fetch the right page. If leaves are reached, unlatch
+non-leaf pages except the right leaf parent. After the right leaf page is
+fetched, commit mtr.
+@param[in] index index
+@param[in] range_start range start
+@param[in] range_end range end
+@return estimated number of rows; */
+ha_rows btr_estimate_n_rows_in_range(dict_index_t *index,
+ btr_pos_t *range_start,
+ btr_pos_t *range_end)
{
- btr_cur_t cursor;
- page_t* page;
- rec_t* rec;
- ulint n_cols;
- ib_uint64_t* n_diff;
- ib_uint64_t* n_not_null;
- ibool stats_null_not_equal;
- uintmax_t n_sample_pages=1; /* number of pages to sample */
- ulint not_empty_flag = 0;
- ulint total_external_size = 0;
- ulint i;
- ulint j;
- uintmax_t add_on;
- mtr_t mtr;
- mem_heap_t* heap = NULL;
- rec_offs* offsets_rec = NULL;
- rec_offs* offsets_next_rec = NULL;
-
- std::vector<index_field_stats_t> result;
-
- /* For spatial index, there is no such stats can be
- fetched. */
- ut_ad(!dict_index_is_spatial(index));
-
- n_cols = dict_index_get_n_unique(index);
-
- heap = mem_heap_create((sizeof *n_diff + sizeof *n_not_null)
- * n_cols
- + dict_index_get_n_fields(index)
- * (sizeof *offsets_rec
- + sizeof *offsets_next_rec));
-
- n_diff = (ib_uint64_t*) mem_heap_zalloc(
- heap, n_cols * sizeof(n_diff[0]));
-
- n_not_null = NULL;
-
- /* Check srv_innodb_stats_method setting, and decide whether we
- need to record non-null value and also decide if NULL is
- considered equal (by setting stats_null_not_equal value) */
- switch (srv_innodb_stats_method) {
- case SRV_STATS_NULLS_IGNORED:
- n_not_null = (ib_uint64_t*) mem_heap_zalloc(
- heap, n_cols * sizeof *n_not_null);
- /* fall through */
+ DBUG_ENTER("btr_estimate_n_rows_in_range");
- case SRV_STATS_NULLS_UNEQUAL:
- /* for both SRV_STATS_NULLS_IGNORED and SRV_STATS_NULLS_UNEQUAL
- case, we will treat NULLs as unequal value */
- stats_null_not_equal = TRUE;
- break;
-
- case SRV_STATS_NULLS_EQUAL:
- stats_null_not_equal = FALSE;
- break;
+ if (UNIV_UNLIKELY(index->page == FIL_NULL || index->is_corrupted()))
+ DBUG_RETURN(0);
- default:
- ut_error;
- }
+ ut_ad(index->is_btree());
- if (srv_stats_sample_traditional) {
- /* It makes no sense to test more pages than are contained
- in the index, thus we lower the number if it is too high */
- if (srv_stats_transient_sample_pages > index->stat_index_size) {
- if (index->stat_index_size > 0) {
- n_sample_pages = index->stat_index_size;
- }
- } else {
- n_sample_pages = srv_stats_transient_sample_pages;
- }
- } else {
- /* New logaritmic number of pages that are estimated.
- Number of pages estimated should be between 1 and
- index->stat_index_size.
+ btr_est_cur_t p1(index, *range_start->tuple, range_start->mode);
+ btr_est_cur_t p2(index, *range_end->tuple, range_end->mode);
+ mtr_t mtr;
- If we have only 0 or 1 index pages then we can only take 1
- sample. We have already initialized n_sample_pages to 1.
+ ulint height;
+ ulint root_height= 0; /* remove warning */
- So taking index size as I and sample as S and log(I)*S as L
+ mem_heap_t *heap= NULL;
+ rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
+ rec_offs *offsets= offsets_;
+ rec_offs_init(offsets_);
- requirement 1) we want the out limit of the expression to not exceed I;
- requirement 2) we want the ideal pages to be at least S;
- so the current expression is min(I, max( min(S,I), L)
+ mtr.start();
- looking for simplifications:
+ ut_ad(mtr.get_savepoint() == 0);
+ mtr_s_lock_index(index, &mtr);
- case 1: assume S < I
- min(I, max( min(S,I), L) -> min(I , max( S, L))
+ ha_rows table_n_rows= dict_table_get_n_rows(index->table);
- but since L=LOG2(I)*S and log2(I) >=1 L>S always so max(S,L) = L.
+ height= ULINT_UNDEFINED;
- so we have: min(I , L)
+ /* This becomes true when the two paths do not pass through the same pages
+ anymore. */
+ bool diverged= false;
+ /* This is the height, i.e. the number of levels from the root, where paths
+ are not the same or adjacent any more. */
+ ulint divergence_height= ULINT_UNDEFINED;
+ bool should_count_the_left_border= true;
+ bool should_count_the_right_border= true;
+ bool is_n_rows_exact= true;
+ ha_rows n_rows= 0;
- case 2: assume I < S
- min(I, max( min(S,I), L) -> min(I, max( I, L))
-
- case 2a: L > I
- min(I, max( I, L)) -> min(I, L) -> I
-
- case 2b: when L < I
- min(I, max( I, L)) -> min(I, I ) -> I
-
- so taking all case2 paths is I, our expression is:
- n_pages = S < I? min(I,L) : I
- */
- if (index->stat_index_size > 1) {
- n_sample_pages = (srv_stats_transient_sample_pages < index->stat_index_size)
- ? ut_min(index->stat_index_size,
- static_cast<ulint>(
- log2(double(index->stat_index_size))
- * double(srv_stats_transient_sample_pages)))
- : index->stat_index_size;
- }
- }
-
- /* Sanity check */
- ut_ad(n_sample_pages > 0 && n_sample_pages <= (index->stat_index_size <= 1 ? 1 : index->stat_index_size));
-
- /* We sample some pages in the index to get an estimate */
-
- for (i = 0; i < n_sample_pages; i++) {
- mtr_start(&mtr);
-
- bool available;
-
- available = btr_cur_open_at_rnd_pos(index, BTR_SEARCH_LEAF,
- &cursor, &mtr);
-
- if (!available) {
- mtr_commit(&mtr);
- mem_heap_free(heap);
-
- return result;
- }
-
- /* Count the number of different key values for each prefix of
- the key on this index page. If the prefix does not determine
- the index record uniquely in the B-tree, then we subtract one
- because otherwise our algorithm would give a wrong estimate
- for an index where there is just one key value. */
+ /* Loop and search until we arrive at the desired level. */
+search_loop:
+ if (!p1.fetch_child(height, mtr, p2.block()))
+ goto error;
- if (!index->is_readable()) {
- mtr_commit(&mtr);
- goto exit_loop;
- }
+ if (height == ULINT_UNDEFINED)
+ {
+ /* We are in the root node */
+ height= btr_page_get_level(buf_block_get_frame(p1.block()));
+ root_height= height;
+ }
- page = btr_cur_get_page(&cursor);
+ if (!height)
+ {
+ p1.set_page_mode_for_leaves();
+ p2.set_page_mode_for_leaves();
+ }
- rec = page_rec_get_next(page_get_infimum_rec(page));
- const ulint n_core = page_is_leaf(page)
- ? index->n_core_fields : 0;
+ if (p1.page_id() == p2.page_id())
+ p2.set_block(p1);
+ else
+ {
+ ut_ad(diverged);
+ if (divergence_height != ULINT_UNDEFINED) {
+ /* We need to call p1.search_on_page() here as
+ btr_estimate_n_rows_in_range_on_level() uses p1.m_n_recs and
+ p1.m_nth_rec. */
+ if (!p1.search_on_page(height, root_height, true))
+ goto error;
+ n_rows= btr_estimate_n_rows_in_range_on_level(
+ height, p1, p2.page_id().page_no(), n_rows, is_n_rows_exact, mtr);
+ }
+ if (!p2.fetch_child(height, mtr, nullptr))
+ goto error;
+ }
- if (!page_rec_is_supremum(rec)) {
- not_empty_flag = 1;
- offsets_rec = rec_get_offsets(rec, index, offsets_rec,
- n_core,
- ULINT_UNDEFINED, &heap);
+ if (height == 0)
+ /* There is no need to release non-leaf pages here as they must already be
+ unlatched in btr_est_cur_t::fetch_child(). Try to search on pages after
+ releasing the index latch, to decrease contention. */
+ mtr.rollback_to_savepoint(0, 1);
- if (n_not_null != NULL) {
- btr_record_not_null_field_in_rec(
- n_cols, offsets_rec, n_not_null);
- }
- }
+ /* There is no need to search on left page if
+ divergence_height != ULINT_UNDEFINED, as it was already searched before
+ btr_estimate_n_rows_in_range_on_level() call */
+ if (divergence_height == ULINT_UNDEFINED &&
+ !p1.search_on_page(height, root_height, true))
+ goto error;
- while (!page_rec_is_supremum(rec)) {
- ulint matched_fields;
- rec_t* next_rec = page_rec_get_next(rec);
- if (page_rec_is_supremum(next_rec)) {
- total_external_size +=
- btr_rec_get_externally_stored_len(
- rec, offsets_rec);
- break;
- }
+ if (!p2.search_on_page(height, root_height, false))
+ goto error;
- offsets_next_rec = rec_get_offsets(next_rec, index,
- offsets_next_rec,
- n_core,
- ULINT_UNDEFINED,
- &heap);
+ if (!diverged && (p1.nth_rec() != p2.nth_rec()))
+ {
+ ut_ad(p1.page_id() == p2.page_id());
+ diverged= true;
+ if (p1.nth_rec() < p2.nth_rec())
+ {
+ /* We do not count the borders (nor the left nor the right one), thus
+ "- 1". */
+ n_rows= p2.nth_rec() - p1.nth_rec() - 1;
+
+ if (n_rows > 0)
+ {
+ /* There is at least one row between the two borders pointed to by p1
+ and p2, so on the level below the slots will point to non-adjacent
+ pages. */
+ divergence_height= root_height - height;
+ }
+ }
+ else
+ {
+ /* It is possible that p1->nth_rec > p2->nth_rec if, for example, we have
+ a single page tree which contains (inf, 5, 6, supr) and we select where x
+ > 20 and x < 30; in this case p1->nth_rec will point to the supr record
+ and p2->nth_rec will point to 6. */
+ n_rows= 0;
+ should_count_the_left_border= false;
+ should_count_the_right_border= false;
+ }
+ }
+ else if (diverged && divergence_height == ULINT_UNDEFINED)
+ {
- cmp_rec_rec(rec, next_rec,
- offsets_rec, offsets_next_rec,
- index, stats_null_not_equal,
- &matched_fields);
+ if (p1.nth_rec() < p1.n_recs() || p2.nth_rec() > 1)
+ {
+ ut_ad(p1.page_id() != p2.page_id());
+ divergence_height= root_height - height;
- for (j = matched_fields; j < n_cols; j++) {
- /* We add one if this index record has
- a different prefix from the previous */
+ n_rows= 0;
- n_diff[j]++;
- }
+ if (p1.nth_rec() < p1.n_recs())
+ {
+ n_rows+= p1.n_recs() - p1.nth_rec();
+ }
- if (n_not_null != NULL) {
- btr_record_not_null_field_in_rec(
- n_cols, offsets_next_rec, n_not_null);
- }
+ if (p2.nth_rec() > 1)
+ {
+ n_rows+= p2.nth_rec() - 1;
+ }
+ }
+ }
+ else if (divergence_height != ULINT_UNDEFINED)
+ {
+  /* All records before the right page were already counted. Add records from
+  p2->page_no which are to the left of the record which serves as a right
+ border of the range, if any (we don't include the record itself in this
+ count). */
+ if (p2.nth_rec() > 1)
+ n_rows+= p2.nth_rec() - 1;
+ }
- total_external_size
- += btr_rec_get_externally_stored_len(
- rec, offsets_rec);
-
- rec = next_rec;
- /* Initialize offsets_rec for the next round
- and assign the old offsets_rec buffer to
- offsets_next_rec. */
- {
- rec_offs* offsets_tmp = offsets_rec;
- offsets_rec = offsets_next_rec;
- offsets_next_rec = offsets_tmp;
- }
- }
+ if (height)
+ {
+ ut_ad(height > 0);
+ height--;
+ p1.get_child(&offsets, &heap);
+ p2.get_child(&offsets, &heap);
+ goto search_loop;
+ }
- if (n_cols == dict_index_get_n_unique_in_tree(index)
- && page_has_siblings(page)) {
+ should_count_the_left_border=
+ should_count_the_left_border && p1.should_count_the_left_border();
+ should_count_the_right_border=
+ should_count_the_right_border && p2.should_count_the_right_border();
- /* If there is more than one leaf page in the tree,
- we add one because we know that the first record
- on the page certainly had a different prefix than the
- last record on the previous index page in the
- alphabetical order. Before this fix, if there was
- just one big record on each clustered index page, the
- algorithm grossly underestimated the number of rows
- in the table. */
+ mtr.commit();
+ if (UNIV_LIKELY_NULL(heap))
+ mem_heap_free(heap);
- n_diff[n_cols - 1]++;
- }
- mtr_commit(&mtr);
- }
+ range_start->page_id= p1.page_id();
+ range_end->page_id= p2.page_id();
-exit_loop:
- /* If we saw k borders between different key values on
- n_sample_pages leaf pages, we can estimate how many
- there will be in index->stat_n_leaf_pages */
+ /* Here none of the borders were counted. For example, if on the leaf level
+ we descended to:
+ (inf, a, b, c, d, e, f, sup)
+ ^ ^
+ path1 path2
+ then n_rows will be 2 (c and d). */
- /* We must take into account that our sample actually represents
- also the pages used for external storage of fields (those pages are
- included in index->stat_n_leaf_pages) */
+ if (is_n_rows_exact)
+ {
+ /* Only fiddle to adjust this off-by-one if the number is exact, otherwise
+ we do much grosser adjustments below. */
- result.reserve(n_cols);
+ /* If both paths end up on the same record on the leaf level. */
+ if (p1.page_id() == p2.page_id() && p1.nth_rec() == p2.nth_rec())
+ {
- for (j = 0; j < n_cols; j++) {
- index_field_stats_t stat;
+ /* n_rows can be > 0 here if the paths were first different and then
+ converged to the same record on the leaf level.
+ For example:
+ SELECT ... LIKE 'wait/synch/rwlock%'
+ mode1=PAGE_CUR_GE,
+ tuple1="wait/synch/rwlock"
+ path1[0]={nth_rec=58, n_recs=58,
+ page_no=3, page_level=1}
+ path1[1]={nth_rec=56, n_recs=55,
+ page_no=119, page_level=0}
+
+ mode2=PAGE_CUR_G
+ tuple2="wait/synch/rwlock"
+ path2[0]={nth_rec=57, n_recs=57,
+ page_no=3, page_level=1}
+ path2[1]={nth_rec=56, n_recs=55,
+ page_no=119, page_level=0} */
+
+ /* If the range is such that we should count both borders, then avoid
+ counting that record twice - once as a left border and once as a right
+ border. Some of the borders should not be counted, e.g. [3,3). */
+ n_rows= should_count_the_left_border && should_count_the_right_border;
+ }
+ else
+ n_rows+= should_count_the_left_border + should_count_the_right_border;
+ }
- stat.n_diff_key_vals
- = BTR_TABLE_STATS_FROM_SAMPLE(
- n_diff[j], index, n_sample_pages,
- total_external_size, not_empty_flag);
+ if (root_height > divergence_height && !is_n_rows_exact)
+ /* In trees whose height is > 1 our algorithm tends to underestimate:
+ multiply the estimate by 2: */
+ n_rows*= 2;
- /* If the tree is small, smaller than
- 10 * n_sample_pages + total_external_size, then
- the above estimate is ok. For bigger trees it is common that we
- do not see any borders between key values in the few pages
- we pick. But still there may be n_sample_pages
- different key values, or even more. Let us try to approximate
- that: */
+ DBUG_EXECUTE_IF("bug14007649", DBUG_RETURN(n_rows););
- add_on = index->stat_n_leaf_pages
- / (10 * (n_sample_pages
- + total_external_size));
+ /* Do not estimate the number of rows in the range to over 1 / 2 of the
+ estimated rows in the whole table */
- if (add_on > n_sample_pages) {
- add_on = n_sample_pages;
- }
+ if (n_rows > table_n_rows / 2 && !is_n_rows_exact)
+ {
- stat.n_diff_key_vals += add_on;
+ n_rows= table_n_rows / 2;
- stat.n_sample_sizes = n_sample_pages;
+ /* If there are just 0 or 1 rows in the table, then we estimate all rows
+ are in the range */
- if (n_not_null != NULL) {
- stat.n_non_null_key_vals =
- BTR_TABLE_STATS_FROM_SAMPLE(
- n_not_null[j], index, n_sample_pages,
- total_external_size, not_empty_flag);
- }
+ if (n_rows == 0)
+ n_rows= table_n_rows;
+ }
- result.push_back(stat);
- }
+ DBUG_RETURN(n_rows);
- mem_heap_free(heap);
+error:
+ mtr.commit();
+ if (UNIV_LIKELY_NULL(heap))
+ mem_heap_free(heap);
- return result;
+ DBUG_RETURN(0);
}
/*================== EXTERNAL STORAGE OF BIG FIELDS ===================*/
@@ -7146,11 +5802,10 @@ static void btr_blob_free(buf_block_t *block, bool all, mtr_t *mtr)
ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX));
mtr->commit();
- const ulint fold= page_id.fold();
-
+ buf_pool_t::hash_chain &chain= buf_pool.page_hash.cell_get(page_id.fold());
mysql_mutex_lock(&buf_pool.mutex);
- if (buf_page_t *bpage= buf_pool.page_hash_get_low(page_id, fold))
+ if (buf_page_t *bpage= buf_pool.page_hash.get(page_id, chain))
if (!buf_LRU_free_page(bpage, all) && all && bpage->zip.data)
/* Attempt to deallocate the redundant copy of the uncompressed page
if the whole ROW_FORMAT=COMPRESSED block cannot be deallocted. */
@@ -7198,7 +5853,7 @@ struct btr_blob_log_check_t {
m_op(op)
{
ut_ad(rec_offs_validate(*m_rec, m_pcur->index(), m_offsets));
- ut_ad((*m_block)->frame == page_align(*m_rec));
+ ut_ad((*m_block)->page.frame == page_align(*m_rec));
ut_ad(*m_rec == btr_pcur_get_rec(m_pcur));
}
@@ -7213,7 +5868,7 @@ struct btr_blob_log_check_t {
if (UNIV_UNLIKELY(m_op == BTR_STORE_INSERT_BULK)) {
offs = page_offset(*m_rec);
page_no = (*m_block)->page.id().page_no();
- buf_block_buf_fix_inc(*m_block, __FILE__, __LINE__);
+ (*m_block)->page.fix();
ut_ad(page_no != FIL_NULL);
} else {
btr_pcur_store_position(m_pcur, m_mtr);
@@ -7222,27 +5877,34 @@ struct btr_blob_log_check_t {
DEBUG_SYNC_C("blob_write_middle");
- log_free_check();
-
- DEBUG_SYNC_C("blob_write_middle_after_check");
-
const mtr_log_t log_mode = m_mtr->get_log_mode();
m_mtr->start();
m_mtr->set_log_mode(log_mode);
index->set_modified(*m_mtr);
+ log_free_check();
+
+ DEBUG_SYNC_C("blob_write_middle_after_check");
+
if (UNIV_UNLIKELY(page_no != FIL_NULL)) {
+ dberr_t err;
+ if (UNIV_LIKELY(index->page != page_no)) {
+ ut_a(btr_root_block_get(index, RW_SX_LATCH,
+ m_mtr, &err));
+ }
m_pcur->btr_cur.page_cur.block = btr_block_get(
*index, page_no, RW_X_LATCH, false, m_mtr);
+ /* The page should not be evicted or corrupted while
+ we are holding a buffer-fix on it. */
+ m_pcur->btr_cur.page_cur.block->page.unfix();
m_pcur->btr_cur.page_cur.rec
- = m_pcur->btr_cur.page_cur.block->frame
+ = m_pcur->btr_cur.page_cur.block->page.frame
+ offs;
-
- buf_block_buf_fix_dec(m_pcur->btr_cur.page_cur.block);
} else {
ut_ad(m_pcur->rel_pos == BTR_PCUR_ON);
- ut_a(btr_pcur_restore_position(
- BTR_MODIFY_LEAF | BTR_MODIFY_EXTERNAL, m_pcur,
+ mtr_sx_lock_index(index, m_mtr);
+ ut_a(m_pcur->restore_position(
+ BTR_MODIFY_ROOT_AND_LEAF_ALREADY_LATCHED,
m_mtr) == btr_pcur_t::SAME_ALL);
}
@@ -7313,12 +5975,16 @@ btr_store_big_rec_extern_fields(
ut_ad(buf_block_get_frame(rec_block) == page_align(rec));
ut_a(dict_index_is_clust(index));
+ if (!fil_page_index_page_check(page_align(rec))) {
+ if (op != BTR_STORE_INSERT_BULK) {
+ return DB_PAGE_CORRUPTED;
+ }
+ }
+
btr_blob_log_check_t redo_log(pcur, btr_mtr, offsets, &rec_block,
&rec, op);
page_zip = buf_block_get_page_zip(rec_block);
space_id = rec_block->page.id().space();
- ut_a(fil_page_index_page_check(page_align(rec))
- || op == BTR_STORE_INSERT_BULK);
if (page_zip) {
int err;
@@ -7407,84 +6073,93 @@ btr_store_big_rec_extern_fields(
page_zip = buf_block_get_page_zip(rec_block);
}
+ ut_ad(btr_mtr->get_already_latched(
+ page_id_t{index->table->space_id, index->page},
+ MTR_MEMO_PAGE_SX_FIX));
+
mtr.start();
index->set_modified(mtr);
- mtr.set_log_mode(btr_mtr->get_log_mode());
+ mtr.set_log_mode_sub(*btr_mtr);
- buf_page_get(rec_block->page.id(),
- rec_block->zip_size(), RW_X_LATCH, &mtr);
+ rec_block->page.fix();
+ rec_block->page.lock.x_lock();
+
+ mtr.memo_push(rec_block, MTR_MEMO_PAGE_X_FIX);
+#ifdef BTR_CUR_HASH_ADAPT
+ ut_ad(!btr_search_check_marked_free_index(rec_block));
+#endif
uint32_t hint_prev = prev_page_no;
if (hint_prev == FIL_NULL) {
hint_prev = rec_block->page.id().page_no();
}
- if (!fsp_reserve_free_extents(&r_extents,
- index->table->space, 1,
- FSP_BLOB, &mtr, 1)) {
+ error = fsp_reserve_free_extents(
+ &r_extents, index->table->space, 1,
+ FSP_BLOB, &mtr, 1);
+ if (UNIV_UNLIKELY(error != DB_SUCCESS)) {
+alloc_fail:
mtr.commit();
- error = DB_OUT_OF_FILE_SPACE;
goto func_exit;
}
block = btr_page_alloc(index, hint_prev + 1,
- FSP_NO_DIR, 0, &mtr, &mtr);
+ FSP_NO_DIR, 0, &mtr, &mtr,
+ &error);
index->table->space->release_free_extents(r_extents);
-
- ut_a(block != NULL);
+ if (!block) {
+ goto alloc_fail;
+ }
const uint32_t page_no = block->page.id().page_no();
- if (prev_page_no != FIL_NULL) {
- buf_block_t* prev_block;
-
- prev_block = buf_page_get(
- page_id_t(space_id, prev_page_no),
- rec_block->zip_size(),
- RW_X_LATCH, &mtr);
-
- buf_block_dbg_add_level(prev_block,
- SYNC_EXTERN_STORAGE);
-
+ if (prev_page_no == FIL_NULL) {
+ } else if (buf_block_t* prev_block =
+ buf_page_get_gen(page_id_t(space_id,
+ prev_page_no),
+ rec_block->zip_size(),
+ RW_X_LATCH, nullptr,
+ BUF_GET, &mtr, &error)) {
if (page_zip) {
mtr.write<4>(*prev_block,
- prev_block->frame
+ prev_block->page.frame
+ FIL_PAGE_NEXT,
page_no);
memcpy_aligned<4>(
buf_block_get_page_zip(
prev_block)
->data + FIL_PAGE_NEXT,
- prev_block->frame
+ prev_block->page.frame
+ FIL_PAGE_NEXT, 4);
} else {
mtr.write<4>(*prev_block,
BTR_BLOB_HDR_NEXT_PAGE_NO
+ FIL_PAGE_DATA
- + prev_block->frame,
+ + prev_block->page.frame,
page_no);
}
- } else if (dict_index_is_online_ddl(index)) {
- row_log_table_blob_alloc(index, page_no);
+ } else {
+ goto alloc_fail;
}
- ut_ad(!page_has_siblings(block->frame));
- ut_ad(!fil_page_get_type(block->frame));
+ ut_ad(!page_has_siblings(block->page.frame));
+ ut_ad(!fil_page_get_type(block->page.frame));
if (page_zip) {
int err;
page_zip_des_t* blob_page_zip;
mtr.write<1>(*block,
- FIL_PAGE_TYPE + 1 + block->frame,
+ FIL_PAGE_TYPE + 1
+ + block->page.frame,
prev_page_no == FIL_NULL
? FIL_PAGE_TYPE_ZBLOB
: FIL_PAGE_TYPE_ZBLOB2);
block->page.zip.data[FIL_PAGE_TYPE + 1]
- = block->frame[FIL_PAGE_TYPE + 1];
+ = block->page.frame[FIL_PAGE_TYPE + 1];
- c_stream.next_out = block->frame
+ c_stream.next_out = block->page.frame
+ FIL_PAGE_DATA;
c_stream.avail_out = static_cast<uInt>(
payload_size_zip);
@@ -7506,7 +6181,7 @@ btr_store_big_rec_extern_fields(
ut_ad(blob_page_zip);
ut_ad(page_zip_get_size(blob_page_zip)
== page_zip_get_size(page_zip));
- memcpy(blob_page_zip->data, block->frame,
+ memcpy(blob_page_zip->data, block->page.frame,
page_zip_get_size(page_zip));
if (err == Z_OK && prev_page_no != FIL_NULL) {
@@ -7559,7 +6234,7 @@ next_zip_page:
}
} else {
mtr.write<1>(*block, FIL_PAGE_TYPE + 1
- + block->frame,
+ + block->page.frame,
FIL_PAGE_TYPE_BLOB);
if (extern_len > payload_size) {
@@ -7571,13 +6246,14 @@ next_zip_page:
mtr.memcpy<mtr_t::MAYBE_NOP>(
*block,
FIL_PAGE_DATA + BTR_BLOB_HDR_SIZE
- + block->frame,
+ + block->page.frame,
static_cast<const byte*>
(big_rec_vec->fields[i].data)
+ big_rec_vec->fields[i].len
- extern_len, store_len);
mtr.write<4>(*block, BTR_BLOB_HDR_PART_LEN
- + FIL_PAGE_DATA + block->frame,
+ + FIL_PAGE_DATA
+ + block->page.frame,
store_len);
compile_time_assert(FIL_NULL == 0xffffffff);
mtr.memset(block, BTR_BLOB_HDR_NEXT_PAGE_NO
@@ -7657,29 +6333,29 @@ func_exit:
}
/** Check the FIL_PAGE_TYPE on an uncompressed BLOB page.
-@param[in] block uncompressed BLOB page
-@param[in] read true=read, false=purge */
-static void btr_check_blob_fil_page_type(const buf_block_t& block, bool read)
+@param block uncompressed BLOB page
+@param op operation
+@return whether the type is invalid */
+static bool btr_check_blob_fil_page_type(const buf_block_t& block,
+ const char *op)
{
- uint16_t type= fil_page_get_type(block.frame);
+ uint16_t type= fil_page_get_type(block.page.frame);
- if (UNIV_LIKELY(type == FIL_PAGE_TYPE_BLOB))
- return;
- /* FIXME: take the tablespace as a parameter */
- if (fil_space_t *space= fil_space_t::get(block.page.id().space()))
+ if (UNIV_LIKELY(type == FIL_PAGE_TYPE_BLOB));
+ else if (fil_space_t *space= fil_space_t::get(block.page.id().space()))
{
/* Old versions of InnoDB did not initialize FIL_PAGE_TYPE on BLOB
pages. Do not print anything about the type mismatch when reading
a BLOB page that may be from old versions. */
- if (space->full_crc32() || DICT_TF_HAS_ATOMIC_BLOBS(space->flags))
- {
- ib::fatal() << "FIL_PAGE_TYPE=" << type
- << (read ? " on BLOB read file " : " on BLOB purge file ")
- << space->chain.start->name
- << " page " << block.page.id().page_no();
- }
+ bool fail= space->full_crc32() || DICT_TF_HAS_ATOMIC_BLOBS(space->flags);
+ if (fail)
+ sql_print_error("InnoDB: FIL_PAGE_TYPE=%u on BLOB %s file %s page %u",
+ type, op, space->chain.start->name,
+ block.page.id().page_no());
space->release();
+ return fail;
}
+ return false;
}
/*******************************************************************//**
@@ -7711,24 +6387,19 @@ btr_free_externally_stored_field(
containing the latch to data an an
X-latch to the index tree */
{
- page_t* page;
const uint32_t space_id = mach_read_from_4(
field_ref + BTR_EXTERN_SPACE_ID);
- const uint32_t start_page = mach_read_from_4(
- field_ref + BTR_EXTERN_PAGE_NO);
- uint32_t page_no;
- uint32_t next_page_no;
- mtr_t mtr;
ut_ad(index->is_primary());
+ ut_ad(block->page.lock.have_x());
ut_ad(local_mtr->memo_contains_flagged(&index->lock, MTR_MEMO_X_LOCK
| MTR_MEMO_SX_LOCK));
ut_ad(local_mtr->memo_contains_page_flagged(field_ref,
MTR_MEMO_PAGE_X_FIX));
ut_ad(!rec || rec_offs_validate(rec, index, offsets));
ut_ad(!rec || field_ref == btr_rec_get_field_ref(rec, offsets, i));
- ut_ad(local_mtr->is_named_space(
- page_get_space_id(page_align(field_ref))));
+ ut_ad(index->table->space_id == index->table->space->id);
+ ut_ad(local_mtr->is_named_space(index->table->space));
if (UNIV_UNLIKELY(!memcmp(field_ref, field_ref_zero,
BTR_EXTERN_FIELD_REF_SIZE))) {
@@ -7742,40 +6413,25 @@ btr_free_externally_stored_field(
ut_ad(!(mach_read_from_4(field_ref + BTR_EXTERN_LEN)
& ~((BTR_EXTERN_OWNER_FLAG
| BTR_EXTERN_INHERITED_FLAG) << 24)));
- ut_ad(space_id == index->table->space->id);
ut_ad(space_id == index->table->space_id);
const ulint ext_zip_size = index->table->space->zip_size();
- const ulint rec_zip_size = rec ? ext_zip_size : 0;
-
/* !rec holds in a call from purge when field_ref is in an undo page */
ut_ad(rec || !block->page.zip.data);
for (;;) {
-#ifdef UNIV_DEBUG
- buf_block_t* rec_block;
-#endif /* UNIV_DEBUG */
- buf_block_t* ext_block;
+ mtr_t mtr;
- mtr_start(&mtr);
+ mtr.start();
mtr.set_spaces(*local_mtr);
- mtr.set_log_mode(local_mtr->get_log_mode());
+ mtr.set_log_mode_sub(*local_mtr);
ut_ad(!index->table->is_temporary()
|| local_mtr->get_log_mode() == MTR_LOG_NO_REDO);
- const page_t* p = page_align(field_ref);
-
- const page_id_t page_id(page_get_space_id(p),
- page_get_page_no(p));
-
-#ifdef UNIV_DEBUG
- rec_block =
-#endif /* UNIV_DEBUG */
- buf_page_get(page_id, rec_zip_size, RW_X_LATCH, &mtr);
-
- buf_block_dbg_add_level(rec_block, SYNC_NO_ORDER_CHECK);
- page_no = mach_read_from_4(field_ref + BTR_EXTERN_PAGE_NO);
+ const uint32_t page_no = mach_read_from_4(
+ field_ref + BTR_EXTERN_PAGE_NO);
+ buf_block_t* ext_block;
if (/* There is no external storage data */
page_no == FIL_NULL
@@ -7786,23 +6442,32 @@ btr_free_externally_stored_field(
|| (rollback
&& (mach_read_from_1(field_ref + BTR_EXTERN_LEN)
& BTR_EXTERN_INHERITED_FLAG))) {
-
+skip_free:
/* Do not free */
- mtr_commit(&mtr);
+ mtr.commit();
return;
}
- if (page_no == start_page && dict_index_is_online_ddl(index)) {
- row_log_table_blob_free(index, start_page);
+ ext_block = buf_page_get(page_id_t(space_id, page_no),
+ ext_zip_size, RW_X_LATCH, &mtr);
+
+ if (!ext_block) {
+ goto skip_free;
}
- ext_block = buf_page_get(
- page_id_t(space_id, page_no), ext_zip_size,
- RW_X_LATCH, &mtr);
+ /* The buffer pool block containing the BLOB pointer is
+ exclusively latched by local_mtr. To satisfy some design
+ constraints, we must recursively latch it in mtr as well. */
+ block->fix();
+ block->page.lock.x_lock();
- buf_block_dbg_add_level(ext_block, SYNC_EXTERN_STORAGE);
- page = buf_block_get_frame(ext_block);
+ mtr.memo_push(block, MTR_MEMO_PAGE_X_FIX);
+#ifdef BTR_CUR_HASH_ADAPT
+ ut_ad(!btr_search_check_marked_free_index(block));
+#endif
+
+ const page_t* page = buf_block_get_frame(ext_block);
if (ext_zip_size) {
/* Note that page_zip will be NULL
@@ -7812,11 +6477,14 @@ btr_free_externally_stored_field(
case FIL_PAGE_TYPE_ZBLOB2:
break;
default:
- ut_error;
+ MY_ASSERT_UNREACHABLE();
}
- next_page_no = mach_read_from_4(page + FIL_PAGE_NEXT);
+ const uint32_t next_page_no = mach_read_from_4(
+ page + FIL_PAGE_NEXT);
- btr_page_free(index, ext_block, &mtr, true);
+ btr_page_free(index, ext_block, &mtr, true,
+ local_mtr->memo_contains(
+ *index->table->space));
if (UNIV_LIKELY_NULL(block->page.zip.data)) {
mach_write_to_4(field_ref + BTR_EXTERN_PAGE_NO,
@@ -7835,12 +6503,14 @@ btr_free_externally_stored_field(
}
} else {
ut_ad(!block->page.zip.data);
- btr_check_blob_fil_page_type(*ext_block, false);
+ btr_check_blob_fil_page_type(*ext_block, "purge");
- next_page_no = mach_read_from_4(
+ const uint32_t next_page_no = mach_read_from_4(
page + FIL_PAGE_DATA
+ BTR_BLOB_HDR_NEXT_PAGE_NO);
- btr_page_free(index, ext_block, &mtr, true);
+ btr_page_free(index, ext_block, &mtr, true,
+ local_mtr->memo_contains(
+ *index->table->space));
mtr.write<4>(*block, BTR_EXTERN_PAGE_NO + field_ref,
next_page_no);
@@ -7967,11 +6637,12 @@ btr_copy_blob_prefix(
mtr_start(&mtr);
block = buf_page_get(id, 0, RW_S_LATCH, &mtr);
- buf_block_dbg_add_level(block, SYNC_EXTERN_STORAGE);
+ if (!block || btr_check_blob_fil_page_type(*block, "read")) {
+ mtr.commit();
+ return copied_len;
+ }
page = buf_block_get_frame(block);
- btr_check_blob_fil_page_type(*block, true);
-
blob_header = page + offset;
part_len = btr_blob_get_part_len(blob_header);
copy_len = ut_min(part_len, len - copied_len);
@@ -8118,11 +6789,13 @@ inflate_error:
}
end_of_blob:
- buf_page_release_zip(bpage);
+ bpage->lock.s_unlock();
+ bpage->unfix();
goto func_exit;
}
- buf_page_release_zip(bpage);
+ bpage->lock.s_unlock();
+ bpage->unfix();
/* On other BLOB pages except the first
the BLOB header always is at the page header: */
diff --git a/storage/innobase/btr/btr0defragment.cc b/storage/innobase/btr/btr0defragment.cc
index a5335c73934..642db0e9f1c 100644
--- a/storage/innobase/btr/btr0defragment.cc
+++ b/storage/innobase/btr/btr0defragment.cc
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (C) 2012, 2014 Facebook, Inc. All Rights Reserved.
-Copyright (C) 2014, 2021, MariaDB Corporation.
+Copyright (C) 2014, 2023, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -36,6 +36,7 @@ Modified 30/07/2014 Jan Lindström jan.lindstrom@mariadb.com
#include "ibuf0ibuf.h"
#include "lock0lock.h"
#include "srv0start.h"
+#include "mysqld.h"
#include <list>
@@ -53,16 +54,15 @@ time will make sure the page is compressible within a couple of iterations. */
/** Item in the work queue for btr_degrament_thread. */
struct btr_defragment_item_t
{
- btr_pcur_t* pcur; /* persistent cursor where
- btr_defragment_n_pages should start */
- os_event_t event; /* if not null, signal after work
- is done */
- bool removed; /* Mark an item as removed */
- ulonglong last_processed; /* timestamp of last time this index
- is processed by defragment thread */
-
- btr_defragment_item_t(btr_pcur_t* pcur, os_event_t event);
- ~btr_defragment_item_t();
+ /** persistent cursor where btr_defragment_n_pages should start */
+ btr_pcur_t * const pcur;
+ /** completion signal */
+ pthread_cond_t *cond;
+ /** timestamp of last time this index is processed by defragment thread */
+ ulonglong last_processed= 0;
+
+ btr_defragment_item_t(btr_pcur_t *pcur, pthread_cond_t *cond)
+ : pcur(pcur), cond(cond) {}
};
/* Work queue for defragmentation. */
@@ -70,9 +70,9 @@ typedef std::list<btr_defragment_item_t*> btr_defragment_wq_t;
static btr_defragment_wq_t btr_defragment_wq;
/* Mutex protecting the defragmentation work queue.*/
-ib_mutex_t btr_defragment_mutex;
+static mysql_mutex_t btr_defragment_mutex;
#ifdef UNIV_PFS_MUTEX
-UNIV_INTERN mysql_pfs_key_t btr_defragment_mutex_key;
+mysql_pfs_key_t btr_defragment_mutex_key;
#endif /* UNIV_PFS_MUTEX */
/* Number of compression failures caused by defragmentation since server
@@ -87,13 +87,6 @@ the amount of effort wasted. */
Atomic_counter<ulint> btr_defragment_count;
bool btr_defragment_active;
-
-struct defragment_chunk_state_t
-{
- btr_defragment_item_t* m_item;
-};
-
-static defragment_chunk_state_t defragment_chunk_state;
static void btr_defragment_chunk(void*);
static tpool::timer* btr_defragment_timer;
@@ -101,29 +94,6 @@ static tpool::task_group task_group(1);
static tpool::task btr_defragment_task(btr_defragment_chunk, 0, &task_group);
static void btr_defragment_start();
-/******************************************************************//**
-Constructor for btr_defragment_item_t. */
-btr_defragment_item_t::btr_defragment_item_t(
- btr_pcur_t* pcur,
- os_event_t event)
-{
- this->pcur = pcur;
- this->event = event;
- this->removed = false;
- this->last_processed = 0;
-}
-
-/******************************************************************//**
-Destructor for btr_defragment_item_t. */
-btr_defragment_item_t::~btr_defragment_item_t() {
- if (this->pcur) {
- btr_pcur_free_for_mysql(this->pcur);
- }
- if (this->event) {
- os_event_set(this->event);
- }
-}
-
static void submit_defragment_task(void*arg=0)
{
srv_thread_pool->submit_task(&btr_defragment_task);
@@ -135,8 +105,8 @@ void
btr_defragment_init()
{
srv_defragment_interval = 1000000000ULL / srv_defragment_frequency;
- mutex_create(LATCH_ID_BTR_DEFRAGMENT_MUTEX, &btr_defragment_mutex);
- defragment_chunk_state.m_item = 0;
+ mysql_mutex_init(btr_defragment_mutex_key, &btr_defragment_mutex,
+ nullptr);
btr_defragment_timer = srv_thread_pool->create_timer(submit_defragment_task);
btr_defragment_active = true;
}
@@ -151,15 +121,17 @@ btr_defragment_shutdown()
delete btr_defragment_timer;
btr_defragment_timer = 0;
task_group.cancel_pending(&btr_defragment_task);
- mutex_enter(&btr_defragment_mutex);
+ mysql_mutex_lock(&btr_defragment_mutex);
std::list< btr_defragment_item_t* >::iterator iter = btr_defragment_wq.begin();
while(iter != btr_defragment_wq.end()) {
btr_defragment_item_t* item = *iter;
iter = btr_defragment_wq.erase(iter);
- delete item;
+ if (item->cond) {
+ pthread_cond_signal(item->cond);
+ }
}
- mutex_exit(&btr_defragment_mutex);
- mutex_free(&btr_defragment_mutex);
+ mysql_mutex_unlock(&btr_defragment_mutex);
+ mysql_mutex_destroy(&btr_defragment_mutex);
btr_defragment_active = false;
}
@@ -174,7 +146,7 @@ bool
btr_defragment_find_index(
dict_index_t* index) /*!< Index to find. */
{
- mutex_enter(&btr_defragment_mutex);
+ mysql_mutex_lock(&btr_defragment_mutex);
for (std::list< btr_defragment_item_t* >::iterator iter = btr_defragment_wq.begin();
iter != btr_defragment_wq.end();
++iter) {
@@ -183,65 +155,47 @@ btr_defragment_find_index(
btr_cur_t* cursor = btr_pcur_get_btr_cur(pcur);
dict_index_t* idx = btr_cur_get_index(cursor);
if (index->id == idx->id) {
- mutex_exit(&btr_defragment_mutex);
+ mysql_mutex_unlock(&btr_defragment_mutex);
return true;
}
}
- mutex_exit(&btr_defragment_mutex);
+ mysql_mutex_unlock(&btr_defragment_mutex);
return false;
}
-/******************************************************************//**
-Query thread uses this function to add an index to btr_defragment_wq.
-Return a pointer to os_event for the query thread to wait on if this is a
-synchronized defragmentation. */
-os_event_t
-btr_defragment_add_index(
- dict_index_t* index, /*!< index to be added */
- dberr_t* err) /*!< out: error code */
+/** Defragment an index.
+@param pcur persistent cursor
+@param thd current session, for checking thd_killed()
+@return whether the operation was interrupted */
+bool btr_defragment_add_index(btr_pcur_t *pcur, THD *thd)
{
- mtr_t mtr;
- *err = DB_SUCCESS;
-
- mtr_start(&mtr);
- buf_block_t* block = btr_root_block_get(index, RW_NO_LATCH, &mtr);
- page_t* page = NULL;
-
- if (block) {
- page = buf_block_get_frame(block);
- }
-
- if (page == NULL && !index->is_readable()) {
- mtr_commit(&mtr);
- *err = DB_DECRYPTION_FAILED;
- return NULL;
- }
-
- ut_ad(fil_page_index_page_check(page));
- ut_ad(!page_has_siblings(page));
-
- if (page_is_leaf(page)) {
- // Index root is a leaf page, no need to defragment.
- mtr_commit(&mtr);
- return NULL;
- }
- btr_pcur_t* pcur = btr_pcur_create_for_mysql();
- os_event_t event = os_event_create(0);
- btr_pcur_open_at_index_side(true, index, BTR_SEARCH_LEAF, pcur,
- true, 0, &mtr);
- btr_pcur_move_to_next(pcur, &mtr);
- btr_pcur_store_position(pcur, &mtr);
- mtr_commit(&mtr);
- dict_stats_empty_defrag_summary(index);
- btr_defragment_item_t* item = new btr_defragment_item_t(pcur, event);
- mutex_enter(&btr_defragment_mutex);
- btr_defragment_wq.push_back(item);
- if(btr_defragment_wq.size() == 1){
- /* Kick off defragmentation work */
- btr_defragment_start();
- }
- mutex_exit(&btr_defragment_mutex);
- return event;
+ dict_stats_empty_defrag_summary(pcur->index());
+ pthread_cond_t cond;
+ pthread_cond_init(&cond, nullptr);
+ btr_defragment_item_t item(pcur, &cond);
+ mysql_mutex_lock(&btr_defragment_mutex);
+ btr_defragment_wq.push_back(&item);
+ if (btr_defragment_wq.size() == 1)
+ /* Kick off defragmentation work */
+ btr_defragment_start();
+ bool interrupted= false;
+ for (;;)
+ {
+ timespec abstime;
+ set_timespec(abstime, 1);
+ if (!my_cond_timedwait(&cond, &btr_defragment_mutex.m_mutex, &abstime))
+ break;
+ if (thd_killed(thd))
+ {
+ item.cond= nullptr;
+ interrupted= true;
+ break;
+ }
+ }
+
+ pthread_cond_destroy(&cond);
+ mysql_mutex_unlock(&btr_defragment_mutex);
+ return interrupted;
}
/******************************************************************//**
@@ -252,104 +206,22 @@ void
btr_defragment_remove_table(
dict_table_t* table) /*!< Index to be removed. */
{
- mutex_enter(&btr_defragment_mutex);
- for (std::list< btr_defragment_item_t* >::iterator iter = btr_defragment_wq.begin();
- iter != btr_defragment_wq.end();
- ++iter) {
- btr_defragment_item_t* item = *iter;
- btr_pcur_t* pcur = item->pcur;
- btr_cur_t* cursor = btr_pcur_get_btr_cur(pcur);
- dict_index_t* idx = btr_cur_get_index(cursor);
- if (table->id == idx->table->id) {
- item->removed = true;
- }
- }
- mutex_exit(&btr_defragment_mutex);
-}
-
-/******************************************************************//**
-Query thread uses this function to mark an index as removed in
-btr_efragment_wq. */
-void
-btr_defragment_remove_index(
- dict_index_t* index) /*!< Index to be removed. */
-{
- mutex_enter(&btr_defragment_mutex);
- for (std::list< btr_defragment_item_t* >::iterator iter = btr_defragment_wq.begin();
- iter != btr_defragment_wq.end();
- ++iter) {
- btr_defragment_item_t* item = *iter;
- btr_pcur_t* pcur = item->pcur;
- btr_cur_t* cursor = btr_pcur_get_btr_cur(pcur);
- dict_index_t* idx = btr_cur_get_index(cursor);
- if (index->id == idx->id) {
- item->removed = true;
- item->event = NULL;
- break;
- }
- }
- mutex_exit(&btr_defragment_mutex);
-}
-
-/******************************************************************//**
-Functions used by defragmentation thread: btr_defragment_xxx_item.
-Defragmentation thread operates on the work *item*. It gets/removes
-item from the work queue. */
-/******************************************************************//**
-Defragment thread uses this to remove an item from btr_defragment_wq.
-When an item is removed from the work queue, all resources associated with it
-are free as well. */
-void
-btr_defragment_remove_item(
- btr_defragment_item_t* item) /*!< Item to be removed. */
-{
- mutex_enter(&btr_defragment_mutex);
- for (std::list< btr_defragment_item_t* >::iterator iter = btr_defragment_wq.begin();
- iter != btr_defragment_wq.end();
- ++iter) {
- if (item == *iter) {
- btr_defragment_wq.erase(iter);
- delete item;
- break;
- }
- }
- mutex_exit(&btr_defragment_mutex);
-}
-
-/******************************************************************//**
-Defragment thread uses this to get an item from btr_defragment_wq to work on.
-The item is not removed from the work queue so query threads can still access
-this item. We keep it this way so query threads can find and kill a
-defragmentation even if that index is being worked on. Be aware that while you
-work on this item you have no lock protection on it whatsoever. This is OK as
-long as the query threads and defragment thread won't modify the same fields
-without lock protection.
-*/
-btr_defragment_item_t*
-btr_defragment_get_item()
-{
- if (btr_defragment_wq.empty()) {
- return NULL;
- //return nullptr;
- }
- mutex_enter(&btr_defragment_mutex);
- std::list< btr_defragment_item_t* >::iterator iter = btr_defragment_wq.begin();
- if (iter == btr_defragment_wq.end()) {
- iter = btr_defragment_wq.begin();
- }
- btr_defragment_item_t* item = *iter;
- iter++;
- mutex_exit(&btr_defragment_mutex);
- return item;
+ mysql_mutex_lock(&btr_defragment_mutex);
+ for (auto item : btr_defragment_wq)
+ {
+ if (item->cond && table == item->pcur->index()->table)
+ {
+ pthread_cond_signal(item->cond);
+ item->cond= nullptr;
+ }
+ }
+ mysql_mutex_unlock(&btr_defragment_mutex);
}
/*********************************************************************//**
Check whether we should save defragmentation statistics to persistent storage.
Currently we save the stats to persistent storage every 100 updates. */
-UNIV_INTERN
-void
-btr_defragment_save_defrag_stats_if_needed(
- dict_index_t* index) /*!< in: index */
+void btr_defragment_save_defrag_stats_if_needed(dict_index_t *index)
{
if (srv_defragment_stats_accuracy != 0 // stats tracking disabled
&& index->table->space_id != 0 // do not track system tables
@@ -367,7 +239,7 @@ Main defragment functionalities used by defragment thread.*/
Calculate number of records from beginning of block that can
fit into size_limit
@return number of records */
-UNIV_INTERN
+static
ulint
btr_defragment_calc_n_recs_for_size(
buf_block_t* block, /*!< in: B-tree page */
@@ -387,9 +259,10 @@ btr_defragment_calc_n_recs_for_size(
const ulint n_core = page_is_leaf(page) ? index->n_core_fields : 0;
page_cur_set_before_first(block, &cur);
- page_cur_move_to_next(&cur);
- while (page_cur_get_rec(&cur) != page_get_supremum_rec(page)) {
- rec_t* cur_rec = page_cur_get_rec(&cur);
+ while (rec_t* cur_rec = page_cur_move_to_next(&cur)) {
+ if (page_rec_is_supremum(cur_rec)) {
+ break;
+ }
offsets = rec_get_offsets(cur_rec, index, offsets, n_core,
ULINT_UNDEFINED, &heap);
ulint rec_size = rec_offs_size(offsets);
@@ -399,7 +272,6 @@ btr_defragment_calc_n_recs_for_size(
break;
}
n_recs ++;
- page_cur_move_to_next(&cur);
}
*n_recs_size = size;
if (UNIV_LIKELY_NULL(heap)) {
@@ -408,10 +280,75 @@ btr_defragment_calc_n_recs_for_size(
return n_recs;
}
+MY_ATTRIBUTE((nonnull(2,3,4), warn_unused_result))
+/************************************************************//**
+Returns the upper level node pointer to a page. It is assumed that mtr holds
+an sx-latch on the tree.
+@return rec_get_offsets() of the node pointer record */
+static
+rec_offs*
+btr_page_search_father_node_ptr(
+ rec_offs* offsets,/*!< in: work area for the return value */
+ mem_heap_t* heap, /*!< in: memory heap to use */
+ btr_cur_t* cursor, /*!< in: cursor pointing to user record,
+ out: cursor on node pointer record,
+ its page x-latched */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ const uint32_t page_no = btr_cur_get_block(cursor)->page.id().page_no();
+ dict_index_t* index = btr_cur_get_index(cursor);
+ ut_ad(!index->is_spatial());
+
+ ut_ad(mtr->memo_contains_flagged(&index->lock, MTR_MEMO_X_LOCK
+ | MTR_MEMO_SX_LOCK));
+ ut_ad(dict_index_get_page(index) != page_no);
+
+ const auto level = btr_page_get_level(btr_cur_get_page(cursor));
+
+ const rec_t* user_rec = btr_cur_get_rec(cursor);
+ ut_a(page_rec_is_user_rec(user_rec));
+
+ if (btr_cur_search_to_nth_level(level + 1,
+ dict_index_build_node_ptr(index,
+ user_rec, 0,
+ heap, level),
+ RW_X_LATCH,
+ cursor, mtr) != DB_SUCCESS) {
+ return nullptr;
+ }
+
+ const rec_t* node_ptr = btr_cur_get_rec(cursor);
+ ut_ad(!btr_cur_get_block(cursor)->page.lock.not_recursive()
+ || mtr->memo_contains(index->lock, MTR_MEMO_X_LOCK));
+
+ offsets = rec_get_offsets(node_ptr, index, offsets, 0,
+ ULINT_UNDEFINED, &heap);
+
+ if (btr_node_ptr_get_child_page_no(node_ptr, offsets) != page_no) {
+ offsets = nullptr;
+ }
+
+ return(offsets);
+}
+
+static bool btr_page_search_father(mtr_t *mtr, btr_cur_t *cursor)
+{
+ rec_t *rec=
+ page_rec_get_next(page_get_infimum_rec(cursor->block()->page.frame));
+ if (UNIV_UNLIKELY(!rec))
+ return false;
+ cursor->page_cur.rec= rec;
+ mem_heap_t *heap= mem_heap_create(100);
+ const bool got= btr_page_search_father_node_ptr(nullptr, heap, cursor, mtr);
+ mem_heap_free(heap);
+ return got;
+}
+
/*************************************************************//**
Merge as many records from the from_block to the to_block. Delete
the from_block if all records are successfully merged to to_block.
-@return the to_block to target for next merge operation. */
+@return the to_block to target for next merge operation.
+@retval nullptr if corruption was noticed */
static
buf_block_t*
btr_defragment_merge_pages(
@@ -458,9 +395,9 @@ btr_defragment_merge_pages(
// reorganizing the page, otherwise we need to reorganize the page
// first to release more space.
if (move_size > max_ins_size) {
- if (!btr_page_reorganize_block(page_zip_level,
- to_block, index,
- mtr)) {
+ dberr_t err = btr_page_reorganize_block(page_zip_level,
+ to_block, index, mtr);
+ if (err != DB_SUCCESS) {
if (!dict_index_is_clust(index)
&& page_is_leaf(to_page)) {
ibuf_reset_free_bits(to_block);
@@ -469,23 +406,31 @@ btr_defragment_merge_pages(
// not compressable. There's no point to try
// merging into this page. Continue to the
// next page.
- return from_block;
+ return err == DB_FAIL ? from_block : nullptr;
}
ut_ad(page_validate(to_page, index));
max_ins_size = page_get_max_insert_size(to_page, n_recs);
- ut_a(max_ins_size >= move_size);
+ if (max_ins_size < move_size) {
+ return nullptr;
+ }
}
// Move records to pack to_page more full.
orig_pred = NULL;
target_n_recs = n_recs_to_move;
+ dberr_t err;
while (n_recs_to_move > 0) {
- rec = page_rec_get_nth(from_page,
- n_recs_to_move + 1);
+ if (!(rec = page_rec_get_nth(from_page, n_recs_to_move + 1))) {
+ return nullptr;
+ }
orig_pred = page_copy_rec_list_start(
- to_block, from_block, rec, index, mtr);
+ to_block, from_block, rec, index, mtr, &err);
if (orig_pred)
break;
+ if (err != DB_FAIL) {
+ return nullptr;
+ }
+
// If we reach here, that means compression failed after packing
// n_recs_to_move number of records to to_page. We try to reduce
// the targeted data size on the to_page by
@@ -524,19 +469,23 @@ btr_defragment_merge_pages(
}
}
btr_cur_t parent;
- if (n_recs_to_move == n_recs) {
+ parent.page_cur.index = index;
+ parent.page_cur.block = from_block;
+
+ if (!btr_page_search_father(mtr, &parent)) {
+ to_block = nullptr;
+ } else if (n_recs_to_move == n_recs) {
/* The whole page is merged with the previous page,
free it. */
- lock_update_merge_left(to_block, orig_pred,
- from_block);
+ lock_update_merge_left(*to_block, orig_pred,
+ from_block->page.id());
btr_search_drop_page_hash_index(from_block, false);
- ut_a(DB_SUCCESS == btr_level_list_remove(*from_block, *index,
- mtr));
- btr_page_get_father(index, from_block, mtr, &parent);
- btr_cur_node_ptr_delete(&parent, mtr);
- /* btr_blob_dbg_remove(from_page, index,
- "btr_defragment_n_pages"); */
- btr_page_free(index, from_block, mtr);
+ if (btr_level_list_remove(*from_block, *index, mtr)
+ != DB_SUCCESS
+ || btr_cur_node_ptr_delete(&parent, mtr) != DB_SUCCESS
+ || btr_page_free(index, from_block, mtr) != DB_SUCCESS) {
+ return nullptr;
+ }
} else {
// There are still records left on the page, so
// increment n_defragmented. Node pointer will be changed
@@ -552,15 +501,23 @@ btr_defragment_merge_pages(
orig_pred,
from_block);
// FIXME: reuse the node_ptr!
- btr_page_get_father(index, from_block, mtr, &parent);
- btr_cur_node_ptr_delete(&parent, mtr);
+ if (btr_cur_node_ptr_delete(&parent, mtr)
+ != DB_SUCCESS) {
+ return nullptr;
+ }
rec = page_rec_get_next(
page_get_infimum_rec(from_page));
+ if (!rec) {
+ return nullptr;
+ }
node_ptr = dict_index_build_node_ptr(
index, rec, page_get_page_no(from_page),
heap, level);
- btr_insert_on_non_leaf_level(0, index, level+1,
- node_ptr, mtr);
+ if (btr_insert_on_non_leaf_level(0, index, level+1,
+ node_ptr, mtr)
+ != DB_SUCCESS) {
+ return nullptr;
+ }
}
to_block = from_block;
}
@@ -577,7 +534,7 @@ the process, if any page becomes empty, that page will be removed from
the level list. Record locks, hash, and node pointers are updated after
page reorganization.
@return pointer to the last block processed, or NULL if reaching end of index */
-UNIV_INTERN
+static
buf_block_t*
btr_defragment_n_pages(
buf_block_t* block, /*!< in: starting block for defragmentation */
@@ -604,7 +561,7 @@ btr_defragment_n_pages(
/* It doesn't make sense to call this function with n_pages = 1. */
ut_ad(n_pages > 1);
- if (!page_is_leaf(block->frame)) {
+ if (!page_is_leaf(block->page.frame)) {
return NULL;
}
@@ -635,6 +592,9 @@ btr_defragment_n_pages(
blocks[i] = btr_block_get(*index, page_no, RW_X_LATCH, true,
mtr);
+ if (!blocks[i]) {
+ return nullptr;
+ }
}
if (n_pages == 1) {
@@ -645,7 +605,8 @@ btr_defragment_n_pages(
return NULL;
/* given page is the last page.
Lift the records to father. */
- btr_lift_page_up(index, block, mtr);
+ dberr_t err;
+ btr_lift_page_up(index, block, mtr, &err);
}
return NULL;
}
@@ -708,6 +669,9 @@ btr_defragment_n_pages(
if (new_block != current_block) {
n_defragmented ++;
current_block = new_block;
+ if (!new_block) {
+ break;
+ }
}
}
mem_heap_free(heap);
@@ -744,34 +708,39 @@ The state (current item) is stored in function parameter.
*/
static void btr_defragment_chunk(void*)
{
- defragment_chunk_state_t* state = &defragment_chunk_state;
+ THD *thd = innobase_create_background_thd("InnoDB defragment");
+ set_current_thd(thd);
- btr_pcur_t* pcur;
- btr_cur_t* cursor;
- dict_index_t* index;
+ btr_defragment_item_t* item = nullptr;
mtr_t mtr;
- buf_block_t* first_block;
- buf_block_t* last_block;
+
+ mysql_mutex_lock(&btr_defragment_mutex);
while (srv_shutdown_state == SRV_SHUTDOWN_NONE) {
- if (!state->m_item) {
- state->m_item = btr_defragment_get_item();
- }
- /* If an index is marked as removed, we remove it from the work
- queue. No other thread could be using this item at this point so
- it's safe to remove now. */
- while (state->m_item && state->m_item->removed) {
- btr_defragment_remove_item(state->m_item);
- state->m_item = btr_defragment_get_item();
+ if (!item) {
+ if (btr_defragment_wq.empty()) {
+release_and_exit:
+ mysql_mutex_unlock(&btr_defragment_mutex);
+func_exit:
+ set_current_thd(nullptr);
+ destroy_background_thd(thd);
+ return;
+ }
+ item = *btr_defragment_wq.begin();
+ ut_ad(item);
}
- if (!state->m_item) {
- /* Queue empty */
- return;
+
+ if (!item->cond) {
+processed:
+ btr_defragment_wq.remove(item);
+ item = nullptr;
+ continue;
}
- pcur = state->m_item->pcur;
+ mysql_mutex_unlock(&btr_defragment_mutex);
+
ulonglong now = my_interval_timer();
- ulonglong elapsed = now - state->m_item->last_processed;
+ ulonglong elapsed = now - item->last_processed;
if (elapsed < srv_defragment_interval) {
/* If we see an index again before the interval
@@ -783,52 +752,53 @@ static void btr_defragment_chunk(void*)
int sleep_ms = (int)((srv_defragment_interval - elapsed) / 1000 / 1000);
if (sleep_ms) {
btr_defragment_timer->set_time(sleep_ms, 0);
- return;
+ goto func_exit;
}
}
log_free_check();
mtr_start(&mtr);
- cursor = btr_pcur_get_btr_cur(pcur);
- index = btr_cur_get_index(cursor);
+ dict_index_t *index = item->pcur->index();
index->set_modified(mtr);
- /* To follow the latching order defined in WL#6326, acquire index->lock X-latch.
- This entitles us to acquire page latches in any order for the index. */
+ /* To follow the latching order defined in WL#6326,
+ acquire index->lock X-latch. This entitles us to
+ acquire page latches in any order for the index. */
mtr_x_lock_index(index, &mtr);
- /* This will acquire index->lock SX-latch, which per WL#6363 is allowed
- when we are already holding the X-latch. */
- btr_pcur_restore_position(BTR_MODIFY_TREE, pcur, &mtr);
- first_block = btr_cur_get_block(cursor);
-
- last_block = btr_defragment_n_pages(first_block, index,
- srv_defragment_n_pages,
- &mtr);
- if (last_block) {
+ if (buf_block_t *last_block =
+ item->pcur->restore_position(
+ BTR_PURGE_TREE_ALREADY_LATCHED, &mtr)
+ == btr_pcur_t::CORRUPTED
+ ? nullptr
+ : btr_defragment_n_pages(btr_pcur_get_block(item->pcur),
+ index, srv_defragment_n_pages,
+ &mtr)) {
/* If we haven't reached the end of the index,
place the cursor on the last record of last page,
store the cursor position, and put back in queue. */
page_t* last_page = buf_block_get_frame(last_block);
rec_t* rec = page_rec_get_prev(
page_get_supremum_rec(last_page));
- ut_a(page_rec_is_user_rec(rec));
- page_cur_position(rec, last_block,
- btr_cur_get_page_cur(cursor));
- btr_pcur_store_position(pcur, &mtr);
+ if (rec && page_rec_is_user_rec(rec)) {
+ page_cur_position(rec, last_block,
+ btr_pcur_get_page_cur(
+ item->pcur));
+ }
+ btr_pcur_store_position(item->pcur, &mtr);
mtr_commit(&mtr);
/* Update the last_processed time of this index. */
- state->m_item->last_processed = now;
+ item->last_processed = now;
+ mysql_mutex_lock(&btr_defragment_mutex);
} else {
- dberr_t err = DB_SUCCESS;
mtr_commit(&mtr);
/* Reaching the end of the index. */
dict_stats_empty_defrag_stats(index);
- err = dict_stats_save_defrag_stats(index);
- if (err != DB_SUCCESS) {
+ if (dberr_t err= dict_stats_save_defrag_stats(index)) {
ib::error() << "Saving defragmentation stats for table "
<< index->table->name
<< " index " << index->name()
<< " failed with error " << err;
} else {
- err = dict_stats_save_defrag_summary(index);
+ err = dict_stats_save_defrag_summary(index,
+ thd);
if (err != DB_SUCCESS) {
ib::error() << "Saving defragmentation summary for table "
@@ -838,8 +808,13 @@ static void btr_defragment_chunk(void*)
}
}
- btr_defragment_remove_item(state->m_item);
- state->m_item = NULL;
+ mysql_mutex_lock(&btr_defragment_mutex);
+ if (item->cond) {
+ pthread_cond_signal(item->cond);
+ }
+ goto processed;
}
}
+
+ goto release_and_exit;
}
diff --git a/storage/innobase/btr/btr0pcur.cc b/storage/innobase/btr/btr0pcur.cc
index 2d538d69d04..d48437e4bd0 100644
--- a/storage/innobase/btr/btr0pcur.cc
+++ b/storage/innobase/btr/btr0pcur.cc
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2016, 2021, MariaDB Corporation.
+Copyright (c) 2016, 2023, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -30,25 +30,6 @@ Created 2/23/1996 Heikki Tuuri
#include "trx0trx.h"
/**************************************************************//**
-Allocates memory for a persistent cursor object and initializes the cursor.
-@return own: persistent cursor */
-btr_pcur_t*
-btr_pcur_create_for_mysql(void)
-/*============================*/
-{
- btr_pcur_t* pcur;
- DBUG_ENTER("btr_pcur_create_for_mysql");
-
- pcur = (btr_pcur_t*) ut_malloc_nokey(sizeof(btr_pcur_t));
-
- pcur->btr_cur.index = NULL;
- btr_pcur_init(pcur);
-
- DBUG_PRINT("btr_pcur_create_for_mysql", ("pcur: %p", pcur));
- DBUG_RETURN(pcur);
-}
-
-/**************************************************************//**
Resets a persistent cursor object, freeing ::old_rec_buf if it is
allocated and resetting the other members to their initial values. */
void
@@ -56,35 +37,18 @@ btr_pcur_reset(
/*===========*/
btr_pcur_t* cursor) /*!< in, out: persistent cursor */
{
- btr_pcur_free(cursor);
+ ut_free(cursor->old_rec_buf);
+ memset(&cursor->btr_cur.page_cur, 0, sizeof(page_cur_t));
cursor->old_rec_buf = NULL;
- cursor->btr_cur.index = NULL;
- cursor->btr_cur.page_cur.rec = NULL;
cursor->old_rec = NULL;
cursor->old_n_core_fields = 0;
cursor->old_n_fields = 0;
- cursor->old_stored = false;
cursor->latch_mode = BTR_NO_LATCHES;
cursor->pos_state = BTR_PCUR_NOT_POSITIONED;
}
/**************************************************************//**
-Frees the memory for a persistent cursor object. */
-void
-btr_pcur_free_for_mysql(
-/*====================*/
- btr_pcur_t* cursor) /*!< in, own: persistent cursor */
-{
- DBUG_ENTER("btr_pcur_free_for_mysql");
- DBUG_PRINT("btr_pcur_free_for_mysql", ("pcur: %p", cursor));
-
- btr_pcur_free(cursor);
- ut_free(cursor);
- DBUG_VOID_RETURN;
-}
-
-/**************************************************************//**
The position of the cursor is stored by taking an initial segment of the
record the cursor is positioned on, before, or after, and copying it to the
cursor data structure, or just setting a flag if the cursor id before the
@@ -112,8 +76,9 @@ btr_pcur_store_position(
page_cursor = btr_pcur_get_page_cur(cursor);
rec = page_cur_get_rec(page_cursor);
- offs = rec - block->frame;
- ut_ad(block->page.id().page_no() == page_get_page_no(block->frame));
+ offs = rec - block->page.frame;
+ ut_ad(block->page.id().page_no()
+ == page_get_page_no(block->page.frame));
ut_ad(block->page.buf_fix_count());
/* For spatial index, when we do positioning on parent
buffer if necessary, it might not hold latches, but the
@@ -124,15 +89,13 @@ btr_pcur_store_position(
&& mtr->memo_contains_flagged(&index->lock, MTR_MEMO_X_LOCK
| MTR_MEMO_SX_LOCK)));
- cursor->old_stored = true;
-
- if (page_is_empty(block->frame)) {
+ if (page_is_empty(block->page.frame)) {
/* It must be an empty index tree; NOTE that in this case
we do not store the modify_clock, but always do a search
if we restore the cursor position */
- ut_a(!page_has_siblings(block->frame));
- ut_ad(page_is_leaf(block->frame));
+ ut_a(!page_has_siblings(block->page.frame));
+ ut_ad(page_is_leaf(block->page.frame));
ut_ad(block->page.id().page_no() == index->page);
if (page_rec_is_supremum_low(offs)) {
@@ -147,6 +110,11 @@ before_first:
if (page_rec_is_supremum_low(offs)) {
rec = page_rec_get_prev(rec);
+ if (UNIV_UNLIKELY(!rec || page_rec_is_infimum(rec))) {
+ ut_ad("corrupted index" == 0);
+ cursor->rel_pos = BTR_PCUR_AFTER_LAST_IN_TREE;
+ return;
+ }
ut_ad(!page_rec_is_infimum(rec));
if (UNIV_UNLIKELY(rec_is_metadata(rec, *index))) {
@@ -159,9 +127,9 @@ before_first:
#endif
ut_ad(index->is_instant()
|| block->page.id().page_no() != index->page);
- ut_ad(page_get_n_recs(block->frame) == 1);
- ut_ad(page_is_leaf(block->frame));
- ut_ad(!page_has_prev(block->frame));
+ ut_ad(page_get_n_recs(block->page.frame) == 1);
+ ut_ad(page_is_leaf(block->page.frame));
+ ut_ad(!page_has_prev(block->page.frame));
cursor->rel_pos = BTR_PCUR_AFTER_LAST_IN_TREE;
return;
}
@@ -170,10 +138,16 @@ before_first:
} else if (page_rec_is_infimum_low(offs)) {
rec = page_rec_get_next(rec);
+ if (UNIV_UNLIKELY(!rec)) {
+ ut_ad("corrupted page" == 0);
+ goto before_first;
+ }
+
if (rec_is_metadata(rec, *index)) {
- ut_ad(!page_has_prev(block->frame));
+ ut_ad(!page_has_prev(block->page.frame));
rec = page_rec_get_next(rec);
- if (page_rec_is_supremum(rec)) {
+ ut_ad(rec);
+ if (!rec || page_rec_is_supremum(rec)) {
goto before_first;
}
}
@@ -238,23 +212,98 @@ btr_pcur_copy_stored_position(
pcur_receive->old_n_fields = pcur_donate->old_n_fields;
}
+/** Optimistically latches the leaf page or pages requested.
+@param[in] block guessed buffer block
+@param[in,out] pcur cursor
+@param[in,out] latch_mode BTR_SEARCH_LEAF, ...
+@param[in,out] mtr mini-transaction
+@return true if success */
+TRANSACTIONAL_TARGET
+static bool btr_pcur_optimistic_latch_leaves(buf_block_t *block,
+ btr_pcur_t *pcur,
+ btr_latch_mode *latch_mode,
+ mtr_t *mtr)
+{
+ ut_ad(block->page.buf_fix_count());
+ ut_ad(block->page.in_file());
+ ut_ad(block->page.frame);
+
+ static_assert(BTR_SEARCH_PREV & BTR_SEARCH_LEAF, "");
+ static_assert(BTR_MODIFY_PREV & BTR_MODIFY_LEAF, "");
+ static_assert((BTR_SEARCH_PREV ^ BTR_MODIFY_PREV) ==
+ (RW_S_LATCH ^ RW_X_LATCH), "");
+
+ const rw_lock_type_t mode=
+ rw_lock_type_t(*latch_mode & (RW_X_LATCH | RW_S_LATCH));
+
+ switch (*latch_mode) {
+ default:
+ ut_ad(*latch_mode == BTR_SEARCH_LEAF || *latch_mode == BTR_MODIFY_LEAF);
+ return buf_page_optimistic_get(mode, block, pcur->modify_clock, mtr);
+ case BTR_SEARCH_PREV:
+ case BTR_MODIFY_PREV:
+ page_id_t id{0};
+ uint32_t left_page_no;
+ ulint zip_size;
+ buf_block_t *left_block= nullptr;
+ {
+ transactional_shared_lock_guard<block_lock> g{block->page.lock};
+ if (block->modify_clock != pcur->modify_clock)
+ return false;
+ id= block->page.id();
+ zip_size= block->zip_size();
+ left_page_no= btr_page_get_prev(block->page.frame);
+ }
+
+ if (left_page_no != FIL_NULL)
+ {
+ left_block=
+ buf_page_get_gen(page_id_t(id.space(), left_page_no), zip_size,
+ mode, nullptr, BUF_GET_POSSIBLY_FREED, mtr);
+
+ if (left_block &&
+ btr_page_get_next(left_block->page.frame) != id.page_no())
+ {
+release_left_block:
+ mtr->release_last_page();
+ return false;
+ }
+ }
+
+ if (buf_page_optimistic_get(mode, block, pcur->modify_clock, mtr))
+ {
+ if (btr_page_get_prev(block->page.frame) == left_page_no)
+ {
+ /* block was already buffer-fixed while entering the function and
+ buf_page_optimistic_get() buffer-fixes it again. */
+ ut_ad(2 <= block->page.buf_fix_count());
+ *latch_mode= btr_latch_mode(mode);
+ return true;
+ }
+
+ mtr->release_last_page();
+ }
+
+ ut_ad(block->page.buf_fix_count());
+ if (left_block)
+ goto release_left_block;
+ return false;
+ }
+}
+
/** Structure acts as functor to do the latching of leaf pages.
It returns true if latching of leaf pages succeeded and false
otherwise. */
struct optimistic_latch_leaves
{
btr_pcur_t *const cursor;
- ulint *latch_mode;
+ btr_latch_mode *const latch_mode;
mtr_t *const mtr;
- optimistic_latch_leaves(btr_pcur_t *cursor, ulint *latch_mode, mtr_t *mtr)
- :cursor(cursor), latch_mode(latch_mode), mtr(mtr) {}
-
- bool operator() (buf_block_t *hint) const
+ bool operator()(buf_block_t *hint) const
{
- return hint && btr_cur_optimistic_latch_leaves(
- hint, cursor->modify_clock, latch_mode,
- btr_pcur_get_btr_cur(cursor), __FILE__, __LINE__, mtr);
+ return hint &&
+ btr_pcur_optimistic_latch_leaves(hint, cursor, latch_mode, mtr);
}
};
@@ -271,10 +320,8 @@ record GREATER than the user record which was the predecessor of the
supremum.
(4) cursor was positioned before the first or after the last in an
empty tree: restores to before first or after the last in the tree.
-@param latch_mode BTR_SEARCH_LEAF, ...
-@param file file name
-@param line line where called
-@param mtr mtr
+@param latch_mode BTR_SEARCH_LEAF, ...
+@param mtr mini-transaction
@return btr_pcur_t::SAME_ALL cursor position on user rec and points on
the record with the same field values as in the stored record,
btr_pcur_t::SAME_UNIQ cursor position is on user rec and points on the
@@ -282,8 +329,7 @@ record with the same unique field values as in the stored record,
btr_pcur_t::NOT_SAME cursor position is not on user rec or points on
the record with not the samebuniq field values as in the stored */
btr_pcur_t::restore_status
-btr_pcur_t::restore_position(ulint restore_latch_mode, const char *file,
- unsigned line, mtr_t *mtr)
+btr_pcur_t::restore_position(btr_latch_mode restore_latch_mode, mtr_t *mtr)
{
dict_index_t* index;
dtuple_t* tuple;
@@ -292,7 +338,6 @@ btr_pcur_t::restore_position(ulint restore_latch_mode, const char *file,
mem_heap_t* heap;
ut_ad(mtr->is_active());
- //ut_ad(cursor->old_stored);
ut_ad(pos_state == BTR_PCUR_WAS_POSITIONED
|| pos_state == BTR_PCUR_IS_POSITIONED);
@@ -301,23 +346,13 @@ btr_pcur_t::restore_position(ulint restore_latch_mode, const char *file,
if (UNIV_UNLIKELY
(rel_pos == BTR_PCUR_AFTER_LAST_IN_TREE
|| rel_pos == BTR_PCUR_BEFORE_FIRST_IN_TREE)) {
- dberr_t err = DB_SUCCESS;
-
/* In these cases we do not try an optimistic restoration,
but always do a search */
- err = btr_cur_open_at_index_side(
- rel_pos == BTR_PCUR_BEFORE_FIRST_IN_TREE,
- index, restore_latch_mode,
- &btr_cur, 0, mtr);
-
- if (err != DB_SUCCESS) {
- ib::warn() << " Error code: " << err
- << " btr_pcur_t::restore_position "
- << " called from file: "
- << file << " line: " << line
- << " table: " << index->table->name
- << " index: " << index->name;
+ if (btr_cur.open_leaf(rel_pos == BTR_PCUR_BEFORE_FIRST_IN_TREE,
+ index, restore_latch_mode, mtr)
+ != DB_SUCCESS) {
+ return restore_status::CORRUPTED;
}
latch_mode =
@@ -333,24 +368,19 @@ btr_pcur_t::restore_position(ulint restore_latch_mode, const char *file,
ut_a(old_n_core_fields <= index->n_core_fields);
ut_a(old_n_fields);
- switch (restore_latch_mode) {
- case BTR_SEARCH_LEAF:
- case BTR_MODIFY_LEAF:
+ static_assert(BTR_SEARCH_PREV == (4 | BTR_SEARCH_LEAF), "");
+ static_assert(BTR_MODIFY_PREV == (4 | BTR_MODIFY_LEAF), "");
+
+ switch (restore_latch_mode | 4) {
case BTR_SEARCH_PREV:
case BTR_MODIFY_PREV:
/* Try optimistic restoration. */
-
if (block_when_stored.run_with_hint(
- optimistic_latch_leaves(this, &restore_latch_mode,
- mtr))) {
+ optimistic_latch_leaves{this, &restore_latch_mode,
+ mtr})) {
pos_state = BTR_PCUR_IS_POSITIONED;
latch_mode = restore_latch_mode;
- buf_block_dbg_add_level(
- btr_pcur_get_block(this),
- dict_index_is_ibuf(index)
- ? SYNC_IBUF_TREE_NODE : SYNC_TREE_NODE);
-
if (rel_pos == BTR_PCUR_ON) {
#ifdef UNIV_DEBUG
const rec_t* rec;
@@ -421,12 +451,15 @@ btr_pcur_t::restore_position(ulint restore_latch_mode, const char *file,
mode = PAGE_CUR_L;
break;
default:
- ut_error;
+ MY_ASSERT_UNREACHABLE();
mode = PAGE_CUR_UNSUPP;
}
- btr_pcur_open_with_no_init_func(index, tuple, mode, restore_latch_mode,
- this, file, line, mtr);
+ if (btr_pcur_open_with_no_init(tuple, mode, restore_latch_mode,
+ this, mtr) != DB_SUCCESS) {
+ mem_heap_free(heap);
+ return restore_status::CORRUPTED;
+ }
/* Restore the old search mode */
search_mode = old_mode;
@@ -452,7 +485,6 @@ btr_pcur_t::restore_position(ulint restore_latch_mode, const char *file,
block_when_stored.store(btr_pcur_get_block(this));
modify_clock= buf_block_get_modify_clock(
block_when_stored.block());
- old_stored= true;
mem_heap_free(heap);
@@ -478,7 +510,7 @@ Moves the persistent cursor to the first record on the next page. Releases the
latch on the current page, and bufferunfixes it. Note that there must not be
modifications on the current page, as then the x-latch can be released only in
mtr_commit. */
-void
+dberr_t
btr_pcur_move_to_next_page(
/*=======================*/
btr_pcur_t* cursor, /*!< in: persistent cursor; must be on the
@@ -489,49 +521,49 @@ btr_pcur_move_to_next_page(
ut_ad(cursor->latch_mode != BTR_NO_LATCHES);
ut_ad(btr_pcur_is_after_last_on_page(cursor));
- cursor->old_stored = false;
+ cursor->old_rec = nullptr;
const page_t* page = btr_pcur_get_page(cursor);
-
- if (UNIV_UNLIKELY(!page)) {
- return;
- }
-
const uint32_t next_page_no = btr_page_get_next(page);
- ut_ad(next_page_no != FIL_NULL);
+ switch (next_page_no) {
+ case 0:
+ case 1:
+ case FIL_NULL:
+ return DB_CORRUPTION;
+ }
- ulint mode = cursor->latch_mode;
- switch (mode) {
- case BTR_SEARCH_TREE:
- mode = BTR_SEARCH_LEAF;
- break;
- case BTR_MODIFY_TREE:
- mode = BTR_MODIFY_LEAF;
+ if (UNIV_UNLIKELY(next_page_no == btr_pcur_get_block(cursor)
+ ->page.id().page_no())) {
+ return DB_CORRUPTION;
}
+ dberr_t err;
buf_block_t* next_block = btr_block_get(
- *btr_pcur_get_btr_cur(cursor)->index, next_page_no, mode,
- page_is_leaf(page), mtr);
+ *cursor->index(), next_page_no, cursor->latch_mode & ~12,
+ page_is_leaf(page), mtr, &err);
if (UNIV_UNLIKELY(!next_block)) {
- return;
+ return err;
}
const page_t* next_page = buf_block_get_frame(next_block);
-#ifdef UNIV_BTR_DEBUG
- ut_a(page_is_comp(next_page) == page_is_comp(page));
- ut_a(btr_page_get_prev(next_page)
- == btr_pcur_get_block(cursor)->page.id().page_no());
-#endif /* UNIV_BTR_DEBUG */
- btr_leaf_page_release(btr_pcur_get_block(cursor), mode, mtr);
+ if (UNIV_UNLIKELY(memcmp_aligned<4>(next_page + FIL_PAGE_PREV,
+ page + FIL_PAGE_OFFSET, 4))) {
+ return DB_CORRUPTION;
+ }
page_cur_set_before_first(next_block, btr_pcur_get_page_cur(cursor));
ut_d(page_check_dir(next_page));
+
+ const auto s = mtr->get_savepoint();
+ mtr->rollback_to_savepoint(s - 2, s - 1);
+ return DB_SUCCESS;
}
+MY_ATTRIBUTE((nonnull,warn_unused_result))
/*********************************************************//**
Moves the persistent cursor backward if it is on the first record of the page.
Commits mtr. Note that to prevent a possible deadlock, the operation
@@ -542,36 +574,18 @@ return, but it may happen that the cursor is not positioned on the last
record of any page, because the structure of the tree may have changed
during the time when the cursor had no latches. */
static
-void
+bool
btr_pcur_move_backward_from_page(
/*=============================*/
btr_pcur_t* cursor, /*!< in: persistent cursor, must be on the first
record of the current page */
mtr_t* mtr) /*!< in: mtr */
{
- ulint prev_page_no;
- page_t* page;
- buf_block_t* prev_block;
- ulint latch_mode;
- ulint latch_mode2;
-
- ut_ad(cursor->latch_mode != BTR_NO_LATCHES);
ut_ad(btr_pcur_is_before_first_on_page(cursor));
ut_ad(!btr_pcur_is_before_first_in_tree(cursor));
- latch_mode = cursor->latch_mode;
-
- if (latch_mode == BTR_SEARCH_LEAF) {
-
- latch_mode2 = BTR_SEARCH_PREV;
-
- } else if (latch_mode == BTR_MODIFY_LEAF) {
-
- latch_mode2 = BTR_MODIFY_PREV;
- } else {
- latch_mode2 = 0; /* To eliminate compiler warning */
- ut_error;
- }
+ const auto latch_mode = cursor->latch_mode;
+ ut_ad(latch_mode == BTR_SEARCH_LEAF || latch_mode == BTR_MODIFY_LEAF);
btr_pcur_store_position(cursor, mtr);
@@ -579,43 +593,59 @@ btr_pcur_move_backward_from_page(
mtr_start(mtr);
- btr_pcur_restore_position(latch_mode2, cursor, mtr);
-
- page = btr_pcur_get_page(cursor);
-
- prev_page_no = btr_page_get_prev(page);
+ static_assert(BTR_SEARCH_PREV == (4 | BTR_SEARCH_LEAF), "");
+ static_assert(BTR_MODIFY_PREV == (4 | BTR_MODIFY_LEAF), "");
- if (prev_page_no == FIL_NULL) {
- } else if (btr_pcur_is_before_first_on_page(cursor)) {
-
- prev_block = btr_pcur_get_btr_cur(cursor)->left_block;
-
- btr_leaf_page_release(btr_pcur_get_block(cursor),
- latch_mode, mtr);
-
- page_cur_set_after_last(prev_block,
- btr_pcur_get_page_cur(cursor));
- } else {
-
- /* The repositioned cursor did not end on an infimum
- record on a page. Cursor repositioning acquired a latch
- also on the previous page, but we do not need the latch:
- release it. */
-
- prev_block = btr_pcur_get_btr_cur(cursor)->left_block;
+ if (UNIV_UNLIKELY(cursor->restore_position(
+ btr_latch_mode(4 | latch_mode), mtr)
+ == btr_pcur_t::CORRUPTED)) {
+ return true;
+ }
- btr_leaf_page_release(prev_block, latch_mode, mtr);
+ buf_block_t* block = btr_pcur_get_block(cursor);
+
+ if (page_has_prev(block->page.frame)) {
+ buf_block_t* left_block
+ = mtr->at_savepoint(mtr->get_savepoint() - 1);
+ const page_t* const left = left_block->page.frame;
+ if (memcmp_aligned<4>(left + FIL_PAGE_NEXT,
+ block->page.frame
+ + FIL_PAGE_OFFSET, 4)) {
+ /* This should be the right sibling page, or
+ if there is none, the current block. */
+ ut_ad(left_block == block
+ || !memcmp_aligned<4>(left + FIL_PAGE_PREV,
+ block->page.frame
+ + FIL_PAGE_OFFSET, 4));
+ /* The previous one must be the left sibling. */
+ left_block
+ = mtr->at_savepoint(mtr->get_savepoint() - 2);
+ ut_ad(!memcmp_aligned<4>(left_block->page.frame
+ + FIL_PAGE_NEXT,
+ block->page.frame
+ + FIL_PAGE_OFFSET, 4));
+ }
+ if (btr_pcur_is_before_first_on_page(cursor)) {
+ page_cur_set_after_last(left_block,
+ &cursor->btr_cur.page_cur);
+ /* Release the right sibling. */
+ } else {
+ /* Release the left sibling. */
+ block = left_block;
+ }
+ mtr->release(*block);
}
cursor->latch_mode = latch_mode;
- cursor->old_stored = false;
+ cursor->old_rec = nullptr;
+ return false;
}
/*********************************************************//**
Moves the persistent cursor to the previous record in the tree. If no records
are left, the cursor stays 'before first in tree'.
@return TRUE if the cursor was not before first in tree */
-ibool
+bool
btr_pcur_move_to_prev(
/*==================*/
btr_pcur_t* cursor, /*!< in: persistent cursor; NOTE that the
@@ -625,60 +655,12 @@ btr_pcur_move_to_prev(
ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
ut_ad(cursor->latch_mode != BTR_NO_LATCHES);
- cursor->old_stored = false;
+ cursor->old_rec = nullptr;
if (btr_pcur_is_before_first_on_page(cursor)) {
-
- if (btr_pcur_is_before_first_in_tree(cursor)) {
-
- return(FALSE);
- }
-
- btr_pcur_move_backward_from_page(cursor, mtr);
-
- return(TRUE);
+ return (!btr_pcur_is_before_first_in_tree(cursor)
+ && !btr_pcur_move_backward_from_page(cursor, mtr));
}
- btr_pcur_move_to_prev_on_page(cursor);
-
- return(TRUE);
-}
-
-/**************************************************************//**
-If mode is PAGE_CUR_G or PAGE_CUR_GE, opens a persistent cursor on the first
-user record satisfying the search condition, in the case PAGE_CUR_L or
-PAGE_CUR_LE, on the last user record. If no such user record exists, then
-in the first case sets the cursor after last in tree, and in the latter case
-before first in tree. The latching mode must be BTR_SEARCH_LEAF or
-BTR_MODIFY_LEAF. */
-void
-btr_pcur_open_on_user_rec_func(
-/*===========================*/
- dict_index_t* index, /*!< in: index */
- const dtuple_t* tuple, /*!< in: tuple on which search done */
- page_cur_mode_t mode, /*!< in: PAGE_CUR_L, ... */
- ulint latch_mode, /*!< in: BTR_SEARCH_LEAF or
- BTR_MODIFY_LEAF */
- btr_pcur_t* cursor, /*!< in: memory buffer for persistent
- cursor */
- const char* file, /*!< in: file name */
- unsigned line, /*!< in: line where called */
- mtr_t* mtr) /*!< in: mtr */
-{
- btr_pcur_open_low(index, 0, tuple, mode, latch_mode, cursor,
- file, line, 0, mtr);
-
- if ((mode == PAGE_CUR_GE) || (mode == PAGE_CUR_G)) {
-
- if (btr_pcur_is_after_last_on_page(cursor)) {
-
- btr_pcur_move_to_next_user_rec(cursor, mtr);
- }
- } else {
- ut_ad((mode == PAGE_CUR_LE) || (mode == PAGE_CUR_L));
-
- /* Not implemented yet */
-
- ut_error;
- }
+ return btr_pcur_move_to_prev_on_page(cursor) != nullptr;
}
diff --git a/storage/innobase/btr/btr0sea.cc b/storage/innobase/btr/btr0sea.cc
index d277c1cc362..a1609248512 100644
--- a/storage/innobase/btr/btr0sea.cc
+++ b/storage/innobase/btr/btr0sea.cc
@@ -55,6 +55,10 @@ ulint btr_search_n_succ = 0;
ulint btr_search_n_hash_fail = 0;
#endif /* UNIV_SEARCH_PERF_STAT */
+#ifdef UNIV_PFS_RWLOCK
+mysql_pfs_key_t btr_search_latch_key;
+#endif /* UNIV_PFS_RWLOCK */
+
/** The adaptive hash index */
btr_search_sys_t btr_search_sys;
@@ -177,14 +181,14 @@ static void btr_search_check_free_space_in_heap(const dict_index_t *index)
buf_block_t *block= buf_block_alloc();
auto part= btr_search_sys.get_part(*index);
- rw_lock_x_lock(&part->latch);
+ part->latch.wr_lock(SRW_LOCK_CALL);
if (!btr_search_enabled || part->heap->free_block)
buf_block_free(block);
else
part->heap->free_block= block;
- rw_lock_x_unlock(&part->latch);
+ part->latch.wr_unlock();
}
/** Set index->ref_count = 0 on all indexes of a table.
@@ -201,24 +205,24 @@ ATTRIBUTE_COLD static void btr_search_lazy_free(dict_index_t *index)
{
ut_ad(index->freed());
dict_table_t *table= index->table;
- table->autoinc_mutex.lock();
+ table->autoinc_mutex.wr_lock();
/* Perform the skipped steps of dict_index_remove_from_cache_low(). */
UT_LIST_REMOVE(table->freed_indexes, index);
- rw_lock_free(&index->lock);
+ index->lock.free();
dict_mem_index_free(index);
if (!UT_LIST_GET_LEN(table->freed_indexes) &&
!UT_LIST_GET_LEN(table->indexes))
{
ut_ad(!table->id);
- table->autoinc_mutex.unlock();
- table->autoinc_mutex.~mutex();
+ table->autoinc_mutex.wr_unlock();
+ table->autoinc_mutex.destroy();
dict_mem_table_free(table);
return;
}
- table->autoinc_mutex.unlock();
+ table->autoinc_mutex.wr_unlock();
}
/** Disable the adaptive hash search system and empty the index. */
@@ -226,12 +230,12 @@ void btr_search_disable()
{
dict_table_t* table;
- mutex_enter(&dict_sys.mutex);
+ dict_sys.freeze(SRW_LOCK_CALL);
btr_search_x_lock_all();
if (!btr_search_enabled) {
- mutex_exit(&dict_sys.mutex);
+ dict_sys.unfreeze();
btr_search_x_unlock_all();
return;
}
@@ -252,7 +256,7 @@ void btr_search_disable()
btr_search_disable_ref_count(table);
}
- mutex_exit(&dict_sys.mutex);
+ dict_sys.unfreeze();
/* Set all block->index = NULL. */
buf_pool.clear_hash_index();
@@ -296,18 +300,11 @@ is NOT protected by any semaphore, to save CPU time! Do not assume its fields
are consistent.
@param[in,out] info search info
@param[in] cursor cursor which was just positioned */
-static
-void
-btr_search_info_update_hash(
- btr_search_t* info,
- btr_cur_t* cursor)
+static void btr_search_info_update_hash(btr_search_t *info, btr_cur_t *cursor)
{
- dict_index_t* index = cursor->index;
+ dict_index_t* index = cursor->index();
int cmp;
- ut_ad(!btr_search_own_any(RW_LOCK_S));
- ut_ad(!btr_search_own_any(RW_LOCK_X));
-
if (dict_index_is_ibuf(index)) {
/* So many deletes are performed on an insert buffer tree
that we do not consider a hash index useful on it: */
@@ -412,16 +409,10 @@ static
bool
btr_search_update_block_hash_info(btr_search_t* info, buf_block_t* block)
{
- ut_ad(!btr_search_own_any());
- ut_ad(rw_lock_own_flagged(&block->lock,
- RW_LOCK_FLAG_X | RW_LOCK_FLAG_S));
+ ut_ad(block->page.lock.have_x() || block->page.lock.have_s());
info->last_hash_succ = FALSE;
- ut_d(auto state= block->page.state());
- ut_ad(state == BUF_BLOCK_NOT_USED
- || state == BUF_BLOCK_FILE_PAGE
- || state == BUF_BLOCK_MEMORY
- || state == BUF_BLOCK_REMOVE_HASH);
+ ut_ad(block->page.frame);
ut_ad(info->magic_n == BTR_SEARCH_MAGIC_N);
if ((block->n_hash_helps > 0)
@@ -449,13 +440,13 @@ btr_search_update_block_hash_info(btr_search_t* info, buf_block_t* block)
block->left_side = info->left_side;
}
- if ((block->n_hash_helps > page_get_n_recs(block->frame)
+ if ((block->n_hash_helps > page_get_n_recs(block->page.frame)
/ BTR_SEARCH_PAGE_BUILD_LIMIT)
&& (info->n_hash_potential >= BTR_SEARCH_BUILD_LIMIT)) {
if ((!block->index)
|| (block->n_hash_helps
- > 2U * page_get_n_recs(block->frame))
+ > 2U * page_get_n_recs(block->page.frame))
|| (block->n_fields != block->curr_n_fields)
|| (block->n_bytes != block->curr_n_bytes)
|| (block->left_side != block->curr_left_side)) {
@@ -494,7 +485,7 @@ static bool ha_insert_for_fold(hash_table_t *table, mem_heap_t* heap,
const rec_t *data)
{
#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
- ut_a(block->frame == page_align(data));
+ ut_a(block->page.frame == page_align(data));
#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
ut_ad(btr_search_enabled);
@@ -507,7 +498,7 @@ static bool ha_insert_for_fold(hash_table_t *table, mem_heap_t* heap,
{
#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
buf_block_t *prev_block= prev->block;
- ut_a(prev_block->frame == page_align(prev->data));
+ ut_a(prev_block->page.frame == page_align(prev->data));
ut_a(prev_block->n_pointers-- < MAX_N_POINTERS);
ut_a(block->n_pointers++ < MAX_N_POINTERS);
@@ -555,7 +546,7 @@ static void ha_delete_hash_node(hash_table_t *table, mem_heap_t *heap,
{
ut_ad(btr_search_enabled);
#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
- ut_a(del_node->block->frame == page_align(del_node->data));
+ ut_a(del_node->block->page.frame == page_align(del_node->data));
ut_a(del_node->block->n_pointers-- < MAX_N_POINTERS);
#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
@@ -655,7 +646,7 @@ static bool ha_search_and_update_if_found(hash_table_t *table, ulint fold,
const rec_t *new_data)
{
#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
- ut_a(new_block->frame == page_align(new_data));
+ ut_a(new_block->page.frame == page_align(new_data));
#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
if (!btr_search_enabled)
@@ -702,10 +693,9 @@ btr_search_update_hash_ref(
{
ut_ad(cursor->flag == BTR_CUR_HASH_FAIL);
- ut_ad(rw_lock_own_flagged(&block->lock,
- RW_LOCK_FLAG_X | RW_LOCK_FLAG_S));
- ut_ad(page_align(btr_cur_get_rec(cursor)) == block->frame);
- ut_ad(page_is_leaf(block->frame));
+ ut_ad(block->page.lock.have_x() || block->page.lock.have_s());
+ ut_ad(page_align(btr_cur_get_rec(cursor)) == block->page.frame);
+ ut_ad(page_is_leaf(block->page.frame));
assert_block_ahi_valid(block);
dict_index_t* index = block->index;
@@ -714,17 +704,17 @@ btr_search_update_hash_ref(
return;
}
- if (index != cursor->index) {
- ut_ad(index->id == cursor->index->id);
+ if (index != cursor->index()) {
+ ut_ad(index->id == cursor->index()->id);
btr_search_drop_page_hash_index(block, false);
return;
}
ut_ad(block->page.id().space() == index->table->space_id);
- ut_ad(index == cursor->index);
+ ut_ad(index == cursor->index());
ut_ad(!dict_index_is_ibuf(index));
auto part = btr_search_sys.get_part(*index);
- rw_lock_x_lock(&part->latch);
+ part->latch.wr_lock(SRW_LOCK_CALL);
ut_ad(!block->index || block->index == index);
if (block->index
@@ -759,7 +749,7 @@ btr_search_update_hash_ref(
}
func_exit:
- rw_lock_x_unlock(&part->latch);
+ part->latch.wr_unlock();
}
/** Checks if a guessed position for a tree cursor is right. Note that if
@@ -791,20 +781,32 @@ btr_search_check_guess(
mem_heap_t* heap = NULL;
rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
rec_offs* offsets = offsets_;
- ibool success = FALSE;
+ bool success = false;
rec_offs_init(offsets_);
- n_unique = dict_index_get_n_unique_in_tree(cursor->index);
+ n_unique = dict_index_get_n_unique_in_tree(cursor->index());
rec = btr_cur_get_rec(cursor);
- ut_ad(page_rec_is_user_rec(rec));
- ut_ad(page_rec_is_leaf(rec));
+ if (UNIV_UNLIKELY(!page_rec_is_user_rec(rec)
+ || !page_rec_is_leaf(rec))) {
+ ut_ad("corrupted index" == 0);
+ return false;
+ } else if (cursor->index()->table->not_redundant()) {
+ switch (rec_get_status(rec)) {
+ case REC_STATUS_INSTANT:
+ case REC_STATUS_ORDINARY:
+ break;
+ default:
+ ut_ad("corrupted index" == 0);
+ return false;
+ }
+ }
match = 0;
- offsets = rec_get_offsets(rec, cursor->index, offsets,
- cursor->index->n_core_fields,
+ offsets = rec_get_offsets(rec, cursor->index(), offsets,
+ cursor->index()->n_core_fields,
n_unique, &heap);
cmp = cmp_dtuple_rec_with_match(tuple, rec, offsets, &match);
@@ -816,7 +818,7 @@ btr_search_check_guess(
cursor->up_match = match;
if (match >= n_unique) {
- success = TRUE;
+ success = true;
goto exit_func;
}
} else if (mode == PAGE_CUR_LE) {
@@ -845,17 +847,31 @@ btr_search_check_guess(
match = 0;
if ((mode == PAGE_CUR_G) || (mode == PAGE_CUR_GE)) {
- ut_ad(!page_rec_is_infimum(rec));
-
const rec_t* prev_rec = page_rec_get_prev(rec);
+ if (UNIV_UNLIKELY(!prev_rec)) {
+ ut_ad("corrupted index" == 0);
+ goto exit_func;
+ }
+
if (page_rec_is_infimum(prev_rec)) {
success = !page_has_prev(page_align(prev_rec));
goto exit_func;
}
- offsets = rec_get_offsets(prev_rec, cursor->index, offsets,
- cursor->index->n_core_fields,
+ if (cursor->index()->table->not_redundant()) {
+ switch (rec_get_status(prev_rec)) {
+ case REC_STATUS_INSTANT:
+ case REC_STATUS_ORDINARY:
+ break;
+ default:
+ ut_ad("corrupted index" == 0);
+ goto exit_func;
+ }
+ }
+
+ offsets = rec_get_offsets(prev_rec, cursor->index(), offsets,
+ cursor->index()->n_core_fields,
n_unique, &heap);
cmp = cmp_dtuple_rec_with_match(
tuple, prev_rec, offsets, &match);
@@ -869,17 +885,33 @@ btr_search_check_guess(
const rec_t* next_rec = page_rec_get_next(rec);
+ if (UNIV_UNLIKELY(!next_rec)) {
+ ut_ad("corrupted index" == 0);
+ goto exit_func;
+ }
+
if (page_rec_is_supremum(next_rec)) {
if (!page_has_next(page_align(next_rec))) {
cursor->up_match = 0;
- success = TRUE;
+ success = true;
}
goto exit_func;
}
- offsets = rec_get_offsets(next_rec, cursor->index, offsets,
- cursor->index->n_core_fields,
+ if (cursor->index()->table->not_redundant()) {
+ switch (rec_get_status(next_rec)) {
+ case REC_STATUS_INSTANT:
+ case REC_STATUS_ORDINARY:
+ break;
+ default:
+ ut_ad("corrupted index" == 0);
+ goto exit_func;
+ }
+ }
+
+ offsets = rec_get_offsets(next_rec, cursor->index(), offsets,
+ cursor->index()->n_core_fields,
n_unique, &heap);
cmp = cmp_dtuple_rec_with_match(
tuple, next_rec, offsets, &match);
@@ -917,7 +949,6 @@ btr_search_failure(btr_search_t* info, btr_cur_t* cursor)
/** Clear the adaptive hash index on all pages in the buffer pool. */
inline void buf_pool_t::clear_hash_index()
{
- ut_ad(btr_search_own_all(RW_LOCK_X));
ut_ad(!resizing);
ut_ad(!btr_search_enabled);
@@ -932,7 +963,7 @@ inline void buf_pool_t::clear_hash_index()
assert_block_ahi_valid(block);
/* We can clear block->index and block->n_pointers when
- btr_search_own_all(RW_LOCK_X); see the comments in buf0buf.h */
+ holding all AHI latches exclusively; see the comments in buf0buf.h */
if (!index)
{
@@ -942,15 +973,15 @@ inline void buf_pool_t::clear_hash_index()
continue;
}
- ut_d(buf_page_state state= block->page.state());
+ ut_d(const auto s= block->page.state());
/* Another thread may have set the state to
- BUF_BLOCK_REMOVE_HASH in buf_LRU_block_remove_hashed().
+ REMOVE_HASH in buf_LRU_block_remove_hashed().
The state change in buf_pool_t::realloc() is not observable
here, because in that case we would have !block->index.
In the end, the entire adaptive hash index will be removed. */
- ut_ad(state == BUF_BLOCK_FILE_PAGE || state == BUF_BLOCK_REMOVE_HASH);
+ ut_ad(s >= buf_page_t::UNFIXED || s == buf_page_t::REMOVE_HASH);
# if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
block->n_pointers= 0;
# endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
@@ -981,18 +1012,19 @@ inline buf_block_t* buf_pool_t::block_from_ahi(const byte *ptr) const
? chunk_map->rbegin()->second
: (--it)->second;
- const size_t offs= size_t(ptr - chunk->blocks->frame) >> srv_page_size_shift;
+ const size_t offs= size_t(ptr - chunk->blocks->page.frame) >>
+ srv_page_size_shift;
ut_a(offs < chunk->size);
buf_block_t *block= &chunk->blocks[offs];
/* buf_pool_t::chunk_t::init() invokes buf_block_init() so that
- block[n].frame == block->frame + n * srv_page_size. Check it. */
- ut_ad(block->frame == page_align(ptr));
+ block[n].frame == block->page.frame + n * srv_page_size. Check it. */
+ ut_ad(block->page.frame == page_align(ptr));
/* Read the state of the block without holding hash_lock.
- A state transition from BUF_BLOCK_FILE_PAGE to
- BUF_BLOCK_REMOVE_HASH is possible during this execution. */
- ut_d(const buf_page_state state = block->page.state());
- ut_ad(state == BUF_BLOCK_FILE_PAGE || state == BUF_BLOCK_REMOVE_HASH);
+ A state transition to REMOVE_HASH is possible during
+ this execution. */
+ ut_ad(block->page.state() >= buf_page_t::REMOVE_HASH);
+
return block;
}
@@ -1004,14 +1036,11 @@ both have sensible values.
@param[in,out] info index search info
@param[in] tuple logical record
@param[in] mode PAGE_CUR_L, ....
-@param[in] latch_mode BTR_SEARCH_LEAF, ...;
- NOTE that only if has_search_latch is 0, we will
- have a latch set on the cursor page, otherwise
- we assume the caller uses his search latch
- to protect the record!
+@param[in] latch_mode BTR_SEARCH_LEAF, ...
@param[out] cursor tree cursor
-@param[in] mtr mini transaction
+@param[in] mtr mini-transaction
@return whether the search succeeded */
+TRANSACTIONAL_TARGET
bool
btr_search_guess_on_hash(
dict_index_t* index,
@@ -1026,27 +1055,24 @@ btr_search_guess_on_hash(
index_id_t index_id;
ut_ad(mtr->is_active());
-
- if (!btr_search_enabled) {
- return false;
- }
-
- ut_ad(!index->is_ibuf());
- ut_ad((latch_mode == BTR_SEARCH_LEAF)
- || (latch_mode == BTR_MODIFY_LEAF));
- compile_time_assert(ulint{BTR_SEARCH_LEAF} == ulint{RW_S_LATCH});
- compile_time_assert(ulint{BTR_MODIFY_LEAF} == ulint{RW_X_LATCH});
-
- /* Not supported for spatial index */
- ut_ad(!dict_index_is_spatial(index));
+ ut_ad(index->is_btree() || index->is_ibuf());
/* Note that, for efficiency, the struct info may not be protected by
any latch here! */
- if (info->n_hash_potential == 0) {
+ if (latch_mode > BTR_MODIFY_LEAF
+ || !info->last_hash_succ || !info->n_hash_potential
+ || (tuple->info_bits & REC_INFO_MIN_REC_FLAG)) {
return false;
}
+ ut_ad(index->is_btree());
+ ut_ad(!index->table->is_temporary());
+
+ ut_ad(latch_mode == BTR_SEARCH_LEAF || latch_mode == BTR_MODIFY_LEAF);
+ compile_time_assert(ulint{BTR_SEARCH_LEAF} == ulint{RW_S_LATCH});
+ compile_time_assert(ulint{BTR_MODIFY_LEAF} == ulint{RW_X_LATCH});
+
cursor->n_fields = info->n_fields;
cursor->n_bytes = info->n_bytes;
@@ -1067,83 +1093,73 @@ btr_search_guess_on_hash(
auto part = btr_search_sys.get_part(*index);
const rec_t* rec;
- rw_lock_s_lock(&part->latch);
+ part->latch.rd_lock(SRW_LOCK_CALL);
if (!btr_search_enabled) {
- goto fail;
+ goto ahi_release_and_fail;
}
rec = static_cast<const rec_t*>(
ha_search_and_get_data(&part->table, fold));
if (!rec) {
+ahi_release_and_fail:
+ part->latch.rd_unlock();
fail:
- rw_lock_s_unlock(&part->latch);
-
btr_search_failure(info, cursor);
return false;
}
buf_block_t* block = buf_pool.block_from_ahi(rec);
- page_hash_latch* hash_lock = buf_pool.hash_lock_get(block->page.id());
- hash_lock->read_lock();
-
- if (block->page.state() == BUF_BLOCK_REMOVE_HASH) {
- /* Another thread is just freeing the block
- from the LRU list of the buffer pool: do not
- try to access this page. */
- hash_lock->read_unlock();
- goto fail;
+ buf_pool_t::hash_chain& chain = buf_pool.page_hash.cell_get(
+ block->page.id().fold());
+ bool got_latch;
+ {
+ transactional_shared_lock_guard<page_hash_latch> g{
+ buf_pool.page_hash.lock_get(chain)};
+ got_latch = (latch_mode == BTR_SEARCH_LEAF)
+ ? block->page.lock.s_lock_try()
+ : block->page.lock.x_lock_try();
}
- const bool fail = index != block->index
- && index_id == block->index->id;
- ut_a(!fail || block->index->freed());
- ut_ad(block->page.state() == BUF_BLOCK_FILE_PAGE);
- DBUG_ASSERT(fail || block->page.status != buf_page_t::FREED);
-
- buf_block_buf_fix_inc(block, __FILE__, __LINE__);
- hash_lock->read_unlock();
- block->page.set_accessed();
+ if (!got_latch) {
+ goto ahi_release_and_fail;
+ }
- buf_page_make_young_if_needed(&block->page);
- mtr_memo_type_t fix_type;
- if (latch_mode == BTR_SEARCH_LEAF) {
- if (!rw_lock_s_lock_nowait(&block->lock, __FILE__, __LINE__)) {
-got_no_latch:
- buf_block_buf_fix_dec(block);
- goto fail;
- }
- fix_type = MTR_MEMO_PAGE_S_FIX;
- } else {
- if (!rw_lock_x_lock_func_nowait_inline(
- &block->lock, __FILE__, __LINE__)) {
- goto got_no_latch;
+ const auto state = block->page.state();
+ if (UNIV_UNLIKELY(state < buf_page_t::UNFIXED)) {
+ ut_ad(state == buf_page_t::REMOVE_HASH);
+block_and_ahi_release_and_fail:
+ if (latch_mode == BTR_SEARCH_LEAF) {
+ block->page.lock.s_unlock();
+ } else {
+ block->page.lock.x_unlock();
}
- fix_type = MTR_MEMO_PAGE_X_FIX;
+ goto ahi_release_and_fail;
}
- mtr->memo_push(block, fix_type);
- buf_pool.stat.n_page_gets++;
+ ut_ad(state < buf_page_t::READ_FIX || state >= buf_page_t::WRITE_FIX);
+ ut_ad(state < buf_page_t::READ_FIX || latch_mode == BTR_SEARCH_LEAF);
- rw_lock_s_unlock(&part->latch);
-
- buf_block_dbg_add_level(block, SYNC_TREE_NODE_FROM_HASH);
- if (UNIV_UNLIKELY(fail)) {
- goto fail_and_release_page;
+ if (index != block->index && index_id == block->index->id) {
+ ut_a(block->index->freed());
+ goto block_and_ahi_release_and_fail;
}
- if (block->page.state() != BUF_BLOCK_FILE_PAGE) {
+ block->page.fix();
+ block->page.set_accessed();
+ buf_page_make_young_if_needed(&block->page);
+ static_assert(ulint{MTR_MEMO_PAGE_S_FIX} == ulint{BTR_SEARCH_LEAF},
+ "");
+ static_assert(ulint{MTR_MEMO_PAGE_X_FIX} == ulint{BTR_MODIFY_LEAF},
+ "");
- ut_ad(block->page.state() == BUF_BLOCK_REMOVE_HASH);
+ part->latch.rd_unlock();
-fail_and_release_page:
- btr_leaf_page_release(block, latch_mode, mtr);
+ ++buf_pool.stat.n_page_gets;
- btr_search_failure(info, cursor);
- return false;
- }
+ mtr->memo_push(block, mtr_memo_type_t(latch_mode));
ut_ad(page_rec_is_user_rec(rec));
@@ -1156,9 +1172,10 @@ fail_and_release_page:
is positioned on. We cannot look at the next of the previous
record to determine if our guess for the cursor position is
right. */
- if (index_id != btr_page_get_index_id(block->frame)
- || !btr_search_check_guess(cursor, 0, tuple, mode)) {
- goto fail_and_release_page;
+ if (index_id != btr_page_get_index_id(block->page.frame)
+ || !btr_search_check_guess(cursor, false, tuple, mode)) {
+ mtr->release_last_page();
+ goto fail;
}
if (info->n_hash_potential < BTR_SEARCH_BUILD_LIMIT + 5) {
@@ -1166,41 +1183,6 @@ fail_and_release_page:
info->n_hash_potential++;
}
-#ifdef notdefined
- /* These lines of code can be used in a debug version to check
- the correctness of the searched cursor position: */
-
- info->last_hash_succ = FALSE;
-
- /* Currently, does not work if the following fails: */
- ut_ad(!ahi_latch);
-
- btr_leaf_page_release(block, latch_mode, mtr);
-
- btr_cur_search_to_nth_level(
- index, 0, tuple, mode, latch_mode, &cursor2, 0, mtr);
-
- if (mode == PAGE_CUR_GE
- && page_rec_is_supremum(btr_cur_get_rec(&cursor2))) {
-
- /* If mode is PAGE_CUR_GE, then the binary search
- in the index tree may actually take us to the supremum
- of the previous page */
-
- info->last_hash_succ = FALSE;
-
- btr_pcur_open_on_user_rec(
- index, tuple, mode, latch_mode, &pcur, mtr);
-
- ut_ad(btr_pcur_get_rec(&pcur) == btr_cur_get_rec(cursor));
- } else {
- ut_ad(btr_cur_get_rec(&cursor2) == btr_cur_get_rec(cursor));
- }
-
- /* NOTE that it is theoretically possible that the above assertions
- fail if the page of the cursor gets removed from the buffer pool
- meanwhile! Thus it might not be a bug. */
-#endif
info->last_hash_succ = TRUE;
#ifdef UNIV_SEARCH_PERF_STAT
@@ -1222,56 +1204,48 @@ void btr_search_drop_page_hash_index(buf_block_t* block,
{
ulint n_fields;
ulint n_bytes;
- const page_t* page;
const rec_t* rec;
- ulint fold;
- ulint prev_fold;
- ulint n_cached;
- ulint n_recs;
- ulint* folds;
- ulint i;
mem_heap_t* heap;
rec_offs* offsets;
retry:
- ut_ad(!btr_search_own_any(RW_LOCK_S));
- ut_ad(!btr_search_own_any(RW_LOCK_X));
-
if (!block->index) {
return;
}
- ut_ad(!block->page.buf_fix_count()
- || block->page.state() == BUF_BLOCK_REMOVE_HASH
- || rw_lock_own_flagged(&block->lock,
- RW_LOCK_FLAG_X | RW_LOCK_FLAG_S
- | RW_LOCK_FLAG_SX));
- ut_ad(page_is_leaf(block->frame));
+ ut_d(const auto state = block->page.state());
+ ut_ad(state == buf_page_t::REMOVE_HASH
+ || state >= buf_page_t::UNFIXED);
+ ut_ad(state == buf_page_t::REMOVE_HASH
+ || !(~buf_page_t::LRU_MASK & state)
+ || block->page.lock.have_any());
+ ut_ad(state < buf_page_t::READ_FIX || state >= buf_page_t::WRITE_FIX);
+ ut_ad(page_is_leaf(block->page.frame));
/* We must not dereference block->index here, because it could be freed
- if (index->table->n_ref_count == 0 && !mutex_own(&dict_sys.mutex)).
+ if (!index->table->get_ref_count() && !dict_sys.frozen()).
Determine the ahi_slot based on the block contents. */
const index_id_t index_id
- = btr_page_get_index_id(block->frame);
+ = btr_page_get_index_id(block->page.frame);
auto part = btr_search_sys.get_part(index_id,
block->page.id().space());
- rw_lock_s_lock(&part->latch);
+ part->latch.rd_lock(SRW_LOCK_CALL);
dict_index_t* index = block->index;
bool is_freed = index && index->freed();
if (is_freed) {
- rw_lock_s_unlock(&part->latch);
- rw_lock_x_lock(&part->latch);
+ part->latch.rd_unlock();
+ part->latch.wr_lock(SRW_LOCK_CALL);
if (index != block->index) {
- rw_lock_x_unlock(&part->latch);
+ part->latch.wr_unlock();
goto retry;
}
} else if (garbage_collect) {
- rw_lock_s_unlock(&part->latch);
+ part->latch.rd_unlock();
return;
}
@@ -1279,16 +1253,14 @@ retry:
if (!index || !btr_search_enabled) {
if (is_freed) {
- rw_lock_x_unlock(&part->latch);
+ part->latch.wr_unlock();
} else {
- rw_lock_s_unlock(&part->latch);
+ part->latch.rd_unlock();
}
return;
}
-#ifdef MYSQL_INDEX_DISABLE_AHI
- ut_ad(!index->disable_ahi);
-#endif
+ ut_ad(!index->table->is_temporary());
ut_ad(btr_search_enabled);
ut_ad(block->page.id().space() == index->table->space_id);
@@ -1302,38 +1274,55 @@ retry:
releasing search latch, as the index page might only be s-latched! */
if (!is_freed) {
- rw_lock_s_unlock(&part->latch);
+ part->latch.rd_unlock();
}
ut_a(n_fields > 0 || n_bytes > 0);
- page = block->frame;
- n_recs = page_get_n_recs(page);
+ const page_t* const page = block->page.frame;
+ ulint n_recs = page_get_n_recs(page);
+ if (!n_recs) {
+ ut_ad("corrupted adaptive hash index" == 0);
+ return;
+ }
/* Calculate and cache fold values into an array for fast deletion
from the hash index */
- folds = (ulint*) ut_malloc_nokey(n_recs * sizeof(ulint));
-
- n_cached = 0;
-
rec = page_get_infimum_rec(page);
rec = page_rec_get_next_low(rec, page_is_comp(page));
- if (rec_is_metadata(rec, *index)) {
+
+ ulint* folds;
+ ulint n_cached = 0;
+ ulint prev_fold = 0;
+
+ if (rec && rec_is_metadata(rec, *index)) {
rec = page_rec_get_next_low(rec, page_is_comp(page));
+ if (!--n_recs) {
+ /* The page only contains the hidden metadata record
+ for instant ALTER TABLE that the adaptive hash index
+ never points to. */
+ folds = nullptr;
+ goto all_deleted;
+ }
}
- prev_fold = 0;
-
- heap = NULL;
- offsets = NULL;
+ folds = (ulint*) ut_malloc_nokey(n_recs * sizeof(ulint));
+ heap = nullptr;
+ offsets = nullptr;
- while (!page_rec_is_supremum(rec)) {
+ while (rec) {
+ if (n_cached >= n_recs) {
+ ut_ad(page_rec_is_supremum(rec));
+ break;
+ }
+ ut_ad(page_rec_is_user_rec(rec));
offsets = rec_get_offsets(
rec, index, offsets, index->n_core_fields,
btr_search_get_n_fields(n_fields, n_bytes),
&heap);
- fold = rec_fold(rec, offsets, n_fields, n_bytes, index_id);
+ const ulint fold = rec_fold(rec, offsets, n_fields, n_bytes,
+ index_id);
if (fold == prev_fold && prev_fold != 0) {
@@ -1342,11 +1331,13 @@ retry:
/* Remove all hash nodes pointing to this page from the
hash chain */
+ folds[n_cached++] = fold;
- folds[n_cached] = fold;
- n_cached++;
next_rec:
rec = page_rec_get_next_low(rec, page_rec_is_comp(rec));
+ if (!rec || page_rec_is_supremum(rec)) {
+ break;
+ }
prev_fold = fold;
}
@@ -1354,8 +1345,9 @@ next_rec:
mem_heap_free(heap);
}
+all_deleted:
if (!is_freed) {
- rw_lock_x_lock(&part->latch);
+ part->latch.wr_lock(SRW_LOCK_CALL);
if (UNIV_UNLIKELY(!block->index)) {
/* Someone else has meanwhile dropped the
@@ -1372,13 +1364,13 @@ next_rec:
/* Someone else has meanwhile built a new hash index on the
page, with different parameters */
- rw_lock_x_unlock(&part->latch);
+ part->latch.wr_unlock();
ut_free(folds);
goto retry;
}
- for (i = 0; i < n_cached; i++) {
+ for (ulint i = 0; i < n_cached; i++) {
ha_remove_all_nodes_to_page(&part->table, part->heap,
folds[i], page);
}
@@ -1392,14 +1384,14 @@ next_rec:
}
}
- block->index = NULL;
+ block->index = nullptr;
MONITOR_INC(MONITOR_ADAPTIVE_HASH_PAGE_REMOVED);
MONITOR_INC_VALUE(MONITOR_ADAPTIVE_HASH_ROW_REMOVED, n_cached);
cleanup:
assert_block_ahi_valid(block);
- rw_lock_x_unlock(&part->latch);
+ part->latch.wr_unlock();
ut_free(folds);
}
@@ -1411,7 +1403,6 @@ void btr_search_drop_page_hash_when_freed(const page_id_t page_id)
{
buf_block_t* block;
mtr_t mtr;
- dberr_t err = DB_SUCCESS;
mtr_start(&mtr);
@@ -1422,26 +1413,15 @@ void btr_search_drop_page_hash_when_freed(const page_id_t page_id)
(recursively) x-latch it, even though we are only reading. */
block = buf_page_get_gen(page_id, 0, RW_X_LATCH, NULL,
- BUF_PEEK_IF_IN_POOL, __FILE__, __LINE__,
- &mtr, &err);
-
- if (block) {
-
- /* If AHI is still valid, page can't be in free state.
- AHI is dropped when page is freed. */
- DBUG_ASSERT(block->page.status != buf_page_t::FREED);
-
- buf_block_dbg_add_level(block, SYNC_TREE_NODE_FROM_HASH);
-
- dict_index_t* index = block->index;
- if (index != NULL) {
- /* In all our callers, the table handle should
- be open, or we should be in the process of
- dropping the table (preventing eviction). */
- ut_ad(index->table->get_ref_count() > 0
- || mutex_own(&dict_sys.mutex));
- btr_search_drop_page_hash_index(block, false);
- }
+ BUF_PEEK_IF_IN_POOL, &mtr);
+
+ if (block && block->index) {
+ /* In all our callers, the table handle should
+ be open, or we should be in the process of
+ dropping the table (preventing eviction). */
+ DBUG_ASSERT(block->index->table->get_ref_count()
+ || dict_sys.locked());
+ btr_search_drop_page_hash_index(block, false);
}
mtr_commit(&mtr);
@@ -1462,13 +1442,12 @@ void
btr_search_build_page_hash_index(
dict_index_t* index,
buf_block_t* block,
- rw_lock_t* ahi_latch,
+ srw_spin_lock* ahi_latch,
uint16_t n_fields,
uint16_t n_bytes,
bool left_side)
{
const rec_t* rec;
- const rec_t* next_rec;
ulint fold;
ulint next_fold;
ulint n_cached;
@@ -1479,9 +1458,8 @@ btr_search_build_page_hash_index(
rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
rec_offs* offsets = offsets_;
-#ifdef MYSQL_INDEX_DISABLE_AHI
- if (index->disable_ahi) return;
-#endif
+ ut_ad(!index->table->is_temporary());
+
if (!btr_search_enabled) {
return;
}
@@ -1491,13 +1469,12 @@ btr_search_build_page_hash_index(
ut_ad(index);
ut_ad(block->page.id().space() == index->table->space_id);
ut_ad(!dict_index_is_ibuf(index));
- ut_ad(page_is_leaf(block->frame));
+ ut_ad(page_is_leaf(block->page.frame));
- ut_ad(rw_lock_own_flagged(&block->lock,
- RW_LOCK_FLAG_X | RW_LOCK_FLAG_S));
+ ut_ad(block->page.lock.have_x() || block->page.lock.have_s());
ut_ad(block->page.id().page_no() >= 3);
- rw_lock_s_lock(ahi_latch);
+ ahi_latch->rd_lock(SRW_LOCK_CALL);
const bool enabled = btr_search_enabled;
const bool rebuild = enabled && block->index
@@ -1505,7 +1482,7 @@ btr_search_build_page_hash_index(
|| block->curr_n_bytes != n_bytes
|| block->curr_left_side != left_side);
- rw_lock_s_unlock(ahi_latch);
+ ahi_latch->rd_unlock();
if (!enabled) {
return;
@@ -1536,10 +1513,11 @@ btr_search_build_page_hash_index(
}
rec = page_rec_get_next_const(page_get_infimum_rec(page));
+ if (!rec) return;
if (rec_is_metadata(rec, *index)) {
rec = page_rec_get_next_const(rec);
- if (!--n_recs) return;
+ if (!rec || !--n_recs) return;
}
/* Calculate and cache fold values and corresponding records into
@@ -1569,9 +1547,7 @@ btr_search_build_page_hash_index(
n_cached++;
}
- for (;;) {
- next_rec = page_rec_get_next_const(rec);
-
+ while (const rec_t* next_rec = page_rec_get_next_const(rec)) {
if (page_rec_is_supremum(next_rec)) {
if (!left_side) {
@@ -1611,7 +1587,7 @@ btr_search_build_page_hash_index(
btr_search_check_free_space_in_heap(index);
- rw_lock_x_lock(ahi_latch);
+ ahi_latch->wr_lock(SRW_LOCK_CALL);
if (!btr_search_enabled) {
goto exit_func;
@@ -1650,7 +1626,7 @@ btr_search_build_page_hash_index(
MONITOR_INC_VALUE(MONITOR_ADAPTIVE_HASH_ROW_ADDED, n_cached);
exit_func:
assert_block_ahi_valid(block);
- rw_lock_x_unlock(ahi_latch);
+ ahi_latch->wr_unlock();
ut_free(folds);
ut_free(recs);
@@ -1662,14 +1638,10 @@ exit_func:
/** Updates the search info.
@param[in,out] info search info
@param[in,out] cursor cursor which was just positioned */
-void
-btr_search_info_update_slow(btr_search_t* info, btr_cur_t* cursor)
+void btr_search_info_update_slow(btr_search_t *info, btr_cur_t *cursor)
{
- rw_lock_t* ahi_latch = &btr_search_sys.get_part(*cursor->index)
+ srw_spin_lock* ahi_latch = &btr_search_sys.get_part(*cursor->index())
->latch;
- ut_ad(!rw_lock_own_flagged(ahi_latch,
- RW_LOCK_FLAG_X | RW_LOCK_FLAG_S));
-
buf_block_t* block = btr_cur_get_block(cursor);
/* NOTE that the following two function calls do NOT protect
@@ -1683,7 +1655,7 @@ btr_search_info_update_slow(btr_search_t* info, btr_cur_t* cursor)
if (build_index || (cursor->flag == BTR_CUR_HASH_FAIL)) {
- btr_search_check_free_space_in_heap(cursor->index);
+ btr_search_check_free_space_in_heap(cursor->index());
}
if (cursor->flag == BTR_CUR_HASH_FAIL) {
@@ -1700,7 +1672,7 @@ btr_search_info_update_slow(btr_search_t* info, btr_cur_t* cursor)
/* Note that since we did not protect block->n_fields etc.
with any semaphore, the values can be inconsistent. We have
to check inside the function call that they make sense. */
- btr_search_build_page_hash_index(cursor->index, block,
+ btr_search_build_page_hash_index(cursor->index(), block,
ahi_latch,
block->n_fields,
block->n_bytes,
@@ -1719,8 +1691,8 @@ btr_search_move_or_delete_hash_entries(
buf_block_t* new_block,
buf_block_t* block)
{
- ut_ad(rw_lock_own(&(block->lock), RW_LOCK_X));
- ut_ad(rw_lock_own(&(new_block->lock), RW_LOCK_X));
+ ut_ad(block->page.lock.have_x());
+ ut_ad(new_block->page.lock.have_x());
if (!btr_search_enabled) {
return;
@@ -1735,7 +1707,7 @@ btr_search_move_or_delete_hash_entries(
assert_block_ahi_valid(block);
assert_block_ahi_valid(new_block);
- rw_lock_t* ahi_latch = index
+ srw_spin_lock* ahi_latch = index
? &btr_search_sys.get_part(*index)->latch
: nullptr;
@@ -1749,10 +1721,10 @@ drop_exit:
return;
}
- rw_lock_s_lock(ahi_latch);
+ ahi_latch->rd_lock(SRW_LOCK_CALL);
if (index->freed()) {
- rw_lock_s_unlock(ahi_latch);
+ ahi_latch->rd_unlock();
goto drop_exit;
}
@@ -1765,7 +1737,7 @@ drop_exit:
new_block->n_bytes = block->curr_n_bytes;
new_block->left_side = left_side;
- rw_lock_s_unlock(ahi_latch);
+ ahi_latch->rd_unlock();
ut_a(n_fields > 0 || n_bytes > 0);
@@ -1778,13 +1750,13 @@ drop_exit:
return;
}
- rw_lock_s_unlock(ahi_latch);
+ ahi_latch->rd_unlock();
}
/** Updates the page hash index when a single record is deleted from a page.
@param[in] cursor cursor which was positioned on the record to delete
using btr_cur_search_, the record is not yet deleted.*/
-void btr_search_update_hash_on_delete(btr_cur_t* cursor)
+void btr_search_update_hash_on_delete(btr_cur_t *cursor)
{
buf_block_t* block;
const rec_t* rec;
@@ -1795,9 +1767,6 @@ void btr_search_update_hash_on_delete(btr_cur_t* cursor)
rec_offs_init(offsets_);
ut_ad(page_is_leaf(btr_cur_get_page(cursor)));
-#ifdef MYSQL_INDEX_DISABLE_AHI
- if (cursor->index->disable_ahi) return;
-#endif
if (!btr_search_enabled) {
return;
@@ -1805,7 +1774,7 @@ void btr_search_update_hash_on_delete(btr_cur_t* cursor)
block = btr_cur_get_block(cursor);
- ut_ad(rw_lock_own(&(block->lock), RW_LOCK_X));
+ ut_ad(block->page.lock.have_x());
assert_block_ahi_valid(block);
index = block->index;
@@ -1815,13 +1784,15 @@ void btr_search_update_hash_on_delete(btr_cur_t* cursor)
return;
}
- if (index != cursor->index) {
+ ut_ad(!cursor->index()->table->is_temporary());
+
+ if (index != cursor->index()) {
btr_search_drop_page_hash_index(block, false);
return;
}
ut_ad(block->page.id().space() == index->table->space_id);
- ut_a(index == cursor->index);
+ ut_a(index == cursor->index());
ut_a(block->curr_n_fields > 0 || block->curr_n_bytes > 0);
ut_ad(!dict_index_is_ibuf(index));
@@ -1837,7 +1808,7 @@ void btr_search_update_hash_on_delete(btr_cur_t* cursor)
auto part = btr_search_sys.get_part(*index);
- rw_lock_x_lock(&part->latch);
+ part->latch.wr_lock(SRW_LOCK_CALL);
assert_block_ahi_valid(block);
if (block->index && btr_search_enabled) {
@@ -1853,7 +1824,7 @@ void btr_search_update_hash_on_delete(btr_cur_t* cursor)
assert_block_ahi_valid(block);
}
- rw_lock_x_unlock(&part->latch);
+ part->latch.wr_unlock();
}
/** Updates the page hash index when a single record is inserted on a page.
@@ -1861,19 +1832,15 @@ void btr_search_update_hash_on_delete(btr_cur_t* cursor)
using btr_cur_search_, and the new record has been
inserted next to the cursor.
@param[in] ahi_latch the adaptive hash index latch */
-void
-btr_search_update_hash_node_on_insert(btr_cur_t* cursor, rw_lock_t* ahi_latch)
+void btr_search_update_hash_node_on_insert(btr_cur_t *cursor,
+ srw_spin_lock *ahi_latch)
{
buf_block_t* block;
dict_index_t* index;
rec_t* rec;
- ut_ad(ahi_latch == &btr_search_sys.get_part(*cursor->index)->latch);
- ut_ad(!btr_search_own_any(RW_LOCK_S));
- ut_ad(!btr_search_own_any(RW_LOCK_X));
-#ifdef MYSQL_INDEX_DISABLE_AHI
- if (cursor->index->disable_ahi) return;
-#endif
+ ut_ad(ahi_latch == &btr_search_sys.get_part(*cursor->index())->latch);
+
if (!btr_search_enabled) {
return;
}
@@ -1882,7 +1849,7 @@ btr_search_update_hash_node_on_insert(btr_cur_t* cursor, rw_lock_t* ahi_latch)
block = btr_cur_get_block(cursor);
- ut_ad(rw_lock_own(&(block->lock), RW_LOCK_X));
+ ut_ad(block->page.lock.have_x());
index = block->index;
@@ -1891,15 +1858,17 @@ btr_search_update_hash_node_on_insert(btr_cur_t* cursor, rw_lock_t* ahi_latch)
return;
}
- if (index != cursor->index) {
- ut_ad(index->id == cursor->index->id);
+ ut_ad(!cursor->index()->table->is_temporary());
+
+ if (index != cursor->index()) {
+ ut_ad(index->id == cursor->index()->id);
btr_search_drop_page_hash_index(block, false);
return;
}
- ut_a(cursor->index == index);
+ ut_a(cursor->index() == index);
ut_ad(!dict_index_is_ibuf(index));
- rw_lock_x_lock(ahi_latch);
+ ahi_latch->wr_lock(SRW_LOCK_CALL);
if (!block->index || !btr_search_enabled) {
@@ -1912,19 +1881,22 @@ btr_search_update_hash_node_on_insert(btr_cur_t* cursor, rw_lock_t* ahi_latch)
&& (cursor->n_fields == block->curr_n_fields)
&& (cursor->n_bytes == block->curr_n_bytes)
&& !block->curr_left_side) {
-
- if (ha_search_and_update_if_found(
- &btr_search_sys.get_part(*cursor->index)->table,
- cursor->fold, rec, block,
- page_rec_get_next(rec))) {
- MONITOR_INC(MONITOR_ADAPTIVE_HASH_ROW_UPDATED);
+ if (const rec_t *new_rec = page_rec_get_next_const(rec)) {
+ if (ha_search_and_update_if_found(
+ &btr_search_sys.get_part(*cursor->index())
+ ->table,
+ cursor->fold, rec, block, new_rec)) {
+ MONITOR_INC(MONITOR_ADAPTIVE_HASH_ROW_UPDATED);
+ }
+ } else {
+ ut_ad("corrupted page" == 0);
}
func_exit:
assert_block_ahi_valid(block);
- rw_lock_x_unlock(ahi_latch);
+ ahi_latch->wr_unlock();
} else {
- rw_lock_x_unlock(ahi_latch);
+ ahi_latch->wr_unlock();
btr_search_update_hash_on_insert(cursor, ahi_latch);
}
@@ -1936,8 +1908,8 @@ func_exit:
and the new record has been inserted next
to the cursor
@param[in] ahi_latch the adaptive hash index latch */
-void
-btr_search_update_hash_on_insert(btr_cur_t* cursor, rw_lock_t* ahi_latch)
+void btr_search_update_hash_on_insert(btr_cur_t *cursor,
+ srw_spin_lock *ahi_latch)
{
buf_block_t* block;
dict_index_t* index;
@@ -1954,20 +1926,16 @@ btr_search_update_hash_on_insert(btr_cur_t* cursor, rw_lock_t* ahi_latch)
rec_offs* offsets = offsets_;
rec_offs_init(offsets_);
- ut_ad(ahi_latch == &btr_search_sys.get_part(*cursor->index)->latch);
+ ut_ad(ahi_latch == &btr_search_sys.get_part(*cursor->index())->latch);
ut_ad(page_is_leaf(btr_cur_get_page(cursor)));
- ut_ad(!btr_search_own_any(RW_LOCK_S));
- ut_ad(!btr_search_own_any(RW_LOCK_X));
-#ifdef MYSQL_INDEX_DISABLE_AHI
- if (cursor->index->disable_ahi) return;
-#endif
+
if (!btr_search_enabled) {
return;
}
block = btr_cur_get_block(cursor);
- ut_ad(rw_lock_own(&(block->lock), RW_LOCK_X));
+ ut_ad(block->page.lock.have_x());
assert_block_ahi_valid(block);
index = block->index;
@@ -1982,16 +1950,16 @@ btr_search_update_hash_on_insert(btr_cur_t* cursor, rw_lock_t* ahi_latch)
rec = btr_cur_get_rec(cursor);
-#ifdef MYSQL_INDEX_DISABLE_AHI
- ut_a(!index->disable_ahi);
-#endif
- if (index != cursor->index) {
- ut_ad(index->id == cursor->index->id);
+ ut_ad(!cursor->index()->table->is_temporary());
+
+ if (index != cursor->index()) {
+ ut_ad(index->id == cursor->index()->id);
+drop:
btr_search_drop_page_hash_index(block, false);
return;
}
- ut_a(index == cursor->index);
+ ut_a(index == cursor->index());
ut_ad(!dict_index_is_ibuf(index));
n_fields = block->curr_n_fields;
@@ -1999,7 +1967,9 @@ btr_search_update_hash_on_insert(btr_cur_t* cursor, rw_lock_t* ahi_latch)
const bool left_side = block->curr_left_side;
ins_rec = page_rec_get_next_const(rec);
+ if (UNIV_UNLIKELY(!ins_rec)) goto drop;
next_rec = page_rec_get_next_const(ins_rec);
+ if (UNIV_UNLIKELY(!next_rec)) goto drop;
offsets = rec_get_offsets(ins_rec, index, offsets,
index->n_core_fields,
@@ -2026,7 +1996,7 @@ btr_search_update_hash_on_insert(btr_cur_t* cursor, rw_lock_t* ahi_latch)
} else {
if (left_side) {
locked = true;
- rw_lock_x_lock(ahi_latch);
+ ahi_latch->wr_lock(SRW_LOCK_CALL);
if (!btr_search_enabled || !block->index) {
goto function_exit;
@@ -2045,7 +2015,7 @@ btr_search_update_hash_on_insert(btr_cur_t* cursor, rw_lock_t* ahi_latch)
if (!locked) {
locked = true;
- rw_lock_x_lock(ahi_latch);
+ ahi_latch->wr_lock(SRW_LOCK_CALL);
if (!btr_search_enabled || !block->index) {
goto function_exit;
@@ -2070,7 +2040,7 @@ check_next_rec:
if (!left_side) {
if (!locked) {
locked = true;
- rw_lock_x_lock(ahi_latch);
+ ahi_latch->wr_lock(SRW_LOCK_CALL);
if (!btr_search_enabled || !block->index) {
goto function_exit;
@@ -2090,7 +2060,7 @@ check_next_rec:
if (ins_fold != next_fold) {
if (!locked) {
locked = true;
- rw_lock_x_lock(ahi_latch);
+ ahi_latch->wr_lock(SRW_LOCK_CALL);
if (!btr_search_enabled || !block->index) {
goto function_exit;
@@ -2114,9 +2084,8 @@ function_exit:
mem_heap_free(heap);
}
if (locked) {
- rw_lock_x_unlock(ahi_latch);
+ ahi_latch->wr_unlock();
}
- ut_ad(!rw_lock_own(ahi_latch, RW_LOCK_X));
}
#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
@@ -2187,7 +2156,7 @@ btr_search_hash_table_validate(ulint hash_table_id)
mysql_mutex_unlock(&buf_pool.mutex);
btr_search_x_unlock_all();
- os_thread_yield();
+ std::this_thread::yield();
btr_search_x_lock_all();
@@ -2217,9 +2186,7 @@ btr_search_hash_table_validate(ulint hash_table_id)
= buf_pool.block_from_ahi((byte*) node->data);
index_id_t page_index_id;
- if (UNIV_LIKELY(block->page.state()
- == BUF_BLOCK_FILE_PAGE)) {
-
+ if (UNIV_LIKELY(block->page.in_file())) {
/* The space and offset are only valid
for file blocks. It is possible that
the block is being freed
@@ -2227,8 +2194,9 @@ btr_search_hash_table_validate(ulint hash_table_id)
assertion and the comment below) */
const page_id_t id(block->page.id());
if (const buf_page_t* hash_page
- = buf_pool.page_hash_get_low(
- id, id.fold())) {
+ = buf_pool.page_hash.get(
+ id, buf_pool.page_hash.cell_get(
+ id.fold()))) {
ut_ad(hash_page == &block->page);
goto state_ok;
}
@@ -2239,13 +2207,15 @@ btr_search_hash_table_validate(ulint hash_table_id)
the block from buf_pool.page_hash by calling
buf_LRU_block_remove_hashed_page(). Then it
invokes btr_search_drop_page_hash_index(). */
- ut_a(block->page.state() == BUF_BLOCK_REMOVE_HASH);
+ ut_a(block->page.state() == buf_page_t::REMOVE_HASH);
state_ok:
ut_ad(!dict_index_is_ibuf(block->index));
ut_ad(block->page.id().space()
== block->index->table->space_id);
- page_index_id = btr_page_get_index_id(block->frame);
+ const page_t* page = block->page.frame;
+
+ page_index_id = btr_page_get_index_id(page);
offsets = rec_get_offsets(
node->data, block->index, offsets,
@@ -2261,8 +2231,6 @@ state_ok:
page_index_id);
if (node->fold != fold) {
- const page_t* page = block->frame;
-
ok = FALSE;
ib::error() << "Error in an adaptive hash"
@@ -2296,7 +2264,7 @@ state_ok:
mysql_mutex_unlock(&buf_pool.mutex);
btr_search_x_unlock_all();
- os_thread_yield();
+ std::this_thread::yield();
btr_search_x_lock_all();
@@ -2354,14 +2322,14 @@ btr_search_validate()
#ifdef UNIV_DEBUG
bool btr_search_check_marked_free_index(const buf_block_t *block)
{
- const index_id_t index_id= btr_page_get_index_id(block->frame);
+ const index_id_t index_id= btr_page_get_index_id(block->page.frame);
auto part= btr_search_sys.get_part(index_id, block->page.id().space());
- rw_lock_s_lock(&part->latch);
+ part->latch.rd_lock(SRW_LOCK_CALL);
bool is_freed= block->index && block->index->freed();
- rw_lock_s_unlock(&part->latch);
+ part->latch.rd_unlock();
return is_freed;
}
diff --git a/storage/innobase/buf/buf0block_hint.cc b/storage/innobase/buf/buf0block_hint.cc
index 6d99d0b61f0..6bd01faa279 100644
--- a/storage/innobase/buf/buf0block_hint.cc
+++ b/storage/innobase/buf/buf0block_hint.cc
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 2020, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2020, MariaDB Corporation.
+Copyright (c) 2020, 2021, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License, version 2.0, as published by the
@@ -28,6 +28,7 @@ this program; if not, write to the Free Software Foundation, Inc.,
#include "buf0block_hint.h"
namespace buf {
+TRANSACTIONAL_TARGET
void Block_hint::buffer_fix_block_if_still_valid()
{
/* To check if m_block belongs to the current buf_pool, we must
@@ -42,18 +43,17 @@ void Block_hint::buffer_fix_block_if_still_valid()
different page, and that slice of buf_pool.page_hash could be protected
by another hash_lock that we are not holding.)
- Finally, assuming that we have correct hash bucket latched, we must
- validate m_block->state() to ensure that the block is not being freed. */
+ Finally, we must ensure that the block is not being freed. */
if (m_block)
{
- const ulint fold= m_page_id.fold();
- page_hash_latch *hash_lock= buf_pool.page_hash.lock<false>(fold);
+ auto &cell= buf_pool.page_hash.cell_get(m_page_id.fold());
+ transactional_shared_lock_guard<page_hash_latch> g
+ {buf_pool.page_hash.lock_get(cell)};
if (buf_pool.is_uncompressed(m_block) && m_page_id == m_block->page.id() &&
- m_block->page.state() == BUF_BLOCK_FILE_PAGE)
- buf_block_buf_fix_inc(m_block, __FILE__, __LINE__);
+ m_block->page.frame && m_block->page.in_file())
+ m_block->page.fix();
else
clear();
- hash_lock->read_unlock();
}
}
} // namespace buf
diff --git a/storage/innobase/buf/buf0buddy.cc b/storage/innobase/buf/buf0buddy.cc
index f822adc3389..85a698bc875 100644
--- a/storage/innobase/buf/buf0buddy.cc
+++ b/storage/innobase/buf/buf0buddy.cc
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 2006, 2016, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2018, 2020, MariaDB Corporation.
+Copyright (c) 2018, 2021, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -298,7 +298,7 @@ static buf_buddy_free_t* buf_buddy_alloc_zip(ulint i)
buf = UT_LIST_GET_FIRST(buf_pool.zip_free[i]);
- if (buf_pool.curr_size < buf_pool.old_size
+ if (buf_pool.is_shrinking()
&& UT_LIST_GET_LEN(buf_pool.withdraw)
< buf_pool.withdraw_target) {
@@ -354,14 +354,15 @@ buf_buddy_block_free(void* buf)
ut_a(!ut_align_offset(buf, srv_page_size));
HASH_SEARCH(hash, &buf_pool.zip_hash, fold, buf_page_t*, bpage,
- ut_ad(bpage->state() == BUF_BLOCK_MEMORY
+ ut_ad(bpage->state() == buf_page_t::MEMORY
&& bpage->in_zip_hash),
- ((buf_block_t*) bpage)->frame == buf);
+ bpage->frame == buf);
ut_a(bpage);
- ut_a(bpage->state() == BUF_BLOCK_MEMORY);
+ ut_a(bpage->state() == buf_page_t::MEMORY);
ut_ad(bpage->in_zip_hash);
ut_d(bpage->in_zip_hash = false);
HASH_DELETE(buf_page_t, hash, &buf_pool.zip_hash, fold, bpage);
+ bpage->hash = nullptr;
ut_d(memset(buf, 0, srv_page_size));
MEM_UNDEFINED(buf, srv_page_size);
@@ -382,10 +383,10 @@ buf_buddy_block_register(
buf_block_t* block) /*!< in: buffer frame to allocate */
{
const ulint fold = BUF_POOL_ZIP_FOLD(block);
- ut_ad(block->page.state() == BUF_BLOCK_MEMORY);
+ ut_ad(block->page.state() == buf_page_t::MEMORY);
- ut_a(block->frame);
- ut_a(!ut_align_offset(block->frame, srv_page_size));
+ ut_a(block->page.frame);
+ ut_a(!ut_align_offset(block->page.frame, srv_page_size));
ut_ad(!block->page.in_zip_hash);
ut_d(block->page.in_zip_hash = true);
@@ -461,8 +462,8 @@ byte *buf_buddy_alloc_low(ulint i, bool *lru)
alloc_big:
buf_buddy_block_register(block);
- block = (buf_block_t*) buf_buddy_alloc_from(
- block->frame, i, BUF_BUDDY_SIZES);
+ block = reinterpret_cast<buf_block_t*>(
+ buf_buddy_alloc_from(block->page.frame, i, BUF_BUDDY_SIZES));
func_exit:
buf_pool.buddy_stat[i].used++;
@@ -499,9 +500,10 @@ static bool buf_buddy_relocate(void* src, void* dst, ulint i, bool force)
ut_ad(space != BUF_BUDDY_STAMP_FREE);
const page_id_t page_id(space, offset);
- const ulint fold= page_id.fold();
+ /* FIXME: we are computing this while holding buf_pool.mutex */
+ auto &cell= buf_pool.page_hash.cell_get(page_id.fold());
- bpage = buf_pool.page_hash_get_low(page_id, fold);
+ bpage = buf_pool.page_hash.get(page_id, cell);
if (!bpage || bpage->zip.data != src) {
/* The block has probably been freshly
@@ -546,8 +548,11 @@ static bool buf_buddy_relocate(void* src, void* dst, ulint i, bool force)
return false;
}
- page_hash_latch *hash_lock = buf_pool.page_hash.lock_get(fold);
- hash_lock->write_lock();
+ page_hash_latch &hash_lock = buf_pool.page_hash.lock_get(cell);
+ /* It does not make sense to use transactional_lock_guard here,
+ because the memcpy() of 1024 to 16384 bytes would likely make the
+ memory transaction too large. */
+ hash_lock.lock();
if (bpage->can_relocate()) {
/* Relocate the compressed page. */
@@ -558,7 +563,7 @@ static bool buf_buddy_relocate(void* src, void* dst, ulint i, bool force)
memcpy(dst, src, size);
bpage->zip.data = reinterpret_cast<page_zip_t*>(dst);
- hash_lock->write_unlock();
+ hash_lock.unlock();
buf_buddy_mem_invalid(
reinterpret_cast<buf_buddy_free_t*>(src), i);
@@ -569,7 +574,7 @@ static bool buf_buddy_relocate(void* src, void* dst, ulint i, bool force)
return(true);
}
- hash_lock->write_unlock();
+ hash_lock.unlock();
return(false);
}
@@ -604,7 +609,7 @@ recombine:
We may waste up to 15360*max_len bytes to free blocks
(1024 + 2048 + 4096 + 8192 = 15360) */
if (UT_LIST_GET_LEN(buf_pool.zip_free[i]) < 16
- && buf_pool.curr_size >= buf_pool.old_size) {
+ && !buf_pool.is_shrinking()) {
goto func_exit;
}
@@ -689,7 +694,7 @@ buf_buddy_realloc(void* buf, ulint size)
block = reinterpret_cast<buf_block_t*>(
buf_buddy_alloc_from(
- block->frame, i, BUF_BUDDY_SIZES));
+ block->page.frame, i, BUF_BUDDY_SIZES));
}
buf_pool.buddy_stat[i].used++;
@@ -710,7 +715,7 @@ buf_buddy_realloc(void* buf, ulint size)
void buf_buddy_condense_free()
{
mysql_mutex_assert_owner(&buf_pool.mutex);
- ut_ad(buf_pool.curr_size < buf_pool.old_size);
+ ut_ad(buf_pool.is_shrinking());
for (ulint i = 0; i < UT_ARR_SIZE(buf_pool.zip_free); ++i) {
buf_buddy_free_t* buf =
diff --git a/storage/innobase/buf/buf0buf.cc b/storage/innobase/buf/buf0buf.cc
index 83eee85d749..462b1eb634a 100644
--- a/storage/innobase/buf/buf0buf.cc
+++ b/storage/innobase/buf/buf0buf.cc
@@ -48,7 +48,6 @@ Created 11/5/1995 Heikki Tuuri
#include "buf0buddy.h"
#include "buf0dblwr.h"
#include "lock0lock.h"
-#include "sync0rw.h"
#include "btr0sea.h"
#include "ibuf0ibuf.h"
#include "trx0undo.h"
@@ -64,7 +63,6 @@ Created 11/5/1995 Heikki Tuuri
#include "fil0pagecompress.h"
#endif /* !UNIV_INNOCHECKSUM */
#include "page0zip.h"
-#include "sync0sync.h"
#include "buf0dump.h"
#include <map>
#include <sstream>
@@ -222,11 +220,11 @@ buf_pool.LRU.
The chains of free memory blocks (buf_pool.zip_free[]) are used by
the buddy allocator (buf0buddy.cc) to keep track of currently unused
-memory blocks of size sizeof(buf_page_t)..srv_page_size / 2. These
-blocks are inside the srv_page_size-sized memory blocks of type
+memory blocks of size 1024..innodb_page_size / 2. These
+blocks are inside the memory blocks of size innodb_page_size and type
BUF_BLOCK_MEMORY that the buddy allocator requests from the buffer
-pool. The buddy allocator is solely used for allocating control
-blocks for compressed pages (buf_page_t) and compressed page frames.
+pool. The buddy allocator is solely used for allocating
+ROW_FORMAT=COMPRESSED page frames.
Loading a file page
-------------------
@@ -234,9 +232,9 @@ blocks for compressed pages (buf_page_t) and compressed page frames.
First, a victim block for replacement has to be found in the
buf_pool. It is taken from the free list or searched for from the
end of the LRU-list. An exclusive lock is reserved for the frame,
-the io_fix field is set in the block fixing the block in buf_pool,
+the io_fix is set in the block fixing the block in buf_pool,
and the io-operation for loading the page is queued. The io-handler thread
-releases the X-lock on the frame and resets the io_fix field
+releases the X-lock on the frame and releases the io_fix
when the io operation completes.
A thread may request the above operation using the function
@@ -280,18 +278,19 @@ the read requests for the whole area.
*/
#ifndef UNIV_INNOCHECKSUM
+# ifdef SUX_LOCK_GENERIC
void page_hash_latch::read_lock_wait()
{
/* First, try busy spinning for a while. */
for (auto spin= srv_n_spin_wait_rounds; spin--; )
{
- ut_delay(srv_spin_wait_delay);
+ LF_BACKOFF();
if (read_trylock())
return;
}
/* Fall back to yielding to other threads. */
do
- os_thread_yield();
+ std::this_thread::yield();
while (!read_trylock());
}
@@ -304,18 +303,16 @@ void page_hash_latch::write_lock_wait()
{
if (write_lock_poll())
return;
- ut_delay(srv_spin_wait_delay);
+ LF_BACKOFF();
}
/* Fall back to yielding to other threads. */
do
- os_thread_yield();
+ std::this_thread::yield();
while (!write_lock_poll());
}
+# endif
-/** Value in microseconds */
-constexpr int WAIT_FOR_READ= 100;
-constexpr int WAIT_FOR_WRITE= 100;
/** Number of attempts made to read in a page in the buffer pool */
constexpr ulint BUF_PAGE_READ_MAX_RETRIES= 100;
/** The maximum portion of the buffer pool that can be used for the
@@ -336,16 +333,13 @@ buf_pool_t::chunk_t::map *buf_pool_t::chunk_t::map_ref;
#ifdef UNIV_DEBUG
/** This is used to insert validation operations in execution
in the debug version */
-static ulint buf_dbg_counter;
+static Atomic_counter<size_t> buf_dbg_counter;
#endif /* UNIV_DEBUG */
/** Macro to determine whether the read of write counter is used depending
on the io_type */
-#define MONITOR_RW_COUNTER(io_type, counter) \
- ((io_type == BUF_IO_READ) \
- ? (counter##_READ) \
- : (counter##_WRITTEN))
-
+#define MONITOR_RW_COUNTER(read, counter) \
+ (read ? (counter##_READ) : (counter##_WRITTEN))
/** Decrypt a page for temporary tablespace.
@param[in,out] tmp_frame Temporary buffer
@@ -399,8 +393,7 @@ static bool buf_page_decrypt_after_read(buf_page_t *bpage,
ut_ad(node.space->id == bpage->id().space());
const auto flags = node.space->flags;
- byte* dst_frame = bpage->zip.data ? bpage->zip.data :
- ((buf_block_t*) bpage)->frame;
+ byte* dst_frame = bpage->zip.data ? bpage->zip.data : bpage->frame;
bool page_compressed = node.space->is_compressed()
&& buf_page_is_compressed(dst_frame, flags);
const page_id_t id(bpage->id());
@@ -410,28 +403,21 @@ static bool buf_page_decrypt_after_read(buf_page_t *bpage,
return (true);
}
- if (node.space->purpose == FIL_TYPE_TEMPORARY
+ buf_tmp_buffer_t* slot;
+
+ if (id.space() == SRV_TMP_SPACE_ID
&& innodb_encrypt_temporary_tables) {
- buf_tmp_buffer_t* slot = buf_pool.io_buf_reserve();
- ut_a(slot);
+ slot = buf_pool.io_buf_reserve();
slot->allocate();
-
- if (!buf_tmp_page_decrypt(slot->crypt_buf, dst_frame)) {
- slot->release();
- ib::error() << "Encrypted page " << id
- << " in file " << node.name;
- return false;
- }
-
+ bool ok = buf_tmp_page_decrypt(slot->crypt_buf, dst_frame);
slot->release();
- return true;
+ return ok;
}
/* Page is encrypted if encryption information is found from
tablespace and page contains used key_version. This is true
also for pages first compressed and then encrypted. */
- buf_tmp_buffer_t* slot;
uint key_version = buf_page_get_key_version(dst_frame, flags);
if (page_compressed && !key_version) {
@@ -444,17 +430,12 @@ decompress:
}
slot = buf_pool.io_buf_reserve();
- ut_a(slot);
slot->allocate();
decompress_with_slot:
- ut_d(fil_page_type_validate(node.space, dst_frame));
-
ulint write_size = fil_page_decompress(
slot->crypt_buf, dst_frame, flags);
slot->release();
- ut_ad(!write_size
- || fil_page_type_validate(node.space, dst_frame));
ut_ad(node.space->referenced());
return write_size != 0;
}
@@ -472,9 +453,7 @@ decrypt_failed:
}
slot = buf_pool.io_buf_reserve();
- ut_a(slot);
slot->allocate();
- ut_d(fil_page_type_validate(node.space, dst_frame));
/* decrypt using crypt_buf to dst_frame */
if (!fil_space_decrypt(node.space, slot->crypt_buf, dst_frame)) {
@@ -482,8 +461,6 @@ decrypt_failed:
goto decrypt_failed;
}
- ut_d(fil_page_type_validate(node.space, dst_frame));
-
if ((fil_space_t::full_crc32(flags) && page_compressed)
|| fil_page_get_type(dst_frame)
== FIL_PAGE_PAGE_COMPRESSED_ENCRYPTED) {
@@ -506,6 +483,7 @@ decrypt_failed:
@param[in] checksum_field1 new checksum field
@param[in] checksum_field2 old checksum field
@return true if the page is in crc32 checksum format. */
+static
bool
buf_page_is_checksum_valid_crc32(
const byte* read_buf,
@@ -515,8 +493,9 @@ buf_page_is_checksum_valid_crc32(
const uint32_t crc32 = buf_calc_page_crc32(read_buf);
#ifdef UNIV_INNOCHECKSUM
- if (log_file
- && srv_checksum_algorithm == SRV_CHECKSUM_ALGORITHM_STRICT_CRC32) {
+ extern FILE* log_file;
+ extern uint32_t cur_page_num;
+ if (log_file) {
fprintf(log_file, "page::" UINT32PF ";"
" crc32 calculated = " UINT32PF ";"
" recorded checksum field1 = " ULINTPF " recorded"
@@ -532,132 +511,6 @@ buf_page_is_checksum_valid_crc32(
return checksum_field1 == crc32;
}
-/** Checks if the page is in innodb checksum format.
-@param[in] read_buf database page
-@param[in] checksum_field1 new checksum field
-@param[in] checksum_field2 old checksum field
-@return true if the page is in innodb checksum format. */
-bool
-buf_page_is_checksum_valid_innodb(
- const byte* read_buf,
- ulint checksum_field1,
- ulint checksum_field2)
-{
- /* There are 2 valid formulas for
- checksum_field2 (old checksum field) which algo=innodb could have
- written to the page:
-
- 1. Very old versions of InnoDB only stored 8 byte lsn to the
- start and the end of the page.
-
- 2. Newer InnoDB versions store the old formula checksum
- (buf_calc_page_old_checksum()). */
-
- ulint old_checksum = buf_calc_page_old_checksum(read_buf);
- ulint new_checksum = buf_calc_page_new_checksum(read_buf);
-
-#ifdef UNIV_INNOCHECKSUM
- if (log_file
- && srv_checksum_algorithm == SRV_CHECKSUM_ALGORITHM_INNODB) {
- fprintf(log_file, "page::" UINT32PF ";"
- " old style: calculated ="
- " " ULINTPF "; recorded = " ULINTPF "\n",
- cur_page_num, old_checksum,
- checksum_field2);
- fprintf(log_file, "page::" UINT32PF ";"
- " new style: calculated ="
- " " ULINTPF "; crc32 = " UINT32PF "; recorded = " ULINTPF "\n",
- cur_page_num, new_checksum,
- buf_calc_page_crc32(read_buf), checksum_field1);
- }
-
- if (log_file
- && srv_checksum_algorithm == SRV_CHECKSUM_ALGORITHM_STRICT_INNODB) {
- fprintf(log_file, "page::" UINT32PF ";"
- " old style: calculated ="
- " " ULINTPF "; recorded checksum = " ULINTPF "\n",
- cur_page_num, old_checksum,
- checksum_field2);
- fprintf(log_file, "page::" UINT32PF ";"
- " new style: calculated ="
- " " ULINTPF "; recorded checksum = " ULINTPF "\n",
- cur_page_num, new_checksum,
- checksum_field1);
- }
-#endif /* UNIV_INNOCHECKSUM */
-
-
- if (checksum_field2 != mach_read_from_4(read_buf + FIL_PAGE_LSN)
- && checksum_field2 != old_checksum) {
- DBUG_LOG("checksum",
- "Page checksum crc32 not valid"
- << " field1 " << checksum_field1
- << " field2 " << checksum_field2
- << " crc32 " << buf_calc_page_old_checksum(read_buf)
- << " lsn " << mach_read_from_4(
- read_buf + FIL_PAGE_LSN));
- return(false);
- }
-
- /* old field is fine, check the new field */
-
- /* InnoDB versions < 4.0.14 and < 4.1.1 stored the space id
- (always equal to 0), to FIL_PAGE_SPACE_OR_CHKSUM */
-
- if (checksum_field1 != 0 && checksum_field1 != new_checksum) {
- DBUG_LOG("checksum",
- "Page checksum crc32 not valid"
- << " field1 " << checksum_field1
- << " field2 " << checksum_field2
- << " crc32 " << buf_calc_page_new_checksum(read_buf)
- << " lsn " << mach_read_from_4(
- read_buf + FIL_PAGE_LSN));
- return(false);
- }
-
- return(true);
-}
-
-/** Checks if the page is in none checksum format.
-@param[in] read_buf database page
-@param[in] checksum_field1 new checksum field
-@param[in] checksum_field2 old checksum field
-@return true if the page is in none checksum format. */
-bool
-buf_page_is_checksum_valid_none(
- const byte* read_buf,
- ulint checksum_field1,
- ulint checksum_field2)
-{
-#ifndef DBUG_OFF
- if (checksum_field1 != checksum_field2
- && checksum_field1 != BUF_NO_CHECKSUM_MAGIC) {
- DBUG_LOG("checksum",
- "Page checksum crc32 not valid"
- << " field1 " << checksum_field1
- << " field2 " << checksum_field2
- << " crc32 " << BUF_NO_CHECKSUM_MAGIC
- << " lsn " << mach_read_from_4(read_buf
- + FIL_PAGE_LSN));
- }
-#endif /* DBUG_OFF */
-
-#ifdef UNIV_INNOCHECKSUM
- if (log_file
- && srv_checksum_algorithm == SRV_CHECKSUM_ALGORITHM_STRICT_NONE) {
- fprintf(log_file,
- "page::" UINT32PF "; none checksum: calculated"
- " = %lu; recorded checksum_field1 = " ULINTPF
- " recorded checksum_field2 = " ULINTPF "\n",
- cur_page_num, BUF_NO_CHECKSUM_MAGIC,
- checksum_field1, checksum_field2);
- }
-#endif /* UNIV_INNOCHECKSUM */
-
- return(checksum_field1 == checksum_field2
- && checksum_field1 == BUF_NO_CHECKSUM_MAGIC);
-}
-
/** Checks whether the lsn present in the page is lesser than the
peek current lsn.
@param[in] check_lsn lsn to check
@@ -717,9 +570,6 @@ buf_page_is_corrupted(
const byte* read_buf,
ulint fsp_flags)
{
-#ifndef UNIV_INNOCHECKSUM
- DBUG_EXECUTE_IF("buf_page_import_corrupt_failure", return(true); );
-#endif
if (fil_space_t::full_crc32(fsp_flags)) {
bool compressed = false, corrupted = false;
const uint size = buf_page_full_crc32_size(
@@ -764,8 +614,6 @@ buf_page_is_corrupted(
return false;
}
- size_t checksum_field1 = 0;
- size_t checksum_field2 = 0;
const ulint zip_size = fil_space_t::zip_size(fsp_flags);
const uint16_t page_type = fil_page_get_type(read_buf);
@@ -804,21 +652,14 @@ buf_page_is_corrupted(
/* Check whether the checksum fields have correct values */
- const srv_checksum_algorithm_t curr_algo =
- static_cast<srv_checksum_algorithm_t>(srv_checksum_algorithm);
-
- if (curr_algo == SRV_CHECKSUM_ALGORITHM_NONE) {
- return(false);
- }
-
if (zip_size) {
return !page_zip_verify_checksum(read_buf, zip_size);
}
- checksum_field1 = mach_read_from_4(
+ const uint32_t checksum_field1 = mach_read_from_4(
read_buf + FIL_PAGE_SPACE_OR_CHKSUM);
- checksum_field2 = mach_read_from_4(
+ const uint32_t checksum_field2 = mach_read_from_4(
read_buf + srv_page_size - FIL_PAGE_END_LSN_OLD_CHKSUM);
static_assert(FIL_PAGE_LSN % 8 == 0, "alignment");
@@ -853,47 +694,22 @@ buf_page_is_corrupted(
}
}
- switch (curr_algo) {
+#ifndef UNIV_INNOCHECKSUM
+ switch (srv_checksum_algorithm) {
case SRV_CHECKSUM_ALGORITHM_STRICT_FULL_CRC32:
case SRV_CHECKSUM_ALGORITHM_STRICT_CRC32:
+#endif /* !UNIV_INNOCHECKSUM */
return !buf_page_is_checksum_valid_crc32(
read_buf, checksum_field1, checksum_field2);
- case SRV_CHECKSUM_ALGORITHM_STRICT_INNODB:
- return !buf_page_is_checksum_valid_innodb(
- read_buf, checksum_field1, checksum_field2);
- case SRV_CHECKSUM_ALGORITHM_STRICT_NONE:
- return !buf_page_is_checksum_valid_none(
- read_buf, checksum_field1, checksum_field2);
- case SRV_CHECKSUM_ALGORITHM_NONE:
- /* should have returned false earlier */
- break;
- case SRV_CHECKSUM_ALGORITHM_FULL_CRC32:
- case SRV_CHECKSUM_ALGORITHM_CRC32:
- case SRV_CHECKSUM_ALGORITHM_INNODB:
- const uint32_t crc32 = buf_calc_page_crc32(read_buf);
-
- if (buf_page_is_checksum_valid_none(read_buf,
- checksum_field1, checksum_field2)) {
-#ifdef UNIV_INNOCHECKSUM
- if (log_file) {
- fprintf(log_file, "page::" UINT32PF ";"
- " old style: calculated = %u;"
- " recorded = " ULINTPF ";\n",
- cur_page_num,
- buf_calc_page_old_checksum(read_buf),
- checksum_field2);
- fprintf(log_file, "page::" UINT32PF ";"
- " new style: calculated = " UINT32PF ";"
- " crc32 = " UINT32PF "; recorded = " ULINTPF ";\n",
- cur_page_num,
- buf_calc_page_new_checksum(read_buf),
- crc32,
- checksum_field1);
- }
-#endif /* UNIV_INNOCHECKSUM */
+#ifndef UNIV_INNOCHECKSUM
+ default:
+ if (checksum_field1 == BUF_NO_CHECKSUM_MAGIC
+ && checksum_field2 == BUF_NO_CHECKSUM_MAGIC) {
return false;
}
+ const uint32_t crc32 = buf_calc_page_crc32(read_buf);
+
/* Very old versions of InnoDB only stored 8 byte lsn to the
start and the end of the page. */
@@ -921,20 +737,13 @@ buf_page_is_corrupted(
switch (checksum_field1) {
case 0:
case BUF_NO_CHECKSUM_MAGIC:
- break;
- default:
- if ((checksum_field1 != crc32
- || checksum_field2 != crc32)
- && checksum_field1
- != buf_calc_page_new_checksum(read_buf)) {
- return true;
- }
+ return false;
}
-
- break;
+ return (checksum_field1 != crc32 || checksum_field2 != crc32)
+ && checksum_field1
+ != buf_calc_page_new_checksum(read_buf);
}
-
- return false;
+#endif /* !UNIV_INNOCHECKSUM */
}
#ifndef UNIV_INNOCHECKSUM
@@ -1029,28 +838,23 @@ buf_block_init(buf_block_t* block, byte* frame)
buf_pool.resize(). Either way, adaptive hash index must not exist. */
assert_block_ahi_empty_on_init(block);
- block->frame = frame;
+ block->page.frame = frame;
- block->modify_clock = 0;
- block->page.init(BUF_BLOCK_NOT_USED, page_id_t(~0ULL));
+ MEM_MAKE_DEFINED(&block->modify_clock, sizeof block->modify_clock);
+ ut_ad(!block->modify_clock);
+ MEM_MAKE_DEFINED(&block->page.lock, sizeof block->page.lock);
+ block->page.init(buf_page_t::NOT_USED, page_id_t(~0ULL));
#ifdef BTR_CUR_HASH_ADAPT
- block->index = NULL;
+ MEM_MAKE_DEFINED(&block->index, sizeof block->index);
+ ut_ad(!block->index);
#endif /* BTR_CUR_HASH_ADAPT */
ut_d(block->in_unzip_LRU_list = false);
ut_d(block->in_withdraw_list = false);
page_zip_des_init(&block->page.zip);
- ut_d(block->debug_latch = (rw_lock_t *) ut_malloc_nokey(sizeof(rw_lock_t)));
-
- rw_lock_create(PFS_NOT_INSTRUMENTED, &block->lock, SYNC_LEVEL_VARYING);
-
- ut_d(rw_lock_create(PFS_NOT_INSTRUMENTED, block->debug_latch,
- SYNC_LEVEL_VARYING));
-
- block->lock.is_block_lock = 1;
-
- ut_ad(rw_lock_validate(&(block->lock)));
+ MEM_MAKE_DEFINED(&block->page.hash, sizeof block->page.hash);
+ ut_ad(!block->page.hash);
}
/** Allocate a chunk of buffer frames.
@@ -1123,7 +927,7 @@ inline bool buf_pool_t::chunk_t::create(size_t bytes)
for (auto i= size; i--; ) {
buf_block_init(block, frame);
- MEM_UNDEFINED(block->frame, srv_page_size);
+ MEM_UNDEFINED(block->page.frame, srv_page_size);
/* Add the block to the free list */
UT_LIST_ADD_LAST(buf_pool.free, &block->page);
@@ -1146,18 +950,11 @@ inline const buf_block_t *buf_pool_t::chunk_t::not_freed() const
buf_block_t *block= blocks;
for (auto i= size; i--; block++)
{
- switch (block->page.state()) {
- case BUF_BLOCK_ZIP_PAGE:
+ if (block->page.in_file())
+ {
/* The uncompressed buffer pool should never
contain ROW_FORMAT=COMPRESSED block descriptors. */
- ut_error;
- break;
- case BUF_BLOCK_NOT_USED:
- case BUF_BLOCK_MEMORY:
- case BUF_BLOCK_REMOVE_HASH:
- /* Skip blocks that are not being used for file pages. */
- break;
- case BUF_BLOCK_FILE_PAGE:
+ ut_ad(block->page.frame);
const lsn_t lsn= block->page.oldest_modification();
if (srv_read_only_mode)
@@ -1166,8 +963,6 @@ inline const buf_block_t *buf_pool_t::chunk_t::not_freed() const
can be dirtied, so all of them must be clean. */
ut_ad(lsn == 0 || lsn == recv_sys.recovered_lsn ||
srv_force_recovery == SRV_FORCE_NO_LOG_REDO);
- ut_ad(!block->page.buf_fix_count());
- ut_ad(block->page.io_fix() == BUF_IO_NONE);
break;
}
@@ -1188,24 +983,16 @@ inline const buf_block_t *buf_pool_t::chunk_t::not_freed() const
}
#endif /* UNIV_DEBUG */
-/** Free the synchronization objects of a buffer pool block descriptor
-@param[in,out] block buffer pool block descriptor */
-static void buf_block_free_mutexes(buf_block_t* block)
-{
- rw_lock_free(&block->lock);
- ut_d(rw_lock_free(block->debug_latch));
- ut_d(ut_free(block->debug_latch));
-}
-
/** Create the hash table.
@param n the lower bound of n_cells */
void buf_pool_t::page_hash_table::create(ulint n)
{
n_cells= ut_find_prime(n);
- const size_t size= pad(n_cells) * sizeof *array;
- void* v= aligned_malloc(size, CPU_LEVEL1_DCACHE_LINESIZE);
- memset(v, 0, size);
- array= static_cast<hash_cell_t*>(v);
+ const size_t size= MY_ALIGN(pad(n_cells) * sizeof *array,
+ CPU_LEVEL1_DCACHE_LINESIZE);
+ void *v= aligned_malloc(size, CPU_LEVEL1_DCACHE_LINESIZE);
+ memset_aligned<CPU_LEVEL1_DCACHE_LINESIZE>(v, 0, size);
+ array= static_cast<hash_chain*>(v);
}
/** Create the buffer pool.
@@ -1253,7 +1040,7 @@ bool buf_pool_t::create()
buf_block_t* block= chunk->blocks;
for (auto i= chunk->size; i--; block++)
- buf_block_free_mutexes(block);
+ block->page.lock.free();
allocator.deallocate_large_dodump(chunk->mem, &chunk->mem_pfx);
}
@@ -1283,7 +1070,6 @@ bool buf_pool_t::create()
for (size_t i= 0; i < UT_ARR_SIZE(zip_free); ++i)
UT_LIST_INIT(zip_free[i], &buf_buddy_free_t::list);
ulint s= curr_size;
- old_size= s;
s/= BUF_READ_AHEAD_PORTION;
read_ahead_area= s >= READ_AHEAD_PAGES
? READ_AHEAD_PAGES
@@ -1351,8 +1137,11 @@ void buf_pool_t::close()
? (oldest == 0 || oldest == 2)
: oldest <= 1 || srv_is_being_started || srv_fast_shutdown == 2);
- if (bpage->state() != BUF_BLOCK_FILE_PAGE)
- buf_page_free_descriptor(bpage);
+ if (UNIV_UNLIKELY(!bpage->frame))
+ {
+ bpage->lock.free();
+ ut_free(bpage);
+ }
}
for (auto chunk= chunks + n_chunks; --chunk >= chunks; )
@@ -1360,7 +1149,7 @@ void buf_pool_t::close()
buf_block_t *block= chunk->blocks;
for (auto i= chunk->size; i--; block++)
- buf_block_free_mutexes(block);
+ block->page.lock.free();
allocator.deallocate_large_dodump(chunk->mem, &chunk->mem_pfx);
}
@@ -1390,7 +1179,8 @@ inline bool buf_pool_t::realloc(buf_block_t *block)
buf_block_t* new_block;
mysql_mutex_assert_owner(&mutex);
- ut_ad(block->page.state() == BUF_BLOCK_FILE_PAGE);
+ ut_ad(block->page.in_file());
+ ut_ad(block->page.frame);
new_block = buf_LRU_get_free_only();
@@ -1401,15 +1191,24 @@ inline bool buf_pool_t::realloc(buf_block_t *block)
return(false); /* free list was not enough */
}
- const page_id_t id(block->page.id());
- page_hash_latch* hash_lock = hash_lock_get(id);
- hash_lock->write_lock();
+ const page_id_t id{block->page.id()};
+ hash_chain& chain = page_hash.cell_get(id.fold());
+ page_hash_latch& hash_lock = page_hash.lock_get(chain);
+ /* It does not make sense to use transactional_lock_guard
+ here, because copying innodb_page_size (4096 to 65536) bytes
+ as well as other changes would likely make the memory
+ transaction too large. */
+ hash_lock.lock();
if (block->page.can_relocate()) {
memcpy_aligned<OS_FILE_LOG_BLOCK_SIZE>(
- new_block->frame, block->frame, srv_page_size);
+ new_block->page.frame, block->page.frame,
+ srv_page_size);
mysql_mutex_lock(&buf_pool.flush_list_mutex);
+ const auto frame = new_block->page.frame;
+ new_block->page.lock.free();
new (&new_block->page) buf_page_t(block->page);
+ new_block->page.frame = frame;
/* relocate LRU list */
if (buf_page_t* prev_b = buf_pool.LRU_remove(&block->page)) {
@@ -1447,23 +1246,20 @@ inline bool buf_pool_t::realloc(buf_block_t *block)
}
/* relocate page_hash */
- ut_ad(block->page.in_page_hash);
- ut_ad(new_block->page.in_page_hash);
- const ulint fold = id.fold();
- ut_ad(&block->page == page_hash_get_low(id, fold));
- ut_d(block->page.in_page_hash = false);
- HASH_REPLACE(buf_page_t, hash, &page_hash, fold,
- &block->page, &new_block->page);
-
+ hash_chain& chain = page_hash.cell_get(id.fold());
+ ut_ad(&block->page == page_hash.get(id, chain));
+ buf_pool.page_hash.replace(chain, &block->page,
+ &new_block->page);
buf_block_modify_clock_inc(block);
static_assert(FIL_PAGE_OFFSET % 4 == 0, "alignment");
- memset_aligned<4>(block->frame + FIL_PAGE_OFFSET, 0xff, 4);
+ memset_aligned<4>(block->page.frame
+ + FIL_PAGE_OFFSET, 0xff, 4);
static_assert(FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID % 4 == 2,
"not perfect alignment");
- memset_aligned<2>(block->frame
+ memset_aligned<2>(block->page.frame
+ FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, 0xff, 4);
- MEM_UNDEFINED(block->frame, srv_page_size);
- block->page.set_state(BUF_BLOCK_REMOVE_HASH);
+ MEM_UNDEFINED(block->page.frame, srv_page_size);
+ block->page.set_state(buf_page_t::REMOVE_HASH);
if (!fsp_is_system_temporary(id.space())) {
buf_flush_relocate_on_flush_list(&block->page,
&new_block->page);
@@ -1484,16 +1280,51 @@ inline bool buf_pool_t::realloc(buf_block_t *block)
new_block->n_fields = 1;
new_block->left_side = TRUE;
#endif /* BTR_CUR_HASH_ADAPT */
- ut_d(block->page.set_state(BUF_BLOCK_MEMORY));
+ ut_d(block->page.set_state(buf_page_t::MEMORY));
/* free block */
new_block = block;
}
- hash_lock->write_unlock();
+ hash_lock.unlock();
buf_LRU_block_free_non_file_page(new_block);
return(true); /* free_list was enough */
}
+void buf_pool_t::io_buf_t::create(ulint n_slots)
+{
+ this->n_slots= n_slots;
+ slots= static_cast<buf_tmp_buffer_t*>
+ (ut_malloc_nokey(n_slots * sizeof *slots));
+ memset((void*) slots, 0, n_slots * sizeof *slots);
+}
+
+void buf_pool_t::io_buf_t::close()
+{
+ for (buf_tmp_buffer_t *s= slots, *e= slots + n_slots; s != e; s++)
+ {
+ aligned_free(s->crypt_buf);
+ aligned_free(s->comp_buf);
+ }
+ ut_free(slots);
+ slots= nullptr;
+ n_slots= 0;
+}
+
+buf_tmp_buffer_t *buf_pool_t::io_buf_t::reserve()
+{
+ for (;;)
+ {
+ for (buf_tmp_buffer_t *s= slots, *e= slots + n_slots; s != e; s++)
+ if (s->acquire())
+ return s;
+ os_aio_wait_until_no_pending_writes();
+ for (buf_tmp_buffer_t *s= slots, *e= slots + n_slots; s != e; s++)
+ if (s->acquire())
+ return s;
+ os_aio_wait_until_no_pending_reads();
+ }
+}
+
/** Sets the global variable that feeds MySQL's innodb_buffer_pool_resize_status
to the specified string. The format and the following parameters are the
same as the ones used for printf(3).
@@ -1529,17 +1360,13 @@ inline bool buf_pool_t::withdraw_blocks()
ib::info() << "start to withdraw the last "
<< withdraw_target << " blocks";
- /* Minimize zip_free[i] lists */
- mysql_mutex_lock(&mutex);
- buf_buddy_condense_free();
- mysql_mutex_unlock(&mutex);
-
while (UT_LIST_GET_LEN(withdraw) < withdraw_target) {
/* try to withdraw from free_list */
ulint count1 = 0;
mysql_mutex_lock(&mutex);
+ buf_buddy_condense_free();
block = reinterpret_cast<buf_block_t*>(
UT_LIST_GET_FIRST(free));
while (block != NULL
@@ -1564,68 +1391,57 @@ inline bool buf_pool_t::withdraw_blocks()
block = next_block;
}
- mysql_mutex_unlock(&mutex);
/* reserve free_list length */
if (UT_LIST_GET_LEN(withdraw) < withdraw_target) {
- ulint n_flushed = buf_flush_LRU(
+ buf_flush_LRU(
std::max<ulint>(withdraw_target
- UT_LIST_GET_LEN(withdraw),
- srv_LRU_scan_depth));
- buf_flush_wait_batch_end_acquiring_mutex(true);
-
- if (n_flushed) {
- MONITOR_INC_VALUE_CUMULATIVE(
- MONITOR_LRU_BATCH_FLUSH_TOTAL_PAGE,
- MONITOR_LRU_BATCH_FLUSH_COUNT,
- MONITOR_LRU_BATCH_FLUSH_PAGES,
- n_flushed);
- }
+ srv_LRU_scan_depth),
+ true);
+ mysql_mutex_unlock(&buf_pool.mutex);
+ buf_dblwr.flush_buffered_writes();
+ mysql_mutex_lock(&buf_pool.flush_list_mutex);
+ buf_flush_wait_LRU_batch_end();
+ mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+ mysql_mutex_lock(&buf_pool.mutex);
}
/* relocate blocks/buddies in withdrawn area */
ulint count2 = 0;
- mysql_mutex_lock(&mutex);
- buf_page_t* bpage;
- bpage = UT_LIST_GET_FIRST(LRU);
- while (bpage != NULL) {
- buf_page_t* next_bpage = UT_LIST_GET_NEXT(LRU, bpage);
- if (bpage->zip.data != NULL
+ buf_pool_mutex_exit_forbid();
+ for (buf_page_t* bpage = UT_LIST_GET_FIRST(LRU), *next_bpage;
+ bpage; bpage = next_bpage) {
+ ut_ad(bpage->in_file());
+ next_bpage = UT_LIST_GET_NEXT(LRU, bpage);
+ if (UNIV_LIKELY_NULL(bpage->zip.data)
&& will_be_withdrawn(bpage->zip.data)
&& bpage->can_relocate()) {
- buf_pool_mutex_exit_forbid();
if (!buf_buddy_realloc(
bpage->zip.data,
page_zip_get_size(&bpage->zip))) {
/* failed to allocate block */
- buf_pool_mutex_exit_allow();
break;
}
- buf_pool_mutex_exit_allow();
count2++;
+ if (bpage->frame) {
+ goto realloc_frame;
+ }
}
- if (bpage->state() == BUF_BLOCK_FILE_PAGE
- && will_be_withdrawn(*bpage)) {
- if (bpage->can_relocate()) {
- buf_pool_mutex_exit_forbid();
- if (!realloc(
- reinterpret_cast<buf_block_t*>(
- bpage))) {
- /* failed to allocate block */
- buf_pool_mutex_exit_allow();
- break;
- }
- buf_pool_mutex_exit_allow();
- count2++;
+ if (bpage->frame && will_be_withdrawn(*bpage)
+ && bpage->can_relocate()) {
+realloc_frame:
+ if (!realloc(reinterpret_cast<buf_block_t*>(
+ bpage))) {
+ /* failed to allocate block */
+ break;
}
- /* NOTE: if the page is in use,
- not relocated yet */
+ count2++;
}
-
- bpage = next_bpage;
}
+ buf_pool_mutex_exit_allow();
mysql_mutex_unlock(&mutex);
buf_resize_status(
@@ -1655,7 +1471,7 @@ inline bool buf_pool_t::withdraw_blocks()
* const echunk = chunks + n_chunks; chunk != echunk; chunk++) {
block = chunk->blocks;
for (ulint j = chunk->size; j--; block++) {
- ut_a(block->page.state() == BUF_BLOCK_NOT_USED);
+ ut_a(block->page.state() == buf_page_t::NOT_USED);
ut_ad(block->in_withdraw_list);
}
}
@@ -1672,7 +1488,7 @@ inline void buf_pool_t::page_hash_table::write_lock_all()
{
for (auto n= pad(n_cells) & ~ELEMENTS_PER_LATCH;; n-= ELEMENTS_PER_LATCH + 1)
{
- reinterpret_cast<page_hash_latch&>(array[n]).write_lock();
+ reinterpret_cast<page_hash_latch&>(array[n]).lock();
if (!n)
break;
}
@@ -1683,7 +1499,7 @@ inline void buf_pool_t::page_hash_table::write_unlock_all()
{
for (auto n= pad(n_cells) & ~ELEMENTS_PER_LATCH;; n-= ELEMENTS_PER_LATCH + 1)
{
- reinterpret_cast<page_hash_latch&>(array[n]).write_unlock();
+ reinterpret_cast<page_hash_latch&>(array[n]).unlock();
if (!n)
break;
}
@@ -1701,7 +1517,7 @@ struct find_interesting_trx
return;
if (trx.mysql_thd == nullptr)
return;
- if (withdraw_started <= trx.start_time)
+ if (withdraw_started <= trx.start_time_micro)
return;
if (!found)
@@ -1719,8 +1535,9 @@ struct find_interesting_trx
}
bool &found;
- time_t withdraw_started;
- time_t current_time;
+ /** microsecond_interval_timer() */
+ const ulonglong withdraw_started;
+ const my_hrtime_t current_time;
};
} // namespace
@@ -1744,28 +1561,13 @@ inline void buf_pool_t::resize()
srv_buf_pool_old_size, srv_buf_pool_size,
srv_buf_pool_chunk_unit);
- mysql_mutex_lock(&mutex);
- ut_ad(curr_size == old_size);
- ut_ad(n_chunks_new == n_chunks);
- ut_ad(UT_LIST_GET_LEN(withdraw) == 0);
-
- n_chunks_new = (new_instance_size << srv_page_size_shift)
- / srv_buf_pool_chunk_unit;
- curr_size = n_chunks_new * chunks->size;
- mysql_mutex_unlock(&mutex);
-
#ifdef BTR_CUR_HASH_ADAPT
/* disable AHI if needed */
- const bool btr_search_disabled = btr_search_enabled;
-
buf_resize_status("Disabling adaptive hash index.");
btr_search_s_lock_all();
- if (btr_search_disabled) {
- btr_search_s_unlock_all();
- } else {
- btr_search_s_unlock_all();
- }
+ const bool btr_search_disabled = btr_search_enabled;
+ btr_search_s_unlock_all();
btr_search_disable();
@@ -1774,7 +1576,16 @@ inline void buf_pool_t::resize()
}
#endif /* BTR_CUR_HASH_ADAPT */
- if (curr_size < old_size) {
+ mysql_mutex_lock(&mutex);
+ ut_ad(n_chunks_new == n_chunks);
+ ut_ad(UT_LIST_GET_LEN(withdraw) == 0);
+
+ n_chunks_new = (new_instance_size << srv_page_size_shift)
+ / srv_buf_pool_chunk_unit;
+ curr_size = n_chunks_new * chunks->size;
+ mysql_mutex_unlock(&mutex);
+
+ if (is_shrinking()) {
/* set withdraw target */
size_t w = 0;
@@ -1789,13 +1600,13 @@ inline void buf_pool_t::resize()
buf_resize_status("Withdrawing blocks to be shrunken.");
- time_t withdraw_started = time(NULL);
- double message_interval = 60;
+ ulonglong withdraw_started = microsecond_interval_timer();
+ ulonglong message_interval = 60ULL * 1000 * 1000;
ulint retry_interval = 1;
withdraw_retry:
/* wait for the number of blocks fit to the new size (if needed)*/
- bool should_retry_withdraw = curr_size < old_size
+ bool should_retry_withdraw = is_shrinking()
&& withdraw_blocks();
if (srv_shutdown_state != SRV_SHUTDOWN_NONE) {
@@ -1806,30 +1617,33 @@ withdraw_retry:
/* abort buffer pool load */
buf_load_abort();
- const time_t current_time = time(NULL);
+ const ulonglong current_time = microsecond_interval_timer();
if (should_retry_withdraw
- && difftime(current_time, withdraw_started) >= message_interval) {
+ && current_time - withdraw_started >= message_interval) {
- if (message_interval > 900) {
- message_interval = 1800;
+ if (message_interval > 900000000) {
+ message_interval = 1800000000;
} else {
message_interval *= 2;
}
- lock_mutex_enter();
- bool found = false;
- trx_sys.trx_list.for_each(find_interesting_trx{
- found, withdraw_started, current_time});
- lock_mutex_exit();
-
+ bool found= false;
+ find_interesting_trx f
+ {found, withdraw_started, my_hrtime_coarse()};
withdraw_started = current_time;
+
+ /* This is going to exceed the maximum size of a
+ memory transaction. */
+ LockMutexGuard g{SRW_LOCK_CALL};
+ trx_sys.trx_list.for_each(f);
}
if (should_retry_withdraw) {
ib::info() << "Will retry to withdraw " << retry_interval
<< " seconds later.";
- os_thread_sleep(retry_interval * 1000000);
+ std::this_thread::sleep_for(
+ std::chrono::seconds(retry_interval));
if (retry_interval > 5) {
retry_interval = 10;
@@ -1850,7 +1664,9 @@ withdraw_retry:
should_wait = false;
DBUG_EXECUTE_IF(
"ib_buf_pool_resize_wait_before_resize",
- should_wait = true; os_thread_sleep(10000););
+ should_wait = true;
+ std::this_thread::sleep_for(
+ std::chrono::milliseconds(10)););
}
}
#endif /* !DBUG_OFF */
@@ -1873,7 +1689,7 @@ withdraw_retry:
ULINTPF " to " ULINTPF ".",
n_chunks, n_chunks_new);
- if (n_chunks_new < n_chunks) {
+ if (is_shrinking()) {
/* delete chunks */
chunk_t* chunk = chunks + n_chunks_new;
const chunk_t* const echunk = chunks + n_chunks;
@@ -1897,7 +1713,7 @@ withdraw_retry:
buf_block_t* block = chunk->blocks;
for (ulint j = chunk->size; j--; block++) {
- buf_block_free_mutexes(block);
+ block->page.lock.free();
}
allocator.deallocate_large_dodump(
@@ -1937,8 +1753,7 @@ withdraw_retry:
goto calc_buf_pool_size;
}
- ulint n_chunks_copy = ut_min(n_chunks_new,
- n_chunks);
+ ulint n_chunks_copy = ut_min(n_chunks_new, n_chunks);
memcpy(new_chunks, chunks,
n_chunks_copy * sizeof *new_chunks);
@@ -2005,14 +1820,14 @@ calc_buf_pool_size:
/* set size */
ut_ad(UT_LIST_GET_LEN(withdraw) == 0);
ulint s= curr_size;
- old_size= s;
s/= BUF_READ_AHEAD_PORTION;
read_ahead_area= s >= READ_AHEAD_PAGES
? READ_AHEAD_PAGES
: my_round_up_to_next_power(static_cast<uint32_t>(s));
curr_pool_size= n_chunks * srv_buf_pool_chunk_unit;
srv_buf_pool_curr_size= curr_pool_size;/* FIXME: remove*/
- innodb_set_buf_pool_size(buf_pool_size_align(srv_buf_pool_curr_size));
+ extern ulonglong innobase_buffer_pool_size;
+ innobase_buffer_pool_size= buf_pool_size_align(srv_buf_pool_curr_size);
const bool new_size_too_diff
= srv_buf_pool_base_size > srv_buf_pool_size * 2
@@ -2118,22 +1933,28 @@ void buf_resize_shutdown()
/** Relocate a ROW_FORMAT=COMPRESSED block in the LRU list and
buf_pool.page_hash.
The caller must relocate bpage->list.
-@param bpage BUF_BLOCK_ZIP_PAGE block
+@param bpage ROW_FORMAT=COMPRESSED only block
@param dpage destination control block */
static void buf_relocate(buf_page_t *bpage, buf_page_t *dpage)
{
- const ulint fold= bpage->id().fold();
- ut_ad(bpage->state() == BUF_BLOCK_ZIP_PAGE);
+ const page_id_t id{bpage->id()};
+ buf_pool_t::hash_chain &chain= buf_pool.page_hash.cell_get(id.fold());
+ ut_ad(!bpage->frame);
mysql_mutex_assert_owner(&buf_pool.mutex);
- ut_ad(buf_pool.hash_lock_get(bpage->id())->is_write_locked());
- ut_a(bpage->io_fix() == BUF_IO_NONE);
- ut_a(!bpage->buf_fix_count());
- ut_ad(bpage == buf_pool.page_hash_get_low(bpage->id(), fold));
+ ut_ad(buf_pool.page_hash.lock_get(chain).is_write_locked());
+ ut_ad(bpage == buf_pool.page_hash.get(id, chain));
ut_ad(!buf_pool.watch_is_sentinel(*bpage));
- ut_ad(bpage->state() == BUF_BLOCK_ZIP_PAGE);
+ ut_d(const auto state= bpage->state());
+ ut_ad(state >= buf_page_t::FREED);
+ ut_ad(state <= buf_page_t::READ_FIX);
+ ut_ad(bpage->lock.is_write_locked());
+ const auto frame= dpage->frame;
+ dpage->lock.free();
new (dpage) buf_page_t(*bpage);
+ dpage->frame= frame;
+
/* Important that we adjust the hazard pointer before
removing bpage from LRU list. */
if (buf_page_t *b= buf_pool.LRU_remove(bpage))
@@ -2163,39 +1984,28 @@ static void buf_relocate(buf_page_t *bpage, buf_page_t *dpage)
ut_d(CheckInLRUList::validate());
- /* relocate buf_pool.page_hash */
- ut_ad(bpage->in_page_hash);
- ut_ad(dpage->in_page_hash);
- ut_d(bpage->in_page_hash= false);
- HASH_REPLACE(buf_page_t, hash, &buf_pool.page_hash, fold, bpage, dpage);
+ buf_pool.page_hash.replace(chain, bpage, dpage);
}
-/** Register a watch for a page identifier. The caller must hold an
-exclusive page hash latch. The *hash_lock may be released,
-relocated, and reacquired.
-@param id page identifier
-@param hash_lock exclusively held page_hash latch
-@return a buffer pool block corresponding to id
-@retval nullptr if the block was not present, and a watch was installed */
-inline buf_page_t *buf_pool_t::watch_set(const page_id_t id,
- page_hash_latch **hash_lock)
+buf_page_t *buf_pool_t::watch_set(const page_id_t id,
+ buf_pool_t::hash_chain &chain)
{
- const ulint fold= id.fold();
- ut_ad(*hash_lock == page_hash.lock_get(fold));
- ut_ad((*hash_lock)->is_write_locked());
+ ut_ad(&chain == &page_hash.cell_get(id.fold()));
+ page_hash.lock_get(chain).lock();
-retry:
- if (buf_page_t *bpage= page_hash_get_low(id, fold))
+ buf_page_t *bpage= page_hash.get(id, chain);
+
+ if (bpage)
{
- if (!watch_is_sentinel(*bpage))
- /* The page was loaded meanwhile. */
- return bpage;
- /* Add to an existing watch. */
+got_block:
bpage->fix();
- return nullptr;
+ if (watch_is_sentinel(*bpage))
+ bpage= nullptr;
+ page_hash.lock_get(chain).unlock();
+ return bpage;
}
- (*hash_lock)->write_unlock();
+ page_hash.lock_get(chain).unlock();
/* Allocate a watch[] and then try to insert it into the page_hash. */
mysql_mutex_lock(&mutex);
@@ -2208,93 +2018,102 @@ retry:
ut_ad(!w->oldest_modification());
ut_ad(!w->zip.data);
ut_ad(!w->in_zip_hash);
- if (w->state() == BUF_BLOCK_ZIP_PAGE)
+ static_assert(buf_page_t::NOT_USED == 0, "efficiency");
+ if (ut_d(auto s=) w->state())
+ {
/* This watch may be in use for some other page. */
+ ut_ad(s >= buf_page_t::UNFIXED);
continue;
- ut_ad(w->state() == BUF_BLOCK_NOT_USED);
- ut_ad(!w->buf_fix_count());
+ }
/* w is pointing to watch[], which is protected by mutex.
Normally, buf_page_t::id for objects that are reachable by
- page_hash_get_low(id, fold) are protected by hash_lock. */
- w->set_state(BUF_BLOCK_ZIP_PAGE);
+ page_hash.get(id, chain) are protected by hash_lock. */
+ w->set_state(buf_page_t::UNFIXED + 1);
w->id_= id;
- *hash_lock= page_hash.lock_get(fold);
-
- buf_page_t *bpage= page_hash_get_low(id, fold);
+ page_hash.lock_get(chain).lock();
+ bpage= page_hash.get(id, chain);
if (UNIV_LIKELY_NULL(bpage))
{
- w->set_state(BUF_BLOCK_NOT_USED);
- *hash_lock= page_hash.lock_get(fold);
- (*hash_lock)->write_lock();
+ w->set_state(buf_page_t::NOT_USED);
mysql_mutex_unlock(&mutex);
- goto retry;
+ goto got_block;
}
- (*hash_lock)->write_lock();
- ut_ad(!w->buf_fix_count_);
- w->buf_fix_count_= 1;
- ut_ad(!w->in_page_hash);
- ut_d(w->in_page_hash= true);
- HASH_INSERT(buf_page_t, hash, &page_hash, fold, w);
+ ut_ad(w->state() == buf_page_t::UNFIXED + 1);
+ buf_pool.page_hash.append(chain, w);
mysql_mutex_unlock(&mutex);
+ page_hash.lock_get(chain).unlock();
return nullptr;
}
ut_error;
- mysql_mutex_unlock(&mutex);
- return nullptr;
}
/** Stop watching whether a page has been read in.
watch_set(id) must have returned nullptr before.
-@param id page identifier */
-void buf_pool_t::watch_unset(const page_id_t id)
+@param id page identifier
+@param chain unlocked hash table chain */
+TRANSACTIONAL_TARGET
+void buf_pool_t::watch_unset(const page_id_t id, buf_pool_t::hash_chain &chain)
{
mysql_mutex_assert_not_owner(&mutex);
- const ulint fold= id.fold();
- page_hash_latch *hash_lock= page_hash.lock<true>(fold);
- /* The page must exist because watch_set() increments buf_fix_count. */
- buf_page_t *w= page_hash_get_low(id, fold);
- ut_ad(w->in_page_hash);
- const bool must_remove= watch_is_sentinel(*w) && w->buf_fix_count() == 1;
- if (!must_remove)
- w->unfix();
- hash_lock->write_unlock();
-
- if (must_remove)
+ buf_page_t *w;
{
- const auto old= w;
- /* The following is based on buf_pool_t::watch_remove(). */
- mysql_mutex_lock(&mutex);
- w= page_hash_get_low(id, fold);
- page_hash_latch *hash_lock= buf_pool.page_hash.lock_get(fold);
- hash_lock->write_lock();
- if (w->unfix() == 0 && w == old)
+ transactional_lock_guard<page_hash_latch> g{page_hash.lock_get(chain)};
+ /* The page must exist because watch_set() did fix(). */
+ w= page_hash.get(id, chain);
+ ut_ad(w->in_page_hash);
+ if (!watch_is_sentinel(*w))
{
- ut_ad(w->in_page_hash);
- ut_d(w->in_page_hash= false);
- HASH_DELETE(buf_page_t, hash, &page_hash, fold, w);
- // Now that the watch is detached from page_hash, release it to watch[].
+ no_watch:
+ w->unfix();
+ w= nullptr;
+ }
+ else
+ {
+ const auto state= w->state();
+ ut_ad(~buf_page_t::LRU_MASK & state);
+ ut_ad(state >= buf_page_t::UNFIXED + 1);
+ if (state != buf_page_t::UNFIXED + 1)
+ goto no_watch;
+ }
+ }
+
+ if (!w)
+ return;
+
+ const auto old= w;
+ /* The following is based on buf_pool_t::watch_remove(). */
+ mysql_mutex_lock(&mutex);
+ w= page_hash.get(id, chain);
+
+ {
+ transactional_lock_guard<page_hash_latch> g
+ {buf_pool.page_hash.lock_get(chain)};
+ auto f= w->unfix();
+ ut_ad(f < buf_page_t::READ_FIX || w != old);
+
+ if (f == buf_page_t::UNFIXED && w == old)
+ {
+ page_hash.remove(chain, w);
+ // Now that w is detached from page_hash, release it to watch[].
ut_ad(w->id_ == id);
- ut_ad(!w->buf_fix_count());
- ut_ad(w->state() == BUF_BLOCK_ZIP_PAGE);
- w->set_state(BUF_BLOCK_NOT_USED);
+ ut_ad(!w->frame);
+ ut_ad(!w->zip.data);
+ w->set_state(buf_page_t::NOT_USED);
}
- hash_lock->write_unlock();
- mysql_mutex_unlock(&mutex);
}
+
+ mysql_mutex_unlock(&mutex);
}
-/** Mark the page status as FREED for the given tablespace id and
-page number. If the page is not in buffer pool then ignore it.
+/** Mark the page status as FREED for the given tablespace and page number.
@param[in,out] space tablespace
@param[in] page page number
-@param[in,out] mtr mini-transaction
-@param[in] file file name
-@param[in] line line where called */
-void buf_page_free(fil_space_t *space, uint32_t page, mtr_t *mtr,
- const char *file, unsigned line)
+@param[in,out] mtr mini-transaction */
+TRANSACTIONAL_TARGET
+void buf_page_free(fil_space_t *space, uint32_t page, mtr_t *mtr)
{
ut_ad(mtr);
ut_ad(mtr->is_active());
@@ -2306,124 +2125,138 @@ void buf_page_free(fil_space_t *space, uint32_t page, mtr_t *mtr,
)
mtr->add_freed_offset(space, page);
- buf_pool.stat.n_page_gets++;
+ ++buf_pool.stat.n_page_gets;
const page_id_t page_id(space->id, page);
- const ulint fold= page_id.fold();
- page_hash_latch *hash_lock= buf_pool.page_hash.lock<false>(fold);
- if (buf_block_t *block= reinterpret_cast<buf_block_t*>
- (buf_pool.page_hash_get_low(page_id, fold)))
+ buf_pool_t::hash_chain &chain= buf_pool.page_hash.cell_get(page_id.fold());
+ uint32_t fix;
+ buf_block_t *block;
{
- if (block->page.state() != BUF_BLOCK_FILE_PAGE)
- /* FIXME: convert, but avoid buf_zip_decompress() */;
- else
- {
- buf_block_buf_fix_inc(block, file, line);
- ut_ad(block->page.buf_fix_count());
- hash_lock->read_unlock();
+ transactional_shared_lock_guard<page_hash_latch> g
+ {buf_pool.page_hash.lock_get(chain)};
+ block= reinterpret_cast<buf_block_t*>
+ (buf_pool.page_hash.get(page_id, chain));
+ if (!block || !block->page.frame)
+ /* FIXME: convert ROW_FORMAT=COMPRESSED, without buf_zip_decompress() */
+ return;
+ /* To avoid a deadlock with buf_LRU_free_page() of some other page
+ and buf_page_write_complete() of this page, we must not wait for a
+ page latch while holding a page_hash latch. */
+ fix= block->page.fix();
+ }
- mtr->memo_push(block, MTR_MEMO_PAGE_X_FIX);
- rw_lock_x_lock_inline(&block->lock, 0, file, line);
- buf_block_dbg_add_level(block, SYNC_NO_ORDER_CHECK);
+ if (UNIV_UNLIKELY(fix < buf_page_t::UNFIXED))
+ {
+ block->page.unfix();
+ return;
+ }
+ block->page.lock.x_lock();
#ifdef BTR_CUR_HASH_ADAPT
- if (block->index)
- btr_search_drop_page_hash_index(block, false);
+ if (block->index)
+ btr_search_drop_page_hash_index(block, false);
#endif /* BTR_CUR_HASH_ADAPT */
-
- block->page.status= buf_page_t::FREED;
- return;
- }
- }
-
- hash_lock->read_unlock();
+ block->page.set_freed(block->page.state());
+ mtr->memo_push(block, MTR_MEMO_PAGE_X_MODIFY);
}
/** Get read access to a compressed page (usually of type
FIL_PAGE_TYPE_ZBLOB or FIL_PAGE_TYPE_ZBLOB2).
-The page must be released with buf_page_release_zip().
+The page must be released with unfix().
NOTE: the page is not protected by any latch. Mutual exclusion has to
be implemented at a higher level. In other words, all possible
accesses to a given page through this function must be protected by
the same set of mutexes or latches.
-@param[in] page_id page id
-@param[in] zip_size ROW_FORMAT=COMPRESSED page size
-@return pointer to the block */
+@param page_id page identifier
+@param zip_size ROW_FORMAT=COMPRESSED page size in bytes
+@return pointer to the block, s-latched */
+TRANSACTIONAL_TARGET
buf_page_t* buf_page_get_zip(const page_id_t page_id, ulint zip_size)
{
ut_ad(zip_size);
ut_ad(ut_is_2pow(zip_size));
- buf_pool.stat.n_page_gets++;
+ ++buf_pool.stat.n_page_gets;
- bool discard_attempted= false;
- const ulint fold= page_id.fold();
+ buf_pool_t::hash_chain &chain= buf_pool.page_hash.cell_get(page_id.fold());
+ page_hash_latch &hash_lock= buf_pool.page_hash.lock_get(chain);
buf_page_t *bpage;
- page_hash_latch *hash_lock;
- for (;;)
- {
lookup:
- bpage= buf_pool.page_hash_get_locked<false>(page_id, fold, &hash_lock);
- if (bpage)
- break;
-
- dberr_t err= buf_read_page(page_id, zip_size);
-
- if (UNIV_UNLIKELY(err != DB_SUCCESS))
+ for (bool discard_attempted= false;;)
+ {
+#ifndef NO_ELISION
+ if (xbegin())
{
- ib::error() << "Reading compressed page " << page_id
- << " failed with error: " << err;
- goto err_exit;
+ if (hash_lock.is_locked())
+ xabort();
+ bpage= buf_pool.page_hash.get(page_id, chain);
+ if (!bpage || buf_pool.watch_is_sentinel(*bpage))
+ {
+ xend();
+ goto must_read_page;
+ }
+ if (!bpage->zip.data)
+ {
+ /* There is no ROW_FORMAT=COMPRESSED page. */
+ xend();
+ return nullptr;
+ }
+ if (discard_attempted || !bpage->frame)
+ {
+ if (!bpage->lock.s_lock_try())
+ xabort();
+ xend();
+ break;
+ }
+ xend();
}
+ else
+#endif
+ {
+ hash_lock.lock_shared();
+ bpage= buf_pool.page_hash.get(page_id, chain);
+ if (!bpage || buf_pool.watch_is_sentinel(*bpage))
+ {
+ hash_lock.unlock_shared();
+ goto must_read_page;
+ }
-#ifdef UNIV_DEBUG
- if (!(++buf_dbg_counter % 5771)) buf_pool.validate();
-#endif /* UNIV_DEBUG */
- }
-
- ut_ad(hash_lock->is_read_locked());
+ ut_ad(bpage->in_file());
+ ut_ad(page_id == bpage->id());
- if (!bpage->zip.data)
- {
- /* There is no compressed page. */
-err_exit:
- hash_lock->read_unlock();
- return nullptr;
- }
+ if (!bpage->zip.data)
+ {
+ /* There is no ROW_FORMAT=COMPRESSED page. */
+ hash_lock.unlock_shared();
+ return nullptr;
+ }
- ut_ad(!buf_pool.watch_is_sentinel(*bpage));
+ if (discard_attempted || !bpage->frame)
+ {
+ /* Even when we are holding a hash_lock, it should be
+ acceptable to wait for a page S-latch here, because
+ buf_page_t::read_complete() will not wait for buf_pool.mutex,
+ and because S-latch would not conflict with a U-latch
+ that would be protecting buf_page_t::write_complete(). */
+ bpage->lock.s_lock();
+ hash_lock.unlock_shared();
+ break;
+ }
- switch (bpage->state()) {
- case BUF_BLOCK_ZIP_PAGE:
- bpage->fix();
- goto got_block;
- case BUF_BLOCK_FILE_PAGE:
- /* Discard the uncompressed page frame if possible. */
- if (!discard_attempted)
- {
- discard_attempted= true;
- hash_lock->read_unlock();
- mysql_mutex_lock(&buf_pool.mutex);
- if (buf_page_t *bpage= buf_pool.page_hash_get_low(page_id, fold))
- buf_LRU_free_page(bpage, false);
- mysql_mutex_unlock(&buf_pool.mutex);
- goto lookup;
+ hash_lock.unlock_shared();
}
- buf_block_buf_fix_inc(reinterpret_cast<buf_block_t*>(bpage),
- __FILE__, __LINE__);
- goto got_block;
- default:
- break;
+ discard_attempted= true;
+ mysql_mutex_lock(&buf_pool.mutex);
+ if (buf_page_t *bpage= buf_pool.page_hash.get(page_id, chain))
+ buf_LRU_free_page(bpage, false);
+ mysql_mutex_unlock(&buf_pool.mutex);
}
- ut_error;
- goto err_exit;
-
-got_block:
- bool must_read= bpage->io_fix() == BUF_IO_READ;
- hash_lock->read_unlock();
-
- DBUG_ASSERT(bpage->status != buf_page_t::FREED);
+ {
+ ut_d(const auto s=) bpage->fix();
+ ut_ad(s >= buf_page_t::UNFIXED);
+ ut_ad(s < buf_page_t::READ_FIX || s >= buf_page_t::WRITE_FIX);
+ }
bpage->set_accessed();
buf_page_make_young_if_needed(bpage);
@@ -2431,15 +2264,18 @@ got_block:
#ifdef UNIV_DEBUG
if (!(++buf_dbg_counter % 5771)) buf_pool.validate();
#endif /* UNIV_DEBUG */
- ut_ad(bpage->buf_fix_count());
- ut_ad(bpage->in_file());
-
- if (must_read)
- /* Let us wait until the read operation completes */
- while (bpage->io_fix() == BUF_IO_READ)
- os_thread_sleep(WAIT_FOR_READ);
-
return bpage;
+
+must_read_page:
+ switch (dberr_t err= buf_read_page(page_id, zip_size)) {
+ case DB_SUCCESS:
+ case DB_SUCCESS_LOCKED_REC:
+ goto lookup;
+ default:
+ ib::error() << "Reading compressed page " << page_id
+ << " failed with error: " << err;
+ return nullptr;
+ }
}
/********************************************************************//**
@@ -2495,15 +2331,9 @@ buf_zip_decompress(
<< block->page.id() << ": stored: "
<< mach_read_from_4(frame + FIL_PAGE_SPACE_OR_CHKSUM)
<< ", crc32: "
- << page_zip_calc_checksum(
- frame, size, SRV_CHECKSUM_ALGORITHM_CRC32)
- << " innodb: "
- << page_zip_calc_checksum(
- frame, size, SRV_CHECKSUM_ALGORITHM_INNODB)
- << ", none: "
- << page_zip_calc_checksum(
- frame, size, SRV_CHECKSUM_ALGORITHM_NONE)
- << " (algorithm: " << srv_checksum_algorithm << ")";
+ << page_zip_calc_checksum(frame, size, false)
+ << " adler32: "
+ << page_zip_calc_checksum(frame, size, true);
goto err_exit;
}
@@ -2511,7 +2341,8 @@ buf_zip_decompress(
case FIL_PAGE_INDEX:
case FIL_PAGE_RTREE:
if (page_zip_decompress(&block->page.zip,
- block->frame, TRUE)) {
+ block->page.frame, TRUE)) {
+func_exit:
if (space) {
space->release();
}
@@ -2530,12 +2361,8 @@ buf_zip_decompress(
case FIL_PAGE_TYPE_ZBLOB:
case FIL_PAGE_TYPE_ZBLOB2:
/* Copy to uncompressed storage. */
- memcpy(block->frame, frame, block->zip_size());
- if (space) {
- space->release();
- }
-
- return(TRUE);
+ memcpy(block->page.frame, frame, block->zip_size());
+ goto func_exit;
}
ib::error() << "Unknown compressed page type "
@@ -2550,91 +2377,19 @@ err_exit:
}
if (space) {
- if (encrypted) {
- dict_set_encrypted_by_space(space);
- } else {
- dict_set_corrupted_by_space(space);
- }
-
space->release();
}
return(FALSE);
}
-/** Wait for the block to be read in.
-@param[in] block The block to check */
-static
-void
-buf_wait_for_read(
- buf_block_t* block)
-{
- /* Note:
-
- We are using the block->lock to check for IO state.
- We set the IO_READ state under the protection of the hash_lock.
- This is safe because another thread can only
- access the block (and check for IO state) after the block has been
- added to the page hashtable. */
-
- while (block->page.io_fix() == BUF_IO_READ) {
- rw_lock_s_lock(&block->lock);
- rw_lock_s_unlock(&block->lock);
- }
-}
-
-/** Lock the page with the given latch type.
-@param[in,out] block block to be locked
-@param[in] rw_latch RW_S_LATCH, RW_X_LATCH, RW_NO_LATCH
-@param[in] mtr mini-transaction
-@param[in] file file name
-@param[in] line line where called
-@return pointer to locked block */
-static buf_block_t* buf_page_mtr_lock(buf_block_t *block,
- ulint rw_latch,
- mtr_t* mtr,
- const char *file,
- unsigned line)
-{
- mtr_memo_type_t fix_type;
- switch (rw_latch)
- {
- case RW_NO_LATCH:
- fix_type= MTR_MEMO_BUF_FIX;
- goto done;
- case RW_S_LATCH:
- rw_lock_s_lock_inline(&block->lock, 0, file, line);
- fix_type= MTR_MEMO_PAGE_S_FIX;
- break;
- case RW_SX_LATCH:
- rw_lock_sx_lock_inline(&block->lock, 0, file, line);
- fix_type= MTR_MEMO_PAGE_SX_FIX;
- break;
- default:
- ut_ad(rw_latch == RW_X_LATCH);
- rw_lock_x_lock_inline(&block->lock, 0, file, line);
- fix_type= MTR_MEMO_PAGE_X_FIX;
- break;
- }
-
-#ifdef BTR_CUR_HASH_ADAPT
- btr_search_drop_page_hash_index(block, true);
-#endif /* BTR_CUR_HASH_ADAPT */
-
-done:
- mtr_memo_push(mtr, block, fix_type);
- return block;
-}
-
/** Low level function used to get access to a database page.
@param[in] page_id page id
@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
@param[in] rw_latch RW_S_LATCH, RW_X_LATCH, RW_NO_LATCH
@param[in] guess guessed block or NULL
@param[in] mode BUF_GET, BUF_GET_IF_IN_POOL,
-BUF_PEEK_IF_IN_POOL, BUF_GET_NO_LATCH, or BUF_GET_IF_IN_POOL_OR_WATCH
-@param[in] file file name
-@param[in] line line where called
+BUF_PEEK_IF_IN_POOL, or BUF_GET_IF_IN_POOL_OR_WATCH
@param[in] mtr mini-transaction
@param[out] err DB_SUCCESS or error code
@param[in] allow_ibuf_merge Allow change buffer merge to happen
@@ -2642,6 +2397,7 @@ while reading the page from file
then it makes sure that it does merging of change buffer changes while
reading the page from file.
@return pointer to the block or NULL */
+TRANSACTIONAL_TARGET
buf_block_t*
buf_page_get_low(
const page_id_t page_id,
@@ -2649,28 +2405,19 @@ buf_page_get_low(
ulint rw_latch,
buf_block_t* guess,
ulint mode,
- const char* file,
- unsigned line,
mtr_t* mtr,
dberr_t* err,
bool allow_ibuf_merge)
{
- buf_block_t* block;
unsigned access_time;
ulint retries = 0;
- const ulint fold = page_id.fold();
- ut_ad((mtr == NULL) == (mode == BUF_EVICT_IF_IN_POOL));
ut_ad(!mtr || mtr->is_active());
+ ut_ad(mtr || mode == BUF_PEEK_IF_IN_POOL);
ut_ad((rw_latch == RW_S_LATCH)
|| (rw_latch == RW_X_LATCH)
|| (rw_latch == RW_SX_LATCH)
|| (rw_latch == RW_NO_LATCH));
- ut_ad(!allow_ibuf_merge
- || mode == BUF_GET
- || mode == BUF_GET_POSSIBLY_FREED
- || mode == BUF_GET_IF_IN_POOL
- || mode == BUF_GET_IF_IN_POOL_OR_WATCH);
if (err) {
*err = DB_SUCCESS;
@@ -2678,26 +2425,18 @@ buf_page_get_low(
#ifdef UNIV_DEBUG
switch (mode) {
- case BUF_EVICT_IF_IN_POOL:
- /* After DISCARD TABLESPACE, the tablespace would not exist,
- but in IMPORT TABLESPACE, PageConverter::operator() must
- replace any old pages, which were not evicted during DISCARD.
- Skip the assertion on space_page_size. */
+ default:
+ ut_ad(!allow_ibuf_merge);
+ ut_ad(mode == BUF_PEEK_IF_IN_POOL);
break;
- case BUF_PEEK_IF_IN_POOL:
+ case BUF_GET_POSSIBLY_FREED:
case BUF_GET_IF_IN_POOL:
/* The caller may pass a dummy page size,
because it does not really matter. */
break;
- default:
- ut_error;
- case BUF_GET_POSSIBLY_FREED:
- break;
- case BUF_GET_NO_LATCH:
- ut_ad(rw_latch == RW_NO_LATCH);
- /* fall through */
case BUF_GET:
case BUF_GET_IF_IN_POOL_OR_WATCH:
+ ut_ad(!mtr->is_freeing_tree());
fil_space_t* s = fil_space_get(page_id.space());
ut_ad(s);
ut_ad(s->zip_size() == zip_size);
@@ -2705,487 +2444,459 @@ buf_page_get_low(
#endif /* UNIV_DEBUG */
ut_ad(!mtr || !ibuf_inside(mtr)
- || ibuf_page_low(page_id, zip_size, FALSE, file, line, NULL));
+ || ibuf_page_low(page_id, zip_size, FALSE, NULL));
- buf_pool.stat.n_page_gets++;
-loop:
- buf_block_t* fix_block;
- block = guess;
+ ++buf_pool.stat.n_page_gets;
- page_hash_latch* hash_lock = buf_pool.page_hash.lock<false>(fold);
+ auto& chain= buf_pool.page_hash.cell_get(page_id.fold());
+ page_hash_latch& hash_lock = buf_pool.page_hash.lock_get(chain);
+loop:
+ buf_block_t* block = guess;
+ uint32_t state;
if (block) {
-
- /* If the guess is a compressed page descriptor that
- has been allocated by buf_page_alloc_descriptor(),
- it may have been freed by buf_relocate(). */
-
- if (!buf_pool.is_uncompressed(block)
- || page_id != block->page.id()
- || block->page.state() != BUF_BLOCK_FILE_PAGE) {
- /* Our guess was bogus or things have changed
- since. */
- guess = nullptr;
- goto lookup;
- } else {
+ transactional_shared_lock_guard<page_hash_latch> g{hash_lock};
+ if (buf_pool.is_uncompressed(block)
+ && page_id == block->page.id()) {
ut_ad(!block->page.in_zip_hash);
- }
- } else {
-lookup:
- block = reinterpret_cast<buf_block_t*>(
- buf_pool.page_hash_get_low(page_id, fold));
- }
-
- if (!block || buf_pool.watch_is_sentinel(block->page)) {
- hash_lock->read_unlock();
- block = nullptr;
- }
-
- if (UNIV_UNLIKELY(!block)) {
- /* Page not in buf_pool: needs to be read from file */
- if (mode == BUF_GET_IF_IN_POOL_OR_WATCH) {
- hash_lock = buf_pool.page_hash.lock<true>(fold);
-
- if (buf_page_t *bpage= buf_pool.watch_set(
- page_id, &hash_lock)) {
- /* We can release hash_lock after we
- increment the fix count to make
- sure that no state change takes place. */
- bpage->fix();
- hash_lock->write_unlock();
- block = reinterpret_cast<buf_block_t*>(bpage);
- fix_block = block;
+ state = block->page.state();
+ /* Ignore guesses that point to read-fixed blocks.
+ We can only avoid a race condition by
+ looking up the block via buf_pool.page_hash. */
+ if ((state >= buf_page_t::FREED
+ && state < buf_page_t::READ_FIX)
+ || state >= buf_page_t::WRITE_FIX) {
+ state = block->page.fix();
goto got_block;
}
-
- hash_lock->write_unlock();
}
+ }
- switch (mode) {
- case BUF_GET_IF_IN_POOL:
- case BUF_GET_IF_IN_POOL_OR_WATCH:
- case BUF_PEEK_IF_IN_POOL:
- case BUF_EVICT_IF_IN_POOL:
- return(NULL);
- }
-
- /* The call path is buf_read_page() ->
- buf_read_page_low() (fil_space_t::io()) ->
- buf_page_read_complete() ->
- buf_decrypt_after_read(). Here fil_space_t* is used
- and we decrypt -> buf_page_check_corrupt() where page
- checksums are compared. Decryption, decompression as
- well as error handling takes place at a lower level.
- Here we only need to know whether the page really is
- corrupted, or if an encrypted page with a valid
- checksum cannot be decypted. */
-
- dberr_t local_err = buf_read_page(page_id, zip_size);
-
- if (local_err == DB_SUCCESS) {
- buf_read_ahead_random(page_id, zip_size,
- ibuf_inside(mtr));
-
- retries = 0;
- } else if (mode == BUF_GET_POSSIBLY_FREED) {
- if (err) {
- *err = local_err;
- }
- return NULL;
- } else if (retries < BUF_PAGE_READ_MAX_RETRIES) {
- ++retries;
-
- DBUG_EXECUTE_IF(
- "innodb_page_corruption_retries",
- retries = BUF_PAGE_READ_MAX_RETRIES;
- );
- } else {
- if (err) {
- *err = local_err;
- }
-
- /* Pages whose encryption key is unavailable or used
- key, encryption algorithm or encryption method is
- incorrect are marked as encrypted in
- buf_page_check_corrupt(). Unencrypted page could be
- corrupted in a way where the key_id field is
- nonzero. There is no checksum on field
- FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION. */
- if (local_err == DB_DECRYPTION_FAILED) {
- return (NULL);
- }
-
- if (local_err == DB_PAGE_CORRUPTED
- && srv_force_recovery) {
- return NULL;
- }
-
- /* Try to set table as corrupted instead of
- asserting. */
- if (page_id.space() == TRX_SYS_SPACE) {
- } else if (page_id.space() == SRV_TMP_SPACE_ID) {
- } else if (fil_space_t* space= fil_space_t::get(
- page_id.space())) {
- bool set = dict_set_corrupted_by_space(space);
- space->release();
- if (set) {
- return NULL;
- }
- }
+ guess = nullptr;
+
+ /* A memory transaction would frequently be aborted here. */
+ hash_lock.lock_shared();
+ block = reinterpret_cast<buf_block_t*>(
+ buf_pool.page_hash.get(page_id, chain));
+ if (UNIV_LIKELY(block
+ && !buf_pool.watch_is_sentinel(block->page))) {
+ state = block->page.fix();
+ hash_lock.unlock_shared();
+ goto got_block;
+ }
+ hash_lock.unlock_shared();
- if (local_err == DB_IO_ERROR) {
- return NULL;
- }
+ /* Page not in buf_pool: needs to be read from file */
+ switch (mode) {
+ case BUF_GET_IF_IN_POOL:
+ case BUF_PEEK_IF_IN_POOL:
+ return nullptr;
+ case BUF_GET_IF_IN_POOL_OR_WATCH:
+ /* Buffer-fixing inside watch_set() will prevent eviction */
+ block = reinterpret_cast<buf_block_t*>
+ (buf_pool.watch_set(page_id, chain));
- ib::fatal() << "Unable to read page " << page_id
- << " into the buffer pool after "
- << BUF_PAGE_READ_MAX_RETRIES
- << ". The most probable cause"
- " of this error may be that the"
- " table has been corrupted."
- " See https://mariadb.com/kb/en/library/innodb-recovery-modes/";
+ if (block) {
+ state = block->page.state();
+ goto got_block_fixed;
}
-#ifdef UNIV_DEBUG
- if (!(++buf_dbg_counter % 5771)) buf_pool.validate();
-#endif /* UNIV_DEBUG */
- goto loop;
- } else {
- fix_block = block;
+ return nullptr;
}
- fix_block->fix();
- hash_lock->read_unlock();
-
-got_block:
- switch (mode) {
- default:
- ut_ad(block->zip_size() == zip_size);
+ /* The call path is buf_read_page() ->
+ buf_read_page_low() (fil_space_t::io()) ->
+ buf_page_t::read_complete() ->
+ buf_decrypt_after_read(). Here fil_space_t* is used
+ and we decrypt -> buf_page_check_corrupt() where page
+ checksums are compared. Decryption, decompression as
+ well as error handling takes place at a lower level.
+ Here we only need to know whether the page really is
+ corrupted, or if an encrypted page with a valid
+	checksum cannot be decrypted. */
+
+ switch (dberr_t local_err = buf_read_page(page_id, zip_size)) {
+ case DB_SUCCESS:
+ case DB_SUCCESS_LOCKED_REC:
+ buf_read_ahead_random(page_id, zip_size, ibuf_inside(mtr));
break;
- case BUF_GET_IF_IN_POOL:
- case BUF_PEEK_IF_IN_POOL:
- case BUF_EVICT_IF_IN_POOL:
- if (fix_block->page.io_fix() == BUF_IO_READ) {
- /* The page is being read to buffer pool,
- but we cannot wait around for the read to
- complete. */
- fix_block->unfix();
- return(NULL);
+ default:
+ if (mode != BUF_GET_POSSIBLY_FREED
+ && retries++ < BUF_PAGE_READ_MAX_RETRIES) {
+ DBUG_EXECUTE_IF("intermittent_read_failure",
+ retries = BUF_PAGE_READ_MAX_RETRIES;);
+ }
+ /* fall through */
+ case DB_PAGE_CORRUPTED:
+ if (err) {
+ *err = local_err;
}
+ return nullptr;
}
- switch (UNIV_EXPECT(fix_block->page.state(), BUF_BLOCK_FILE_PAGE)) {
- case BUF_BLOCK_FILE_PAGE:
- if (fsp_is_system_temporary(page_id.space())
- && block->page.io_fix() != BUF_IO_NONE) {
- /* This suggests that the page is being flushed.
- Avoid returning reference to this page.
- Instead wait for the flush action to complete. */
- fix_block->unfix();
- os_thread_sleep(WAIT_FOR_WRITE);
- goto loop;
- }
+ ut_d(if (!(++buf_dbg_counter % 5771)) buf_pool.validate());
+ goto loop;
- if (UNIV_UNLIKELY(mode == BUF_EVICT_IF_IN_POOL)) {
-evict_from_pool:
- ut_ad(!fix_block->page.oldest_modification());
- mysql_mutex_lock(&buf_pool.mutex);
- fix_block->unfix();
+got_block:
+ ut_ad(!block->page.in_zip_hash);
+ state++;
+got_block_fixed:
+ ut_ad(state > buf_page_t::FREED);
- if (!buf_LRU_free_page(&fix_block->page, true)) {
- ut_ad(0);
+ if (state > buf_page_t::READ_FIX && state < buf_page_t::WRITE_FIX) {
+ if (mode == BUF_PEEK_IF_IN_POOL) {
+ignore_block:
+ ut_ad(mode == BUF_GET_POSSIBLY_FREED
+ || mode == BUF_PEEK_IF_IN_POOL);
+ block->unfix();
+ if (err) {
+ *err = DB_CORRUPTION;
}
-
- mysql_mutex_unlock(&buf_pool.mutex);
- return(NULL);
+ return nullptr;
}
- break;
- default:
- ut_error;
- break;
-
- case BUF_BLOCK_ZIP_PAGE:
- if (UNIV_UNLIKELY(mode == BUF_EVICT_IF_IN_POOL)) {
- goto evict_from_pool;
+ if (UNIV_UNLIKELY(!block->page.frame)) {
+ goto wait_for_unzip;
}
+ /* A read-fix is released after block->page.lock
+ in buf_page_t::read_complete() or
+ buf_pool_t::corrupted_evict(), or
+ after buf_zip_decompress() in this function. */
+ block->page.lock.s_lock();
+ state = block->page.state();
+ ut_ad(state < buf_page_t::READ_FIX
+ || state >= buf_page_t::WRITE_FIX);
+ const page_id_t id{block->page.id()};
+ block->page.lock.s_unlock();
+
+ if (UNIV_UNLIKELY(id != page_id)) {
+ ut_ad(id == page_id_t{~0ULL});
+ block->page.unfix();
+ if (++retries < BUF_PAGE_READ_MAX_RETRIES) {
+ goto loop;
+ }
- if (mode == BUF_PEEK_IF_IN_POOL) {
- /* This mode is only used for dropping an
- adaptive hash index. There cannot be an
- adaptive hash index for a compressed-only
- page, so do not bother decompressing the page. */
- fix_block->unfix();
+ if (err) {
+ *err = DB_PAGE_CORRUPTED;
+ }
- return(NULL);
+ return nullptr;
}
+ } else if (mode != BUF_PEEK_IF_IN_POOL) {
+ } else if (!mtr) {
+ ut_ad(!block->page.oldest_modification());
+ mysql_mutex_lock(&buf_pool.mutex);
+ block->unfix();
- buf_page_t* bpage = &block->page;
-
- /* Note: We have already buffer fixed this block. */
- if (bpage->buf_fix_count() > 1
- || bpage->io_fix() != BUF_IO_NONE) {
-
- /* This condition often occurs when the buffer
- is not buffer-fixed, but I/O-fixed by
- buf_page_init_for_read(). */
- fix_block->unfix();
+free_unfixed_block:
+ if (!buf_LRU_free_page(&block->page, true)) {
+ ut_ad(0);
+ }
- /* The block is buffer-fixed or I/O-fixed.
- Try again later. */
- os_thread_sleep(WAIT_FOR_READ);
+ mysql_mutex_unlock(&buf_pool.mutex);
+ return nullptr;
+ } else if (UNIV_UNLIKELY(!block->page.frame)) {
+ /* The BUF_PEEK_IF_IN_POOL mode is mainly used for dropping an
+ adaptive hash index. There cannot be an
+ adaptive hash index for a compressed-only page. */
+ goto ignore_block;
+ }
+ ut_ad(mode == BUF_GET_IF_IN_POOL || mode == BUF_PEEK_IF_IN_POOL
+ || block->zip_size() == zip_size);
+
+ if (UNIV_UNLIKELY(!block->page.frame)) {
+ if (!block->page.lock.x_lock_try()) {
+wait_for_unzip:
+ /* The page is being read or written, or
+ another thread is executing buf_zip_decompress()
+ in buf_page_get_low() on it. */
+ block->page.unfix();
+ std::this_thread::sleep_for(
+ std::chrono::microseconds(100));
goto loop;
}
- /* Buffer-fix the block so that it cannot be evicted
- or relocated while we are attempting to allocate an
- uncompressed page. */
-
- block = buf_LRU_get_free_block(false);
- buf_block_init_low(block);
+ buf_block_t *new_block = buf_LRU_get_free_block(false);
+ buf_block_init_low(new_block);
+wait_for_unfix:
mysql_mutex_lock(&buf_pool.mutex);
- hash_lock = buf_pool.page_hash.lock_get(fold);
-
- hash_lock->write_lock();
-
- /* Buffer-fixing prevents the page_hash from changing. */
- ut_ad(bpage == buf_pool.page_hash_get_low(page_id, fold));
-
- fix_block->unfix(); /* hash_lock protects us after this */
-
- if (bpage->buf_fix_count() || bpage->io_fix() != BUF_IO_NONE) {
- /* The block was buffer-fixed or I/O-fixed while
- buf_pool.mutex was not held by this thread.
- Free the block that was allocated and retry.
- This should be extremely unlikely, for example,
- if buf_page_get_zip() was invoked. */
+ page_hash_latch& hash_lock=buf_pool.page_hash.lock_get(chain);
+
+ /* It does not make sense to use
+ transactional_lock_guard here, because buf_relocate()
+ would likely make a memory transaction too large. */
+ hash_lock.lock();
+
+ /* block->page.lock implies !block->page.can_relocate() */
+ ut_ad(&block->page == buf_pool.page_hash.get(page_id, chain));
+
+ /* Wait for any other threads to release their buffer-fix
+ on the compressed-only block descriptor.
+ FIXME: Never fix() before acquiring the lock.
+	Only buf_page_get_gen(), buf_page_get_low(), and buf_page_free()
+	violate that principle. */
+ state = block->page.state();
+
+ switch (state) {
+ case buf_page_t::UNFIXED + 1:
+ case buf_page_t::IBUF_EXIST + 1:
+ case buf_page_t::REINIT + 1:
+ break;
+ default:
+ ut_ad(state < buf_page_t::READ_FIX);
+
+ if (state < buf_page_t::UNFIXED + 1) {
+ ut_ad(state > buf_page_t::FREED);
+ block->page.lock.x_unlock();
+ hash_lock.unlock();
+ buf_LRU_block_free_non_file_page(new_block);
+ mysql_mutex_unlock(&buf_pool.mutex);
+ goto ignore_block;
+ }
- hash_lock->write_unlock();
- buf_LRU_block_free_non_file_page(block);
mysql_mutex_unlock(&buf_pool.mutex);
-
- /* Try again */
- goto loop;
+ hash_lock.unlock();
+ std::this_thread::sleep_for(
+ std::chrono::microseconds(100));
+ goto wait_for_unfix;
}
- fix_block = block;
+ /* Ensure that another buf_page_get_low() will wait for
+ new_block->page.lock.x_unlock(). */
+ block->page.set_state(buf_page_t::READ_FIX);
- /* Move the compressed page from bpage to block,
+ /* Move the compressed page from block->page to new_block,
and uncompress it. */
- /* Note: this is the uncompressed block and it is not
- accessible by other threads yet because it is not in
- any list or hash table */
mysql_mutex_lock(&buf_pool.flush_list_mutex);
- buf_relocate(bpage, &block->page);
+ buf_relocate(&block->page, &new_block->page);
- /* Set after buf_relocate(). */
- block->page.set_buf_fix_count(1);
+ /* X-latch the block for the duration of the decompression. */
+ new_block->page.lock.x_lock();
+ ut_d(block->page.lock.x_unlock());
- buf_flush_relocate_on_flush_list(bpage, &block->page);
+ buf_flush_relocate_on_flush_list(&block->page,
+ &new_block->page);
mysql_mutex_unlock(&buf_pool.flush_list_mutex);
- /* Buffer-fix, I/O-fix, and X-latch the block
- for the duration of the decompression.
- Also add the block to the unzip_LRU list. */
- block->page.set_state(BUF_BLOCK_FILE_PAGE);
-
/* Insert at the front of unzip_LRU list */
- buf_unzip_LRU_add_block(block, FALSE);
+ buf_unzip_LRU_add_block(new_block, FALSE);
- block->page.set_io_fix(BUF_IO_READ);
- rw_lock_x_lock_inline(&block->lock, 0, file, line);
+ mysql_mutex_unlock(&buf_pool.mutex);
+ hash_lock.unlock();
- MEM_UNDEFINED(bpage, sizeof *bpage);
+#if defined SUX_LOCK_GENERIC || defined UNIV_DEBUG
+ block->page.lock.free();
+#endif
+ ut_free(reinterpret_cast<buf_page_t*>(block));
+ block = new_block;
- mysql_mutex_unlock(&buf_pool.mutex);
- hash_lock->write_unlock();
buf_pool.n_pend_unzip++;
access_time = block->page.is_accessed();
if (!access_time && !recv_no_ibuf_operations
- && ibuf_page_exists(block->page.id(), zip_size)) {
- block->page.ibuf_exist = true;
+ && ibuf_page_exists(block->page.id(), block->zip_size())) {
+ state = buf_page_t::IBUF_EXIST + 1;
}
- buf_page_free_descriptor(bpage);
-
/* Decompress the page while not holding
buf_pool.mutex. */
-
- if (!buf_zip_decompress(block, false)) {
- rw_lock_x_unlock(&fix_block->lock);
- fix_block->page.io_unfix();
- fix_block->unfix();
- --buf_pool.n_pend_unzip;
-
+ const auto ok = buf_zip_decompress(block, false);
+ --buf_pool.n_pend_unzip;
+ if (!ok) {
if (err) {
*err = DB_PAGE_CORRUPTED;
}
- return NULL;
+ mysql_mutex_lock(&buf_pool.mutex);
}
+ state = block->page.read_unfix(state);
+ block->page.lock.x_unlock();
- rw_lock_x_unlock(&block->lock);
- fix_block->page.io_unfix();
- --buf_pool.n_pend_unzip;
- break;
+ if (!ok) {
+ goto free_unfixed_block;
+ }
}
- ut_ad(block == fix_block);
- ut_ad(fix_block->page.buf_fix_count());
-
- ut_ad(fix_block->page.state() == BUF_BLOCK_FILE_PAGE);
-
#if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG
re_evict:
if (mode != BUF_GET_IF_IN_POOL
&& mode != BUF_GET_IF_IN_POOL_OR_WATCH) {
} else if (!ibuf_debug || recv_recovery_is_on()) {
} else if (fil_space_t* space = fil_space_t::get(page_id.space())) {
+ for (ulint i = 0; i < mtr->get_savepoint(); i++) {
+ if (buf_block_t* b = mtr->block_at_savepoint(i)) {
+ if (b->page.oldest_modification() > 2
+ && b->page.lock.have_any()) {
+ /* We are holding a dirty page latch
+ that would hang buf_flush_sync(). */
+ space->release();
+ goto re_evict_fail;
+ }
+ }
+ }
+
/* Try to evict the block from the buffer pool, to use the
insert buffer (change buffer) as much as possible. */
mysql_mutex_lock(&buf_pool.mutex);
- fix_block->unfix();
+ block->unfix();
/* Blocks cannot be relocated or enter or exit the
buf_pool while we are holding the buf_pool.mutex. */
- const bool evicted = buf_LRU_free_page(&fix_block->page, true);
+ const bool evicted = buf_LRU_free_page(&block->page, true);
space->release();
- if (evicted) {
- hash_lock = buf_pool.page_hash.lock_get(fold);
- hash_lock->write_lock();
- mysql_mutex_unlock(&buf_pool.mutex);
- /* We may set the watch, as it would have
- been set if the page were not in the
- buffer pool in the first place. */
- block= reinterpret_cast<buf_block_t*>(
- mode == BUF_GET_IF_IN_POOL_OR_WATCH
- ? buf_pool.watch_set(page_id, &hash_lock)
- : buf_pool.page_hash_get_low(page_id, fold));
- hash_lock->write_unlock();
-
- if (block != NULL) {
- /* Either the page has been read in or
- a watch was set on that in the window
- where we released the buf_pool.mutex
- and before we acquire the hash_lock
- above. Try again. */
- guess = block;
+ if (!evicted) {
+ block->fix();
+ }
- goto loop;
- }
+ mysql_mutex_unlock(&buf_pool.mutex);
+ if (evicted) {
+ if (mode == BUF_GET_IF_IN_POOL_OR_WATCH) {
+ buf_pool.watch_set(page_id, chain);
+ }
return(NULL);
}
- fix_block->fix();
- mysql_mutex_unlock(&buf_pool.mutex);
buf_flush_sync();
- if (fix_block->page.buf_fix_count() == 1
- && !fix_block->page.oldest_modification()) {
+ state = block->page.state();
+
+ if (state == buf_page_t::UNFIXED + 1
+ && !block->page.oldest_modification()) {
goto re_evict;
}
/* Failed to evict the page; change it directly */
}
+re_evict_fail:
#endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */
- ut_ad(fix_block->page.buf_fix_count());
-
-#ifdef UNIV_DEBUG
- /* We have already buffer fixed the page, and we are committed to
- returning this page to the caller. Register for debugging.
- Avoid debug latching if page/block belongs to system temporary
- tablespace (Not much needed for table with single threaded access.). */
- if (!fsp_is_system_temporary(page_id.space())) {
- ibool ret;
- ret = rw_lock_s_lock_nowait(
- fix_block->debug_latch, file, line);
- ut_a(ret);
- }
-#endif /* UNIV_DEBUG */
-
- /* While tablespace is reinited the indexes are already freed but the
- blocks related to it still resides in buffer pool. Trying to remove
- such blocks from buffer pool would invoke removal of AHI entries
- associated with these blocks. Logic to remove AHI entry will try to
- load the block but block is already in free state. Handle the said case
- with mode = BUF_PEEK_IF_IN_POOL that is invoked from
- "btr_search_drop_page_hash_when_freed". */
- ut_ad(mode == BUF_GET_POSSIBLY_FREED
- || mode == BUF_PEEK_IF_IN_POOL
- || fix_block->page.status != buf_page_t::FREED);
-
- const bool not_first_access = fix_block->page.set_accessed();
-
- if (mode != BUF_PEEK_IF_IN_POOL) {
- buf_page_make_young_if_needed(&fix_block->page);
+ if (UNIV_UNLIKELY(state < buf_page_t::UNFIXED)) {
+ goto ignore_block;
}
+ ut_ad((~buf_page_t::LRU_MASK) & state);
+ ut_ad(state > buf_page_t::WRITE_FIX || state < buf_page_t::READ_FIX);
#ifdef UNIV_DEBUG
if (!(++buf_dbg_counter % 5771)) buf_pool.validate();
#endif /* UNIV_DEBUG */
- ut_ad(fix_block->page.state() == BUF_BLOCK_FILE_PAGE);
+ ut_ad(block->page.frame);
- /* We have to wait here because the IO_READ state was set
- under the protection of the hash_lock and not block->lock. */
- buf_wait_for_read(fix_block);
+ if (state >= buf_page_t::UNFIXED
+ && allow_ibuf_merge
+ && fil_page_get_type(block->page.frame) == FIL_PAGE_INDEX
+ && page_is_leaf(block->page.frame)) {
+ block->page.lock.x_lock();
+ ut_ad(block->page.id() == page_id
+ || (state >= buf_page_t::READ_FIX
+ && state < buf_page_t::WRITE_FIX));
- if (fix_block->page.id() != page_id) {
- fix_block->unfix();
+#ifdef BTR_CUR_HASH_ADAPT
+ btr_search_drop_page_hash_index(block, true);
+#endif /* BTR_CUR_HASH_ADAPT */
-#ifdef UNIV_DEBUG
- if (!fsp_is_system_temporary(page_id.space())) {
- rw_lock_s_unlock(fix_block->debug_latch);
- }
-#endif /* UNIV_DEBUG */
+ dberr_t e;
- if (err) {
- *err = DB_PAGE_CORRUPTED;
+ if (UNIV_UNLIKELY(block->page.id() != page_id)) {
+page_id_mismatch:
+ state = block->page.state();
+ e = DB_CORRUPTION;
+ibuf_merge_corrupted:
+ if (err) {
+ *err = e;
+ }
+
+ if (block->page.id().is_corrupted()) {
+ buf_pool.corrupted_evict(&block->page, state);
+ }
+ return nullptr;
}
- return NULL;
- }
+ state = block->page.state();
+ ut_ad(state < buf_page_t::READ_FIX);
- if (fix_block->page.status != buf_page_t::FREED
- && allow_ibuf_merge
- && fil_page_get_type(fix_block->frame) == FIL_PAGE_INDEX
- && page_is_leaf(fix_block->frame)) {
- rw_lock_x_lock_inline(&fix_block->lock, 0, file, line);
-
- if (fix_block->page.ibuf_exist) {
- fix_block->page.ibuf_exist = false;
- ibuf_merge_or_delete_for_page(fix_block, page_id,
- zip_size);
+ if (state >= buf_page_t::IBUF_EXIST
+ && state < buf_page_t::REINIT) {
+ block->page.clear_ibuf_exist();
+ e = ibuf_merge_or_delete_for_page(block, page_id,
+ block->zip_size());
+ if (UNIV_UNLIKELY(e != DB_SUCCESS)) {
+ goto ibuf_merge_corrupted;
+ }
}
if (rw_latch == RW_X_LATCH) {
- mtr->memo_push(fix_block, MTR_MEMO_PAGE_X_FIX);
+ goto get_latch_valid;
} else {
- rw_lock_x_unlock(&fix_block->lock);
+ block->page.lock.x_unlock();
goto get_latch;
}
} else {
get_latch:
- fix_block = buf_page_mtr_lock(fix_block, rw_latch, mtr,
- file, line);
- }
+ switch (rw_latch) {
+ case RW_NO_LATCH:
+ mtr->memo_push(block, MTR_MEMO_BUF_FIX);
+ return block;
+ case RW_S_LATCH:
+ block->page.lock.s_lock();
+ ut_ad(!block->page.is_read_fixed());
+ if (UNIV_UNLIKELY(block->page.id() != page_id)) {
+ block->page.lock.s_unlock();
+ block->page.lock.x_lock();
+ goto page_id_mismatch;
+ }
+get_latch_valid:
+ mtr->memo_push(block, mtr_memo_type_t(rw_latch));
+#ifdef BTR_CUR_HASH_ADAPT
+ btr_search_drop_page_hash_index(block, true);
+#endif /* BTR_CUR_HASH_ADAPT */
+ break;
+ case RW_SX_LATCH:
+ block->page.lock.u_lock();
+ ut_ad(!block->page.is_io_fixed());
+ if (UNIV_UNLIKELY(block->page.id() != page_id)) {
+ block->page.lock.u_x_upgrade();
+ goto page_id_mismatch;
+ }
+ goto get_latch_valid;
+ default:
+ ut_ad(rw_latch == RW_X_LATCH);
+ if (block->page.lock.x_lock_upgraded()) {
+ ut_ad(block->page.id() == page_id);
+ block->unfix();
+ mtr->page_lock_upgrade(*block);
+ return block;
+ }
+ if (UNIV_UNLIKELY(block->page.id() != page_id)) {
+ goto page_id_mismatch;
+ }
+ goto get_latch_valid;
+ }
- if (!not_first_access && mode != BUF_PEEK_IF_IN_POOL) {
- /* In the case of a first access, try to apply linear
- read-ahead */
+ ut_ad(page_id_t(page_get_space_id(block->page.frame),
+ page_get_page_no(block->page.frame))
+ == page_id);
+
+ if (mode == BUF_GET_POSSIBLY_FREED
+ || mode == BUF_PEEK_IF_IN_POOL) {
+ return block;
+ }
- buf_read_ahead_linear(page_id, zip_size, ibuf_inside(mtr));
+ const bool not_first_access{block->page.set_accessed()};
+ buf_page_make_young_if_needed(&block->page);
+ if (!not_first_access) {
+ buf_read_ahead_linear(page_id, block->zip_size(),
+ ibuf_inside(mtr));
+ }
}
- return(fix_block);
+ return block;
}
/** Get access to a database page. Buffered redo log may be applied.
@@ -3194,10 +2905,8 @@ get_latch:
@param[in] rw_latch RW_S_LATCH, RW_X_LATCH, RW_NO_LATCH
@param[in] guess guessed block or NULL
@param[in] mode BUF_GET, BUF_GET_IF_IN_POOL,
-BUF_PEEK_IF_IN_POOL, BUF_GET_NO_LATCH, or BUF_GET_IF_IN_POOL_OR_WATCH
-@param[in] file file name
-@param[in] line line where called
-@param[in] mtr mini-transaction
+BUF_PEEK_IF_IN_POOL, or BUF_GET_IF_IN_POOL_OR_WATCH
+@param[in,out] mtr mini-transaction, or NULL
@param[out] err DB_SUCCESS or error code
@param[in] allow_ibuf_merge Allow change buffer merge while
reading the pages from file.
@@ -3209,194 +2918,204 @@ buf_page_get_gen(
ulint rw_latch,
buf_block_t* guess,
ulint mode,
- const char* file,
- unsigned line,
mtr_t* mtr,
dberr_t* err,
bool allow_ibuf_merge)
{
- if (buf_block_t *block= recv_sys.recover(page_id))
+ buf_block_t *block= recv_sys.recover(page_id);
+ if (UNIV_LIKELY(!block))
+ return buf_page_get_low(page_id, zip_size, rw_latch,
+ guess, mode, mtr, err, allow_ibuf_merge);
+ else if (UNIV_UNLIKELY(block == reinterpret_cast<buf_block_t*>(-1)))
{
- block->fix();
- ut_ad(rw_lock_s_lock_nowait(block->debug_latch, file, line));
+ corrupted:
if (err)
- *err= DB_SUCCESS;
- const bool must_merge= allow_ibuf_merge &&
- ibuf_page_exists(page_id, block->zip_size());
- if (block->page.status == buf_page_t::FREED)
- ut_ad(mode == BUF_GET_POSSIBLY_FREED || mode == BUF_PEEK_IF_IN_POOL);
- else if (must_merge && fil_page_get_type(block->frame) == FIL_PAGE_INDEX &&
- page_is_leaf(block->frame))
+ *err= DB_CORRUPTION;
+ return nullptr;
+ }
+ /* Recovery is a special case; we fix() before acquiring lock. */
+ auto s= block->page.fix();
+ ut_ad(s >= buf_page_t::FREED);
+ /* The block may be write-fixed at this point because we are not
+ holding a lock, but it must not be read-fixed. */
+ ut_ad(s < buf_page_t::READ_FIX || s >= buf_page_t::WRITE_FIX);
+ if (err)
+ *err= DB_SUCCESS;
+ const bool must_merge= allow_ibuf_merge &&
+ ibuf_page_exists(page_id, block->zip_size());
+ if (s < buf_page_t::UNFIXED)
+ {
+ got_freed_page:
+ ut_ad(mode == BUF_GET_POSSIBLY_FREED || mode == BUF_PEEK_IF_IN_POOL);
+ mysql_mutex_lock(&buf_pool.mutex);
+ block->page.unfix();
+ buf_LRU_free_page(&block->page, true);
+ mysql_mutex_unlock(&buf_pool.mutex);
+ goto corrupted;
+ }
+ else if (must_merge &&
+ fil_page_get_type(block->page.frame) == FIL_PAGE_INDEX &&
+ page_is_leaf(block->page.frame))
+ {
+ block->page.lock.x_lock();
+ s= block->page.state();
+ ut_ad(s > buf_page_t::FREED);
+ ut_ad(s < buf_page_t::READ_FIX);
+ if (s < buf_page_t::UNFIXED)
{
- rw_lock_x_lock_inline(&block->lock, 0, file, line);
- block->page.ibuf_exist= false;
- ibuf_merge_or_delete_for_page(block, page_id, block->zip_size());
-
- if (rw_latch == RW_X_LATCH)
+ block->page.lock.x_unlock();
+ goto got_freed_page;
+ }
+ else
+ {
+ if (block->page.is_ibuf_exist())
+ block->page.clear_ibuf_exist();
+ if (dberr_t e=
+ ibuf_merge_or_delete_for_page(block, page_id, block->zip_size()))
{
- mtr->memo_push(block, MTR_MEMO_PAGE_X_FIX);
- return block;
+ if (err)
+ *err= e;
+ buf_pool.corrupted_evict(&block->page, s);
+ return nullptr;
}
- rw_lock_x_unlock(&block->lock);
}
- block= buf_page_mtr_lock(block, rw_latch, mtr, file, line);
- return block;
- }
- return buf_page_get_low(page_id, zip_size, rw_latch,
- guess, mode, file, line, mtr, err, allow_ibuf_merge);
+ if (rw_latch == RW_X_LATCH)
+ {
+ mtr->memo_push(block, MTR_MEMO_PAGE_X_FIX);
+ return block;
+ }
+ block->page.lock.x_unlock();
+ }
+ mtr->page_lock(block, rw_latch);
+ return block;
}
/********************************************************************//**
This is the general function used to get optimistic access to a database
page.
@return TRUE if success */
-ibool
-buf_page_optimistic_get(
-/*====================*/
- ulint rw_latch,/*!< in: RW_S_LATCH, RW_X_LATCH */
- buf_block_t* block, /*!< in: guessed buffer block */
- ib_uint64_t modify_clock,/*!< in: modify clock value */
- const char* file, /*!< in: file name */
- unsigned line, /*!< in: line where called */
- mtr_t* mtr) /*!< in: mini-transaction */
+TRANSACTIONAL_TARGET
+bool buf_page_optimistic_get(ulint rw_latch, buf_block_t *block,
+ uint64_t modify_clock, mtr_t *mtr)
{
- ibool success;
-
- ut_ad(block);
- ut_ad(mtr);
- ut_ad(mtr->is_active());
- ut_ad(rw_latch == RW_S_LATCH || rw_latch == RW_X_LATCH);
-
- if (UNIV_UNLIKELY(block->page.state() != BUF_BLOCK_FILE_PAGE
- || block->page.io_fix() != BUF_IO_NONE)) {
- return FALSE;
- }
-
- const page_id_t id(block->page.id());
-
- page_hash_latch *hash_lock = buf_pool.hash_lock_get(id);
- hash_lock->read_lock();
-
- if (UNIV_UNLIKELY(id != block->page.id()
- || block->page.state() != BUF_BLOCK_FILE_PAGE
- || block->page.io_fix() != BUF_IO_NONE)) {
- hash_lock->read_unlock();
- return(FALSE);
- }
-
- buf_block_buf_fix_inc(block, file, line);
- hash_lock->read_unlock();
-
- block->page.set_accessed();
-
- buf_page_make_young_if_needed(&block->page);
-
- ut_ad(!ibuf_inside(mtr) || ibuf_page(id, block->zip_size(), NULL));
-
- mtr_memo_type_t fix_type;
-
- if (rw_latch == RW_S_LATCH) {
- fix_type = MTR_MEMO_PAGE_S_FIX;
- success = rw_lock_s_lock_nowait(&block->lock, file, line);
- } else {
- fix_type = MTR_MEMO_PAGE_X_FIX;
- success = rw_lock_x_lock_func_nowait_inline(
- &block->lock, file, line);
- }
-
- ut_ad(id == block->page.id());
+ ut_ad(block);
+ ut_ad(mtr);
+ ut_ad(mtr->is_active());
+ ut_ad(rw_latch == RW_S_LATCH || rw_latch == RW_X_LATCH);
- if (!success) {
- buf_block_buf_fix_dec(block);
- return(FALSE);
- }
+ if (have_transactional_memory);
+ else if (UNIV_UNLIKELY(!block->page.frame))
+ return false;
+ else
+ {
+ const auto state= block->page.state();
+ if (UNIV_UNLIKELY(state < buf_page_t::UNFIXED ||
+ state >= buf_page_t::READ_FIX))
+ return false;
+ }
- if (modify_clock != block->modify_clock) {
+ bool success;
+ const page_id_t id{block->page.id()};
+ buf_pool_t::hash_chain &chain= buf_pool.page_hash.cell_get(id.fold());
+ bool have_u_not_x= false;
- buf_block_dbg_add_level(block, SYNC_NO_ORDER_CHECK);
+ {
+ transactional_shared_lock_guard<page_hash_latch> g
+ {buf_pool.page_hash.lock_get(chain)};
+ if (UNIV_UNLIKELY(id != block->page.id() || !block->page.frame))
+ return false;
+ const auto state= block->page.state();
+ if (UNIV_UNLIKELY(state < buf_page_t::UNFIXED ||
+ state >= buf_page_t::READ_FIX))
+ return false;
+
+ if (rw_latch == RW_S_LATCH)
+ success= block->page.lock.s_lock_try();
+ else
+ {
+ have_u_not_x= block->page.lock.have_u_not_x();
+ success= have_u_not_x || block->page.lock.x_lock_try();
+ }
+ }
- if (rw_latch == RW_S_LATCH) {
- rw_lock_s_unlock(&block->lock);
- } else {
- rw_lock_x_unlock(&block->lock);
- }
+ if (!success)
+ return false;
- buf_block_buf_fix_dec(block);
- return(FALSE);
- }
+ if (have_u_not_x)
+ {
+ block->page.lock.u_x_upgrade();
+ mtr->page_lock_upgrade(*block);
+ ut_ad(id == block->page.id());
+ ut_ad(modify_clock == block->modify_clock);
+ }
+ else
+ {
+ ut_ad(rw_latch == RW_S_LATCH || !block->page.is_io_fixed());
+ ut_ad(id == block->page.id());
+ ut_ad(!ibuf_inside(mtr) || ibuf_page(id, block->zip_size(), nullptr));
- mtr_memo_push(mtr, block, fix_type);
+ if (modify_clock != block->modify_clock || block->page.is_freed())
+ {
+ if (rw_latch == RW_S_LATCH)
+ block->page.lock.s_unlock();
+ else
+ block->page.lock.x_unlock();
+ return false;
+ }
-#ifdef UNIV_DEBUG
- if (!(++buf_dbg_counter % 5771)) buf_pool.validate();
-#endif /* UNIV_DEBUG */
- ut_ad(block->page.buf_fix_count());
- ut_ad(block->page.state() == BUF_BLOCK_FILE_PAGE);
+ block->page.fix();
+ ut_ad(!block->page.is_read_fixed());
+ block->page.set_accessed();
+ buf_page_make_young_if_needed(&block->page);
+ mtr->memo_push(block, mtr_memo_type_t(rw_latch));
+ }
- buf_pool.stat.n_page_gets++;
+ ut_d(if (!(++buf_dbg_counter % 5771)) buf_pool.validate());
+ ut_d(const auto state = block->page.state());
+ ut_ad(state > buf_page_t::UNFIXED);
+ ut_ad(state < buf_page_t::READ_FIX || state > buf_page_t::WRITE_FIX);
+ ut_ad(~buf_page_t::LRU_MASK & state);
+ ut_ad(block->page.frame);
- return(TRUE);
+ ++buf_pool.stat.n_page_gets;
+ return true;
}
-/** Given a tablespace id and page number tries to get that page. If the
-page is not in the buffer pool it is not loaded and NULL is returned.
-Suitable for using when holding the lock_sys_t::mutex.
-@param[in] page_id page id
-@param[in] file file name
-@param[in] line line where called
-@param[in] mtr mini-transaction
-@return pointer to a page or NULL */
-buf_block_t*
-buf_page_try_get_func(
- const page_id_t page_id,
- const char* file,
- unsigned line,
- mtr_t* mtr)
+/** Try to S-latch a page.
+Suitable for use when holding the lock_sys latches (as it avoids deadlock).
+@param[in] page_id page identifier
+@param[in,out] mtr mini-transaction
+@return the block
+@retval nullptr if an S-latch cannot be granted immediately */
+TRANSACTIONAL_TARGET
+buf_block_t *buf_page_try_get(const page_id_t page_id, mtr_t *mtr)
{
ut_ad(mtr);
ut_ad(mtr->is_active());
+ buf_pool_t::hash_chain &chain= buf_pool.page_hash.cell_get(page_id.fold());
+ buf_block_t *block;
- page_hash_latch *hash_lock;
- buf_page_t *bpage= buf_pool.page_hash_get_locked<false>(page_id,
- page_id.fold(),
- &hash_lock);
- if (!bpage)
- return nullptr;
- if (bpage->state() != BUF_BLOCK_FILE_PAGE)
{
- hash_lock->read_unlock();
- return nullptr;
- }
-
- buf_block_t *block= reinterpret_cast<buf_block_t*>(bpage);
- buf_block_buf_fix_inc(block, file, line);
- hash_lock->read_unlock();
-
- mtr_memo_type_t fix_type= MTR_MEMO_PAGE_S_FIX;
- if (!rw_lock_s_lock_nowait(&block->lock, file, line))
- {
- /* Let us try to get an X-latch. If the current thread
- is holding an X-latch on the page, we cannot get an S-latch. */
- fix_type= MTR_MEMO_PAGE_X_FIX;
- if (!rw_lock_x_lock_func_nowait_inline(&block->lock, file, line))
- {
- buf_block_buf_fix_dec(block);
+ transactional_shared_lock_guard<page_hash_latch> g
+ {buf_pool.page_hash.lock_get(chain)};
+ block= reinterpret_cast<buf_block_t*>
+ (buf_pool.page_hash.get(page_id, chain));
+ if (!block || !block->page.frame || !block->page.lock.s_lock_try())
return nullptr;
- }
}
- mtr_memo_push(mtr, block, fix_type);
+ block->page.fix();
+ ut_ad(!block->page.is_read_fixed());
+ mtr->memo_push(block, MTR_MEMO_PAGE_S_FIX);
#ifdef UNIV_DEBUG
if (!(++buf_dbg_counter % 5771)) buf_pool.validate();
#endif /* UNIV_DEBUG */
- ut_ad(bpage->buf_fix_count());
- ut_ad(bpage->state() == BUF_BLOCK_FILE_PAGE);
- ut_ad(bpage->id() == page_id);
- buf_block_dbg_add_level(block, SYNC_NO_ORDER_CHECK);
+ ut_ad(block->page.buf_fix_count());
+ ut_ad(block->page.id() == page_id);
- buf_pool.stat.n_page_gets++;
+ ++buf_pool.stat.n_page_gets;
return block;
}
@@ -3407,121 +3126,134 @@ buf_page_try_get_func(
void buf_block_t::initialise(const page_id_t page_id, ulint zip_size,
uint32_t fix)
{
- ut_ad(page.state() != BUF_BLOCK_FILE_PAGE);
+ ut_ad(!page.in_file());
buf_block_init_low(this);
- page.init(page_id, fix);
+ page.init(fix, page_id);
page_zip_set_size(&page.zip, zip_size);
}
-/** Initialize a page in the buffer pool. The page is usually not read
-from a file even if it cannot be found in the buffer buf_pool. This is one
-of the functions which perform to a block a state transition NOT_USED =>
-FILE_PAGE (the other is buf_page_get_gen).
-@param[in,out] space space object
-@param[in] offset offset of the tablespace
-@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
-@param[in,out] mtr mini-transaction
-@param[in,out] free_block pre-allocated buffer block
-@return pointer to the block, page bufferfixed */
-buf_block_t*
-buf_page_create(fil_space_t *space, uint32_t offset,
- ulint zip_size, mtr_t *mtr, buf_block_t *free_block)
+TRANSACTIONAL_TARGET
+static buf_block_t *buf_page_create_low(page_id_t page_id, ulint zip_size,
+ mtr_t *mtr, buf_block_t *free_block)
{
- page_id_t page_id(space->id, offset);
ut_ad(mtr->is_active());
ut_ad(page_id.space() != 0 || !zip_size);
- space->free_page(offset, false);
- free_block->initialise(page_id, zip_size, 1);
+ free_block->initialise(page_id, zip_size, buf_page_t::MEMORY);
- const ulint fold= page_id.fold();
+ buf_pool_t::hash_chain &chain= buf_pool.page_hash.cell_get(page_id.fold());
+retry:
mysql_mutex_lock(&buf_pool.mutex);
-loop:
- buf_block_t *block= reinterpret_cast<buf_block_t*>
- (buf_pool.page_hash_get_low(page_id, fold));
+ buf_page_t *bpage= buf_pool.page_hash.get(page_id, chain);
- if (block && block->page.in_file() &&
- !buf_pool.watch_is_sentinel(block->page))
+ if (bpage && !buf_pool.watch_is_sentinel(*bpage))
{
#ifdef BTR_CUR_HASH_ADAPT
const dict_index_t *drop_hash_entry= nullptr;
#endif
- switch (UNIV_EXPECT(block->page.state(), BUF_BLOCK_FILE_PAGE)) {
- default:
- ut_ad(0);
- break;
- case BUF_BLOCK_FILE_PAGE:
- if (!mtr->have_x_latch(*block))
+ bool ibuf_exist= false;
+
+ if (!mtr->have_x_latch(reinterpret_cast<const buf_block_t&>(*bpage)))
+ {
+ const bool got= bpage->lock.x_lock_try();
+ if (!got)
{
- buf_block_buf_fix_inc(block, __FILE__, __LINE__);
- while (!rw_lock_x_lock_nowait(&block->lock))
+ mysql_mutex_unlock(&buf_pool.mutex);
+ bpage->lock.x_lock();
+ const page_id_t id{bpage->id()};
+ if (UNIV_UNLIKELY(id != page_id))
{
- /* Wait for buf_page_write_complete() to release block->lock.
- We must not hold buf_pool.mutex while waiting. */
- timespec abstime;
- set_timespec_nsec(abstime, 1000000);
- my_cond_timedwait(&buf_pool.done_flush_list, &buf_pool.mutex.m_mutex,
- &abstime);
+ ut_ad(id.is_corrupted());
+ bpage->lock.x_unlock();
+ goto retry;
}
- mtr_memo_push(mtr, block, MTR_MEMO_PAGE_X_FIX);
+ mysql_mutex_lock(&buf_pool.mutex);
}
+
+ auto state= bpage->fix();
+ ut_ad(state >= buf_page_t::FREED);
+ ut_ad(state < buf_page_t::READ_FIX);
+
+ if (state < buf_page_t::UNFIXED)
+ bpage->set_reinit(buf_page_t::FREED);
else
{
- ut_ad(!block->page.ibuf_exist);
-#ifdef BTR_CUR_HASH_ADAPT
- ut_ad(!block->index);
-#endif
+ bpage->set_reinit(state & buf_page_t::LRU_MASK);
+ ibuf_exist= (state & buf_page_t::LRU_MASK) == buf_page_t::IBUF_EXIST;
}
+
+ if (UNIV_LIKELY(bpage->frame != nullptr))
+ {
+ mysql_mutex_unlock(&buf_pool.mutex);
+ buf_block_t *block= reinterpret_cast<buf_block_t*>(bpage);
+ mtr->memo_push(block, MTR_MEMO_PAGE_X_FIX);
#ifdef BTR_CUR_HASH_ADAPT
- drop_hash_entry= block->index;
+ drop_hash_entry= block->index;
#endif
- break;
- case BUF_BLOCK_ZIP_PAGE:
- page_hash_latch *hash_lock= buf_pool.page_hash.lock_get(fold);
- hash_lock->write_lock();
- if (block->page.io_fix() != BUF_IO_NONE)
- {
- hash_lock->write_unlock();
- /* Wait for buf_page_write_complete() to release the I/O fix. */
- timespec abstime;
- set_timespec_nsec(abstime, 1000000);
- my_cond_timedwait(&buf_pool.done_flush_list, &buf_pool.mutex.m_mutex,
- &abstime);
- goto loop;
}
+ else
+ {
+ auto state= bpage->state();
+ ut_ad(state >= buf_page_t::FREED);
+ ut_ad(state < buf_page_t::READ_FIX);
+
+ page_hash_latch &hash_lock= buf_pool.page_hash.lock_get(chain);
+ /* It does not make sense to use transactional_lock_guard here,
+ because buf_relocate() would likely make the memory transaction
+ too large. */
+ hash_lock.lock();
+
+ if (state < buf_page_t::UNFIXED)
+ bpage->set_reinit(buf_page_t::FREED);
+ else
+ {
+ bpage->set_reinit(state & buf_page_t::LRU_MASK);
+ ibuf_exist= (state & buf_page_t::LRU_MASK) == buf_page_t::IBUF_EXIST;
+ }
- rw_lock_x_lock(&free_block->lock);
- mysql_mutex_lock(&buf_pool.flush_list_mutex);
- buf_relocate(&block->page, &free_block->page);
- buf_flush_relocate_on_flush_list(&block->page, &free_block->page);
- mysql_mutex_unlock(&buf_pool.flush_list_mutex);
-
- free_block->page.set_state(BUF_BLOCK_FILE_PAGE);
- buf_unzip_LRU_add_block(free_block, FALSE);
- hash_lock->write_unlock();
- buf_page_free_descriptor(&block->page);
- block= free_block;
- buf_block_buf_fix_inc(block, __FILE__, __LINE__);
- mtr_memo_push(mtr, block, MTR_MEMO_PAGE_X_FIX);
- break;
- }
+ mysql_mutex_lock(&buf_pool.flush_list_mutex);
+ buf_relocate(bpage, &free_block->page);
+ free_block->page.lock.x_lock();
+ buf_flush_relocate_on_flush_list(bpage, &free_block->page);
+ mysql_mutex_unlock(&buf_pool.flush_list_mutex);
- mysql_mutex_unlock(&buf_pool.mutex);
+ buf_unzip_LRU_add_block(free_block, FALSE);
+
+ mysql_mutex_unlock(&buf_pool.mutex);
+ hash_lock.unlock();
+#if defined SUX_LOCK_GENERIC || defined UNIV_DEBUG
+ bpage->lock.x_unlock();
+ bpage->lock.free();
+#endif
+ ut_free(bpage);
+ mtr->memo_push(free_block, MTR_MEMO_PAGE_X_FIX);
+ bpage= &free_block->page;
+ }
+ }
+ else
+ {
+ mysql_mutex_unlock(&buf_pool.mutex);
+ ut_ad(bpage->frame);
+#ifdef BTR_CUR_HASH_ADAPT
+ ut_ad(!reinterpret_cast<buf_block_t*>(bpage)->index);
+#endif
+ const auto state= bpage->state();
+ ut_ad(state >= buf_page_t::FREED);
+ bpage->set_reinit(state < buf_page_t::UNFIXED ? buf_page_t::FREED
+ : state & buf_page_t::LRU_MASK);
+ }
#ifdef BTR_CUR_HASH_ADAPT
if (drop_hash_entry)
- btr_search_drop_page_hash_index(block, false);
+ btr_search_drop_page_hash_index(reinterpret_cast<buf_block_t*>(bpage),
+ false);
#endif /* BTR_CUR_HASH_ADAPT */
- if (block->page.ibuf_exist)
- {
- if (!recv_recovery_is_on())
- ibuf_merge_or_delete_for_page(nullptr, page_id, zip_size);
- block->page.ibuf_exist= false;
- }
+ if (ibuf_exist && !recv_recovery_is_on())
+ ibuf_merge_or_delete_for_page(nullptr, page_id, zip_size);
- return block;
+ return reinterpret_cast<buf_block_t*>(bpage);
}
/* If we get here, the page was not in buf_pool: init it there */
@@ -3529,52 +3261,37 @@ loop:
DBUG_PRINT("ib_buf", ("create page %u:%u",
page_id.space(), page_id.page_no()));
- block= free_block;
+ bpage= &free_block->page;
- /* Duplicate buf_block_buf_fix_inc_func() */
- ut_ad(block->page.buf_fix_count() == 1);
- ut_ad(fsp_is_system_temporary(page_id.space()) ||
- rw_lock_s_lock_nowait(block->debug_latch, __FILE__, __LINE__));
+ ut_ad(bpage->state() == buf_page_t::MEMORY);
+ bpage->lock.x_lock();
/* The block must be put to the LRU list */
- buf_LRU_add_block(&block->page, false);
- page_hash_latch *hash_lock= buf_pool.page_hash.lock_get(fold);
- hash_lock->write_lock();
- block->page.set_state(BUF_BLOCK_FILE_PAGE);
- ut_d(block->page.in_page_hash= true);
- HASH_INSERT(buf_page_t, hash, &buf_pool.page_hash, fold, &block->page);
-
- rw_lock_x_lock(&block->lock);
+ buf_LRU_add_block(bpage, false);
+ {
+ transactional_lock_guard<page_hash_latch> g
+ {buf_pool.page_hash.lock_get(chain)};
+ bpage->set_state(buf_page_t::REINIT + 1);
+ buf_pool.page_hash.append(chain, bpage);
+ }
+
if (UNIV_UNLIKELY(zip_size))
{
- /* Prevent race conditions during buf_buddy_alloc(), which may
- release and reacquire buf_pool.mutex, by IO-fixing and X-latching
- the block. */
- block->page.set_io_fix(BUF_IO_READ);
- hash_lock->write_unlock();
-
- /* buf_pool.mutex may be released and reacquired by
- buf_buddy_alloc(). We must defer this operation until
- after the block descriptor has been added to
- buf_pool.LRU and buf_pool.page_hash. */
- block->page.zip.data= buf_buddy_alloc(zip_size);
+ bpage->zip.data= buf_buddy_alloc(zip_size);
/* To maintain the invariant block->in_unzip_LRU_list ==
block->page.belongs_to_unzip_LRU() we have to add this
block to unzip_LRU after block->page.zip.data is set. */
- ut_ad(block->page.belongs_to_unzip_LRU());
- buf_unzip_LRU_add_block(block, FALSE);
-
- block->page.set_io_fix(BUF_IO_NONE);
+ ut_ad(bpage->belongs_to_unzip_LRU());
+ buf_unzip_LRU_add_block(reinterpret_cast<buf_block_t*>(bpage), FALSE);
}
- else
- hash_lock->write_unlock();
+ buf_pool.stat.n_pages_created++;
mysql_mutex_unlock(&buf_pool.mutex);
- mtr->memo_push(block, MTR_MEMO_PAGE_X_FIX);
- block->page.set_accessed();
- buf_pool.stat.n_pages_created++;
+ mtr->memo_push(reinterpret_cast<buf_block_t*>(bpage), MTR_MEMO_PAGE_X_FIX);
+
+ bpage->set_accessed();
/* Delete possible entries for the page from the insert buffer:
such can exist if the page belonged to an index which was dropped */
@@ -3584,8 +3301,8 @@ loop:
ibuf_merge_or_delete_for_page(nullptr, page_id, zip_size);
static_assert(FIL_PAGE_PREV + 4 == FIL_PAGE_NEXT, "adjacent");
- memset_aligned<8>(block->frame + FIL_PAGE_PREV, 0xff, 8);
- mach_write_to_2(block->frame + FIL_PAGE_TYPE, FIL_PAGE_TYPE_ALLOCATED);
+ memset_aligned<8>(bpage->frame + FIL_PAGE_PREV, 0xff, 8);
+ mach_write_to_2(bpage->frame + FIL_PAGE_TYPE, FIL_PAGE_TYPE_ALLOCATED);
/* FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION is only used on the
following pages:
@@ -3593,30 +3310,57 @@ loop:
(2) FIL_RTREE_SPLIT_SEQ_NUM on R-tree pages
(3) key_version on encrypted pages (not page 0:0) */
- memset(block->frame + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION, 0, 8);
- memset_aligned<8>(block->frame + FIL_PAGE_LSN, 0, 8);
+ memset(bpage->frame + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION, 0, 8);
+ memset_aligned<8>(bpage->frame + FIL_PAGE_LSN, 0, 8);
#ifdef UNIV_DEBUG
if (!(++buf_dbg_counter % 5771)) buf_pool.validate();
#endif /* UNIV_DEBUG */
- return block;
+ return reinterpret_cast<buf_block_t*>(bpage);
+}
+
+/** Initialize a page in the buffer pool. The page is usually not read
+from a file even if it cannot be found in the buffer buf_pool. This is one
+of the functions which perform to a block a state transition NOT_USED =>
+FILE_PAGE (the other is buf_page_get_gen).
+@param[in,out] space space object
+@param[in] offset offset of the tablespace
+ or deferred space id if space
+ object is null
+@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
+@param[in,out] mtr mini-transaction
+@param[in,out] free_block pre-allocated buffer block
+@return pointer to the block, page bufferfixed */
+buf_block_t*
+buf_page_create(fil_space_t *space, uint32_t offset,
+ ulint zip_size, mtr_t *mtr, buf_block_t *free_block)
+{
+ space->free_page(offset, false);
+ return buf_page_create_low({space->id, offset}, zip_size, mtr, free_block);
+}
+
+/** Initialize a page in buffer pool while initializing the
+deferred tablespace
+@param space_id space identfier
+@param zip_size ROW_FORMAT=COMPRESSED page size or 0
+@param mtr mini-transaction
+@param free_block pre-allocated buffer block
+@return pointer to the block, page bufferfixed */
+buf_block_t* buf_page_create_deferred(uint32_t space_id, ulint zip_size,
+ mtr_t *mtr, buf_block_t *free_block)
+{
+ return buf_page_create_low({space_id, 0}, zip_size, mtr, free_block);
}
/** Monitor the buffer page read/write activity, and increment corresponding
counter value in MONITOR_MODULE_BUF_PAGE.
@param bpage buffer page whose read or write was completed
-@param io_type BUF_IO_READ or BUF_IO_WRITE */
-ATTRIBUTE_COLD __attribute__((nonnull))
-void buf_page_monitor(const buf_page_t *bpage, buf_io_fix io_type)
+@param read true=read, false=write */
+ATTRIBUTE_COLD void buf_page_monitor(const buf_page_t &bpage, bool read)
{
- const byte* frame;
monitor_id_t counter;
- ut_ad(io_type == BUF_IO_READ || io_type == BUF_IO_WRITE);
-
- frame = bpage->zip.data
- ? bpage->zip.data
- : ((buf_block_t*) bpage)->frame;
+ const byte* frame = bpage.zip.data ? bpage.zip.data : bpage.frame;
switch (fil_page_get_type(frame)) {
ulint level;
@@ -3631,135 +3375,74 @@ void buf_page_monitor(const buf_page_t *bpage, buf_io_fix io_type)
== (index_id_t)(DICT_IBUF_ID_MIN + IBUF_SPACE_ID)) {
if (level == 0) {
counter = MONITOR_RW_COUNTER(
- io_type, MONITOR_INDEX_IBUF_LEAF_PAGE);
+ read, MONITOR_INDEX_IBUF_LEAF_PAGE);
} else {
counter = MONITOR_RW_COUNTER(
- io_type,
+ read,
MONITOR_INDEX_IBUF_NON_LEAF_PAGE);
}
} else {
if (level == 0) {
counter = MONITOR_RW_COUNTER(
- io_type, MONITOR_INDEX_LEAF_PAGE);
+ read, MONITOR_INDEX_LEAF_PAGE);
} else {
counter = MONITOR_RW_COUNTER(
- io_type, MONITOR_INDEX_NON_LEAF_PAGE);
+ read, MONITOR_INDEX_NON_LEAF_PAGE);
}
}
break;
case FIL_PAGE_UNDO_LOG:
- counter = MONITOR_RW_COUNTER(io_type, MONITOR_UNDO_LOG_PAGE);
+ counter = MONITOR_RW_COUNTER(read, MONITOR_UNDO_LOG_PAGE);
break;
case FIL_PAGE_INODE:
- counter = MONITOR_RW_COUNTER(io_type, MONITOR_INODE_PAGE);
+ counter = MONITOR_RW_COUNTER(read, MONITOR_INODE_PAGE);
break;
case FIL_PAGE_IBUF_FREE_LIST:
- counter = MONITOR_RW_COUNTER(io_type,
- MONITOR_IBUF_FREELIST_PAGE);
+ counter = MONITOR_RW_COUNTER(read, MONITOR_IBUF_FREELIST_PAGE);
break;
case FIL_PAGE_IBUF_BITMAP:
- counter = MONITOR_RW_COUNTER(io_type,
- MONITOR_IBUF_BITMAP_PAGE);
+ counter = MONITOR_RW_COUNTER(read, MONITOR_IBUF_BITMAP_PAGE);
break;
case FIL_PAGE_TYPE_SYS:
- counter = MONITOR_RW_COUNTER(io_type, MONITOR_SYSTEM_PAGE);
+ counter = MONITOR_RW_COUNTER(read, MONITOR_SYSTEM_PAGE);
break;
case FIL_PAGE_TYPE_TRX_SYS:
- counter = MONITOR_RW_COUNTER(io_type, MONITOR_TRX_SYSTEM_PAGE);
+ counter = MONITOR_RW_COUNTER(read, MONITOR_TRX_SYSTEM_PAGE);
break;
case FIL_PAGE_TYPE_FSP_HDR:
- counter = MONITOR_RW_COUNTER(io_type, MONITOR_FSP_HDR_PAGE);
+ counter = MONITOR_RW_COUNTER(read, MONITOR_FSP_HDR_PAGE);
break;
case FIL_PAGE_TYPE_XDES:
- counter = MONITOR_RW_COUNTER(io_type, MONITOR_XDES_PAGE);
+ counter = MONITOR_RW_COUNTER(read, MONITOR_XDES_PAGE);
break;
case FIL_PAGE_TYPE_BLOB:
- counter = MONITOR_RW_COUNTER(io_type, MONITOR_BLOB_PAGE);
+ counter = MONITOR_RW_COUNTER(read, MONITOR_BLOB_PAGE);
break;
case FIL_PAGE_TYPE_ZBLOB:
- counter = MONITOR_RW_COUNTER(io_type, MONITOR_ZBLOB_PAGE);
+ counter = MONITOR_RW_COUNTER(read, MONITOR_ZBLOB_PAGE);
break;
case FIL_PAGE_TYPE_ZBLOB2:
- counter = MONITOR_RW_COUNTER(io_type, MONITOR_ZBLOB2_PAGE);
+ counter = MONITOR_RW_COUNTER(read, MONITOR_ZBLOB2_PAGE);
break;
default:
- counter = MONITOR_RW_COUNTER(io_type, MONITOR_OTHER_PAGE);
+ counter = MONITOR_RW_COUNTER(read, MONITOR_OTHER_PAGE);
}
MONITOR_INC_NOCHECK(counter);
}
-/** Mark a table corrupted.
-@param[in] bpage corrupted page
-@param[in] space tablespace of the corrupted page */
-ATTRIBUTE_COLD
-static void buf_mark_space_corrupt(buf_page_t* bpage, const fil_space_t& space)
-{
- /* If block is not encrypted find the table with specified
- space id, and mark it corrupted. Encrypted tables
- are marked unusable later e.g. in ::open(). */
- if (!space.crypt_data
- || space.crypt_data->type == CRYPT_SCHEME_UNENCRYPTED) {
- dict_set_corrupted_by_space(&space);
- } else {
- dict_set_encrypted_by_space(&space);
- }
-}
-
-/** Release and evict a corrupted page.
-@param bpage page that was being read */
-ATTRIBUTE_COLD void buf_pool_t::corrupted_evict(buf_page_t *bpage)
-{
- const page_id_t id(bpage->id());
- page_hash_latch *hash_lock= hash_lock_get(id);
-
- mysql_mutex_lock(&mutex);
- hash_lock->write_lock();
-
- ut_ad(bpage->io_fix() == BUF_IO_READ);
- ut_ad(!bpage->oldest_modification());
- bpage->set_corrupt_id();
-
- if (bpage->state() == BUF_BLOCK_FILE_PAGE)
- rw_lock_x_unlock_gen(&reinterpret_cast<buf_block_t*>(bpage)->lock,
- BUF_IO_READ);
-
- bpage->io_unfix();
-
- /* remove from LRU and page_hash */
- buf_LRU_free_one_page(bpage, id, hash_lock);
- mysql_mutex_unlock(&mutex);
-
- ut_d(auto n=) n_pend_reads--;
- ut_ad(n > 0);
-}
-
-/** Mark a table corrupted.
-@param[in] bpage Corrupted page
-@param[in] node data file
-Also remove the bpage from LRU list. */
-ATTRIBUTE_COLD
-static void buf_corrupt_page_release(buf_page_t *bpage, const fil_node_t &node)
-{
- ut_ad(bpage->id().space() == node.space->id);
- buf_pool.corrupted_evict(bpage);
-
- if (!srv_force_recovery)
- buf_mark_space_corrupt(bpage, *node.space);
-}
-
/** Check if the encrypted page is corrupted for the full crc32 format.
@param[in] space_id page belongs to space id
@param[in] d page
@@ -3788,15 +3471,13 @@ or decrypt/decompress just failed.
@retval DB_SUCCESS if page has been read and is not corrupted
@retval DB_PAGE_CORRUPTED if page based on checksum check is corrupted
@retval DB_DECRYPTION_FAILED if page post encryption checksum matches but
-after decryption normal page checksum does not match.
-@retval DB_TABLESPACE_DELETED if accessed tablespace is not found */
+after decryption normal page checksum does not match. */
static dberr_t buf_page_check_corrupt(buf_page_t *bpage,
const fil_node_t &node)
{
ut_ad(node.space->referenced());
- byte* dst_frame = (bpage->zip.data) ? bpage->zip.data :
- ((buf_block_t*) bpage)->frame;
+ byte* dst_frame = bpage->zip.data ? bpage->zip.data : bpage->frame;
dberr_t err = DB_SUCCESS;
uint key_version = buf_page_get_key_version(dst_frame,
node.space->flags);
@@ -3836,129 +3517,103 @@ static dberr_t buf_page_check_corrupt(buf_page_t *bpage,
ib::error()
<< "The page " << bpage->id()
<< " in file '" << node.name
- << "' cannot be decrypted.";
-
- ib::info()
- << "However key management plugin or used key_version "
- << key_version
- << " is not found or"
- " used encryption algorithm or method does not match.";
-
- if (bpage->id().space() != TRX_SYS_SPACE) {
- ib::info()
- << "Marking tablespace as missing."
- " You may drop this table or"
- " install correct key management plugin"
- " and key file.";
- }
+ << "' cannot be decrypted; key_version="
+ << key_version;
}
return (err);
}
-/** Complete a read request of a file page to buf_pool.
-@param bpage recently read page
+/** Complete a read of a page.
@param node data file
@return whether the operation succeeded
-@retval DB_SUCCESS always when writing, or if a read page was OK
-@retval DB_PAGE_CORRUPTED if the checksum fails on a page read
-@retval DB_DECRYPTION_FAILED if the page cannot be decrypted */
-dberr_t buf_page_read_complete(buf_page_t *bpage, const fil_node_t &node)
+@retval DB_PAGE_CORRUPTED if the checksum fails
+@retval DB_DECRYPTION_FAILED if the page cannot be decrypted
+@retval DB_FAIL if the page contains the wrong ID */
+dberr_t buf_page_t::read_complete(const fil_node_t &node)
{
- const page_id_t id(bpage->id());
- ut_ad(bpage->in_file());
- ut_ad(!buf_dblwr.is_inside(id));
- ut_ad(id.space() == node.space->id);
- ut_ad(bpage->zip_size() == node.space->zip_size());
-
- /* We do not need protect io_fix here by mutex to read it because
- this and buf_page_write_complete() are the only functions where we can
- change the value from BUF_IO_READ or BUF_IO_WRITE to some other
- value, and our code ensures that this is the only thread that handles
- the i/o for this block. */
-
- ut_ad(bpage->io_fix() == BUF_IO_READ);
- ut_ad(!!bpage->zip.ssize == !!bpage->zip.data);
- ut_ad(bpage->state() == BUF_BLOCK_FILE_PAGE || bpage->zip.data);
-
- const byte *frame= bpage->zip.data
- ? bpage->zip.data
- : reinterpret_cast<buf_block_t*>(bpage)->frame;
- ut_ad(frame);
+ const page_id_t expected_id{id()};
+ ut_ad(is_read_fixed());
+ ut_ad(!buf_dblwr.is_inside(id()));
+ ut_ad(id().space() == node.space->id);
+ ut_ad(zip_size() == node.space->zip_size());
+ ut_ad(!!zip.ssize == !!zip.data);
+
+ const byte *read_frame= zip.data ? zip.data : frame;
+ ut_ad(read_frame);
dberr_t err;
- if (!buf_page_decrypt_after_read(bpage, node))
+ if (!buf_page_decrypt_after_read(this, node))
{
err= DB_DECRYPTION_FAILED;
goto database_corrupted;
}
- if (bpage->zip.data && bpage->state() == BUF_BLOCK_FILE_PAGE)
+ if (belongs_to_unzip_LRU())
{
buf_pool.n_pend_unzip++;
- auto ok= buf_zip_decompress(reinterpret_cast<buf_block_t*>(bpage), FALSE);
+ auto ok= buf_zip_decompress(reinterpret_cast<buf_block_t*>(this), false);
buf_pool.n_pend_unzip--;
if (!ok)
{
- ib::info() << "Page " << id << " zip_decompress failure.";
+ ib::info() << "Page " << expected_id << " zip_decompress failure.";
err= DB_PAGE_CORRUPTED;
goto database_corrupted;
}
}
{
- const page_id_t read_id(mach_read_from_4(frame + FIL_PAGE_SPACE_ID),
- mach_read_from_4(frame + FIL_PAGE_OFFSET));
+ const page_id_t read_id(mach_read_from_4(read_frame + FIL_PAGE_SPACE_ID),
+ mach_read_from_4(read_frame + FIL_PAGE_OFFSET));
- if (read_id == id);
+ if (read_id == expected_id);
else if (read_id == page_id_t(0, 0))
- /* This is likely an uninitialized page. */;
+ {
+ /* This is likely an uninitialized (all-zero) page. */
+ err= DB_FAIL;
+ goto release_page;
+ }
else if (!node.space->full_crc32() &&
- page_id_t(0, read_id.page_no()) == id)
+ page_id_t(0, read_id.page_no()) == expected_id)
/* FIL_PAGE_SPACE_ID was written as garbage in the system tablespace
before MySQL 4.1.1, which introduced innodb_file_per_table. */;
else if (node.space->full_crc32() &&
*reinterpret_cast<const uint32_t*>
- (&frame[FIL_PAGE_FCRC32_KEY_VERSION]) &&
+ (&read_frame[FIL_PAGE_FCRC32_KEY_VERSION]) &&
node.space->crypt_data &&
node.space->crypt_data->type != CRYPT_SCHEME_UNENCRYPTED)
{
- ib::error() << "Cannot decrypt " << id;
+ ib::error() << "Cannot decrypt " << expected_id;
err= DB_DECRYPTION_FAILED;
goto release_page;
}
else
+ {
ib::error() << "Space id and page no stored in the page, read in are "
- << read_id << ", should be " << id;
+ << read_id << ", should be " << expected_id;
+ err= DB_PAGE_CORRUPTED;
+ goto release_page;
+ }
}
- err= buf_page_check_corrupt(bpage, node);
+ err= buf_page_check_corrupt(this, node);
if (UNIV_UNLIKELY(err != DB_SUCCESS))
{
database_corrupted:
- /* Not a real corruption if it was triggered by error injection */
- DBUG_EXECUTE_IF("buf_page_import_corrupt_failure",
- if (!is_predefined_tablespace(id.space()))
- {
- buf_corrupt_page_release(bpage, node);
- ib::info() << "Simulated IMPORT corruption";
- return err;
- }
- err= DB_SUCCESS;
- goto page_not_corrupt;);
-
- if (bpage->zip.data && bpage->state() == BUF_BLOCK_FILE_PAGE)
- memset(reinterpret_cast<buf_block_t*>(bpage)->frame, 0, srv_page_size);
+ if (belongs_to_unzip_LRU())
+ memset_aligned<UNIV_PAGE_SIZE_MIN>(frame, 0, srv_page_size);
if (err == DB_PAGE_CORRUPTED)
{
ib::error() << "Database page corruption on disk"
" or a failed read of file '"
- << node.name << "' page " << id
+ << node.name << "' page " << expected_id
<< ". You may have to recover from a backup.";
- buf_page_print(frame, bpage->zip_size());
+ buf_page_print(read_frame, zip_size());
+
+ node.space->set_corrupted();
ib::info() << " You can use CHECK TABLE to scan"
" your table for corruption. "
@@ -3966,52 +3621,42 @@ database_corrupted:
}
if (!srv_force_recovery)
- {
- /* If the corruption is in the system tablespace, we will
- intentionally crash the server. */
- if (id.space() == TRX_SYS_SPACE)
- ib::fatal() << "Aborting because of a corrupt database page.";
- buf_corrupt_page_release(bpage, node);
- return err;
- }
+ goto release_page;
}
- DBUG_EXECUTE_IF("buf_page_import_corrupt_failure",
- page_not_corrupt: bpage= bpage; );
-
if (err == DB_PAGE_CORRUPTED || err == DB_DECRYPTION_FAILED)
{
release_page:
- buf_corrupt_page_release(bpage, node);
- if (recv_recovery_is_on())
- recv_sys.free_corrupted_page(id);
+ buf_pool.corrupted_evict(this, buf_page_t::READ_FIX);
return err;
}
- if (recv_recovery_is_on())
- recv_recover_page(node.space, bpage);
+ const bool recovery= recv_recovery_is_on();
- if (bpage->state() == BUF_BLOCK_FILE_PAGE && !recv_no_ibuf_operations &&
- (!id.space() || !is_predefined_tablespace(id.space())) &&
- fil_page_get_type(frame) == FIL_PAGE_INDEX &&
- page_is_leaf(frame))
- bpage->ibuf_exist= true;
+ if (recovery && !recv_recover_page(node.space, this))
+ return DB_PAGE_CORRUPTED;
+
+ const bool ibuf_may_exist= frame && !recv_no_ibuf_operations &&
+ (!expected_id.space() || !is_predefined_tablespace(expected_id.space())) &&
+ fil_page_get_type(read_frame) == FIL_PAGE_INDEX &&
+ page_is_leaf(read_frame);
if (UNIV_UNLIKELY(MONITOR_IS_ON(MONITOR_MODULE_BUF_PAGE)))
- buf_page_monitor(bpage, BUF_IO_READ);
- DBUG_PRINT("ib_buf", ("read page %u:%u",
- id.space(), id.page_no()));
+ buf_page_monitor(*this, true);
+ DBUG_PRINT("ib_buf", ("read page %u:%u", id().space(), id().page_no()));
- /* Because this thread which does the unlocking might not be the same that
- did the locking, we use a pass value != 0 in unlock, which simply
- removes the newest lock debug record, without checking the thread id. */
- if (bpage->state() == BUF_BLOCK_FILE_PAGE)
- rw_lock_x_unlock_gen(&((buf_block_t*) bpage)->lock, BUF_IO_READ);
- bpage->io_unfix();
+ if (!recovery)
+ {
+ ut_d(auto f=) zip.fix.fetch_sub(ibuf_may_exist
+ ? READ_FIX - IBUF_EXIST
+ : READ_FIX - UNFIXED);
+ ut_ad(f >= READ_FIX);
+ ut_ad(f < WRITE_FIX);
+ }
+ else if (ibuf_may_exist)
+ set_ibuf_exist();
- ut_d(auto n=) buf_pool.n_pend_reads--;
- ut_ad(n > 0);
- buf_pool.stat.n_pages_read++;
+ lock.x_unlock(true);
return DB_SUCCESS;
}
@@ -4044,9 +3689,6 @@ void buf_pool_invalidate()
{
mysql_mutex_lock(&buf_pool.mutex);
- buf_flush_wait_batch_end(true);
- buf_flush_wait_batch_end(false);
-
/* It is possible that a write batch that has been posted
earlier is still not complete. For buffer pool invalidation to
proceed we must ensure there is NO write activity happening. */
@@ -4055,16 +3697,17 @@ void buf_pool_invalidate()
ut_d(buf_pool.assert_all_freed());
ut_d(mysql_mutex_lock(&buf_pool.mutex));
- while (buf_LRU_scan_and_free_block());
+ while (UT_LIST_GET_LEN(buf_pool.LRU)) {
+ buf_LRU_scan_and_free_block();
+ }
- ut_ad(UT_LIST_GET_LEN(buf_pool.LRU) == 0);
ut_ad(UT_LIST_GET_LEN(buf_pool.unzip_LRU) == 0);
buf_pool.freed_page_clock = 0;
buf_pool.LRU_old = NULL;
buf_pool.LRU_old_len = 0;
+ buf_pool.stat.init();
- memset(&buf_pool.stat, 0x00, sizeof(buf_pool.stat));
buf_refresh_io_stats();
mysql_mutex_unlock(&buf_pool.mutex);
}
@@ -4085,34 +3728,34 @@ void buf_pool_t::validate()
/* Check the uncompressed blocks. */
for (auto i = n_chunks; i--; chunk++) {
-
- ulint j;
buf_block_t* block = chunk->blocks;
- for (j = chunk->size; j--; block++) {
- switch (block->page.state()) {
- case BUF_BLOCK_ZIP_PAGE:
- /* This kind of block descriptors should
- be allocated by malloc() only. */
- ut_error;
- break;
-
- case BUF_BLOCK_NOT_USED:
+ for (auto j = chunk->size; j--; block++) {
+ ut_ad(block->page.frame);
+ switch (const auto f = block->page.state()) {
+ case buf_page_t::NOT_USED:
n_free++;
break;
- case BUF_BLOCK_MEMORY:
- case BUF_BLOCK_REMOVE_HASH:
+ case buf_page_t::MEMORY:
+ case buf_page_t::REMOVE_HASH:
/* do nothing */
break;
- case BUF_BLOCK_FILE_PAGE:
- const page_id_t id = block->page.id();
- ut_ad(page_hash_get_low(id, id.fold())
+ default:
+ if (f >= buf_page_t::READ_FIX
+ && f < buf_page_t::WRITE_FIX) {
+ /* A read-fixed block is not
+ necessarily in the page_hash yet. */
+ break;
+ }
+ ut_ad(f >= buf_page_t::FREED);
+ const page_id_t id{block->page.id()};
+ ut_ad(page_hash.get(
+ id,
+ page_hash.cell_get(id.fold()))
== &block->page);
n_lru++;
- break;
-
}
}
}
@@ -4122,33 +3765,24 @@ void buf_pool_t::validate()
mysql_mutex_lock(&flush_list_mutex);
for (buf_page_t* b = UT_LIST_GET_FIRST(flush_list); b;
b = UT_LIST_GET_NEXT(list, b)) {
+ ut_ad(b->in_file());
ut_ad(b->oldest_modification());
ut_ad(!fsp_is_system_temporary(b->id().space()));
n_flushing++;
- switch (b->state()) {
- case BUF_BLOCK_ZIP_PAGE:
+ if (UNIV_UNLIKELY(!b->frame)) {
n_lru++;
n_zip++;
- break;
- case BUF_BLOCK_FILE_PAGE:
- /* uncompressed page */
- break;
- case BUF_BLOCK_NOT_USED:
- case BUF_BLOCK_MEMORY:
- case BUF_BLOCK_REMOVE_HASH:
- ut_error;
- break;
}
- const page_id_t id = b->id();
- ut_ad(page_hash_get_low(id, id.fold()) == b);
+ const page_id_t id{b->id()};
+ ut_ad(page_hash.get(id, page_hash.cell_get(id.fold())) == b);
}
ut_ad(UT_LIST_GET_LEN(flush_list) == n_flushing);
mysql_mutex_unlock(&flush_list_mutex);
- if (curr_size == old_size
+ if (n_chunks_new == n_chunks
&& n_lru + n_free > curr_size + n_zip) {
ib::fatal() << "n_LRU " << n_lru << ", n_free " << n_free
@@ -4158,7 +3792,7 @@ void buf_pool_t::validate()
ut_ad(UT_LIST_GET_LEN(LRU) >= n_lru);
- if (curr_size == old_size
+ if (n_chunks_new == n_chunks
&& UT_LIST_GET_LEN(free) != n_free) {
ib::fatal() << "Free list len "
@@ -4204,9 +3838,8 @@ void buf_pool_t::print()
<< ", modified database pages="
<< UT_LIST_GET_LEN(flush_list)
<< ", n pending decompressions=" << n_pend_unzip
- << ", n pending reads=" << n_pend_reads
- << ", n pending flush LRU=" << n_flush_LRU_
- << " list=" << n_flush_list_
+ << ", n pending flush LRU=" << n_flush()
+ << " list=" << os_aio_pending_writes()
<< ", pages made young=" << stat.n_pages_made_young
<< ", not young=" << stat.n_pages_not_made_young
<< ", pages read=" << stat.n_pages_read
@@ -4226,7 +3859,7 @@ void buf_pool_t::print()
ulint n_blocks = chunk->size;
for (; n_blocks--; block++) {
- const buf_frame_t* frame = block->frame;
+ const buf_frame_t* frame = block->page.frame;
if (fil_page_index_page_check(frame)) {
@@ -4288,7 +3921,7 @@ ulint buf_get_latched_pages_number()
for (buf_page_t *b= UT_LIST_GET_FIRST(buf_pool.LRU); b;
b= UT_LIST_GET_NEXT(LRU, b))
- if (b->in_file() && (b->buf_fix_count() || b->io_fix() != BUF_IO_NONE))
+ if (b->state() > buf_page_t::UNFIXED)
fixed_pages_number++;
mysql_mutex_unlock(&buf_pool.mutex);
@@ -4318,13 +3951,13 @@ void buf_stats_get_pool_info(buf_pool_info_t *pool_info)
pool_info->flush_list_len = UT_LIST_GET_LEN(buf_pool.flush_list);
pool_info->n_pend_unzip = UT_LIST_GET_LEN(buf_pool.unzip_LRU);
- mysql_mutex_unlock(&buf_pool.flush_list_mutex);
- pool_info->n_pend_reads = buf_pool.n_pend_reads;
+ pool_info->n_pend_reads = os_aio_pending_reads_approx();
- pool_info->n_pending_flush_lru = buf_pool.n_flush_LRU_;
+ pool_info->n_pending_flush_lru = buf_pool.n_flush();
- pool_info->n_pending_flush_list = buf_pool.n_flush_list_;
+ pool_info->n_pending_flush_list = os_aio_pending_writes();
+ mysql_mutex_unlock(&buf_pool.flush_list_mutex);
current_time = time(NULL);
time_elapsed = 0.001 + difftime(current_time,
diff --git a/storage/innobase/buf/buf0checksum.cc b/storage/innobase/buf/buf0checksum.cc
index e98dc18452e..c9818fa600f 100644
--- a/storage/innobase/buf/buf0checksum.cc
+++ b/storage/innobase/buf/buf0checksum.cc
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2017, 2019, MariaDB Corporation.
+Copyright (c) 2017, 2021, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -33,9 +33,6 @@ Created Aug 11, 2011 Vasil Dimov
#include "srv0srv.h"
#endif /* !UNIV_INNOCHECKSUM */
-/** the value of innodb_checksum_algorithm */
-ulong srv_checksum_algorithm;
-
/** Calculate the CRC32 checksum of a page. The value is stored to the page
when it is written to a file and also checked for a match when reading from
the file. Note that we must be careful to calculate the same value on all
@@ -57,6 +54,7 @@ uint32_t buf_calc_page_crc32(const byte* page)
- (FIL_PAGE_DATA + FIL_PAGE_END_LSN_OLD_CHKSUM));
}
+#ifndef UNIV_INNOCHECKSUM
/** Calculate a checksum which is stored to the page when it is written
to a file. Note that we must be careful to calculate the same value on
32-bit and 64-bit architectures.
@@ -98,32 +96,4 @@ buf_calc_page_old_checksum(const byte* page)
return(static_cast<uint32_t>
(ut_fold_binary(page, FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION)));
}
-
-/** Return a printable string describing the checksum algorithm.
-@param[in] algo algorithm
-@return algorithm name */
-const char*
-buf_checksum_algorithm_name(srv_checksum_algorithm_t algo)
-{
- switch (algo) {
- case SRV_CHECKSUM_ALGORITHM_CRC32:
- return("crc32");
- case SRV_CHECKSUM_ALGORITHM_STRICT_CRC32:
- return("strict_crc32");
- case SRV_CHECKSUM_ALGORITHM_INNODB:
- return("innodb");
- case SRV_CHECKSUM_ALGORITHM_STRICT_INNODB:
- return("strict_innodb");
- case SRV_CHECKSUM_ALGORITHM_NONE:
- return("none");
- case SRV_CHECKSUM_ALGORITHM_STRICT_NONE:
- return("strict_none");
- case SRV_CHECKSUM_ALGORITHM_FULL_CRC32:
- return("full_crc32");
- case SRV_CHECKSUM_ALGORITHM_STRICT_FULL_CRC32:
- return("strict_full_crc32");
- }
-
- ut_error;
- return(NULL);
-}
+#endif /* !UNIV_INNOCHECKSUM */
diff --git a/storage/innobase/buf/buf0dblwr.cc b/storage/innobase/buf/buf0dblwr.cc
index 1d582b6cfbf..c28a9a3337a 100644
--- a/storage/innobase/buf/buf0dblwr.cc
+++ b/storage/innobase/buf/buf0dblwr.cc
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 1995, 2017, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2013, 2021, MariaDB Corporation.
+Copyright (c) 2013, 2022, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -29,7 +29,6 @@ Created 2011/12/19
#include "buf0checksum.h"
#include "srv0start.h"
#include "srv0srv.h"
-#include "sync0sync.h"
#include "page0zip.h"
#include "trx0sys.h"
#include "fil0crypt.h"
@@ -43,13 +42,21 @@ buf_dblwr_t buf_dblwr;
/** @return the TRX_SYS page */
inline buf_block_t *buf_dblwr_trx_sys_get(mtr_t *mtr)
{
- buf_block_t *block= buf_page_get(page_id_t(TRX_SYS_SPACE, TRX_SYS_PAGE_NO),
- 0, RW_X_LATCH, mtr);
- buf_block_dbg_add_level(block, SYNC_NO_ORDER_CHECK);
- return block;
+ return buf_page_get(page_id_t(TRX_SYS_SPACE, TRX_SYS_PAGE_NO),
+ 0, RW_X_LATCH, mtr);
}
-/** Initialize the doublewrite buffer data structure.
+void buf_dblwr_t::init()
+{
+ if (!active_slot)
+ {
+ active_slot= &slots[0];
+ mysql_mutex_init(buf_dblwr_mutex_key, &mutex, nullptr);
+ pthread_cond_init(&cond, nullptr);
+ }
+}
+
+/** Initialise the persistent storage of the doublewrite buffer.
@param header doublewrite page header in the TRX_SYS page */
inline void buf_dblwr_t::init(const byte *header)
{
@@ -57,8 +64,6 @@ inline void buf_dblwr_t::init(const byte *header)
ut_ad(!active_slot->reserved);
ut_ad(!batch_running);
- mysql_mutex_init(buf_dblwr_mutex_key, &mutex, nullptr);
- pthread_cond_init(&cond, nullptr);
block1= page_id_t(0, mach_read_from_4(header + TRX_SYS_DOUBLEWRITE_BLOCK1));
block2= page_id_t(0, mach_read_from_4(header + TRX_SYS_DOUBLEWRITE_BLOCK2));
@@ -77,7 +82,7 @@ inline void buf_dblwr_t::init(const byte *header)
@return whether the operation succeeded */
bool buf_dblwr_t::create()
{
- if (is_initialised())
+ if (is_created())
return true;
mtr_t mtr;
@@ -86,24 +91,31 @@ bool buf_dblwr_t::create()
start_again:
mtr.start();
+ dberr_t err;
buf_block_t *trx_sys_block= buf_dblwr_trx_sys_get(&mtr);
+ if (!trx_sys_block)
+ {
+ mtr.commit();
+ return false;
+ }
if (mach_read_from_4(TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_MAGIC +
- trx_sys_block->frame) == TRX_SYS_DOUBLEWRITE_MAGIC_N)
+ trx_sys_block->page.frame) ==
+ TRX_SYS_DOUBLEWRITE_MAGIC_N)
{
/* The doublewrite buffer has already been created: just read in
some numbers */
- init(TRX_SYS_DOUBLEWRITE + trx_sys_block->frame);
+ init(TRX_SYS_DOUBLEWRITE + trx_sys_block->page.frame);
mtr.commit();
return true;
}
if (UT_LIST_GET_FIRST(fil_system.sys_space->chain)->size < 3 * size)
{
-too_small:
ib::error() << "Cannot create doublewrite buffer: "
"the first file in innodb_data_file_path must be at least "
<< (3 * (size >> (20U - srv_page_size_shift))) << "M.";
+fail:
mtr.commit();
return false;
}
@@ -111,9 +123,13 @@ too_small:
{
buf_block_t *b= fseg_create(fil_system.sys_space,
TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_FSEG,
- &mtr, false, trx_sys_block);
+ &mtr, &err, false, trx_sys_block);
if (!b)
- goto too_small;
+ {
+ ib::error() << "Cannot create doublewrite buffer: " << err;
+ goto fail;
+ }
+
ib::info() << "Doublewrite buffer not found: creating new";
/* FIXME: After this point, the doublewrite buffer creation
@@ -121,19 +137,16 @@ too_small:
the InnoDB system tablespace file in the first place.
It could be located in separate optional file(s) in a
user-specified location. */
-
- /* fseg_create acquires a second latch on the page,
- therefore we must declare it: */
- buf_block_dbg_add_level(b, SYNC_NO_ORDER_CHECK);
}
byte *fseg_header= TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_FSEG +
- trx_sys_block->frame;
+ trx_sys_block->page.frame;
for (uint32_t prev_page_no= 0, i= 0, extent_size= FSP_EXTENT_SIZE;
i < 2 * size + extent_size / 2; i++)
{
- buf_block_t *new_block= fseg_alloc_free_page(fseg_header, prev_page_no + 1,
- FSP_UP, &mtr);
+ buf_block_t *new_block=
+ fseg_alloc_free_page_general(fseg_header, prev_page_no + 1, FSP_UP,
+ false, &mtr, &mtr, &err);
if (!new_block)
{
ib::error() << "Cannot create doublewrite buffer: "
@@ -156,12 +169,12 @@ too_small:
tablespace, then the page has not been written to in
doublewrite. */
- ut_ad(rw_lock_get_x_lock_count(&new_block->lock) == 1);
+ ut_ad(new_block->page.lock.not_recursive());
const page_id_t id= new_block->page.id();
/* We only do this in the debug build, to ensure that the check in
buf_flush_init_for_writing() will see a valid page type. The
flushes of new_block are actually unnecessary here. */
- ut_d(mtr.write<2>(*new_block, FIL_PAGE_TYPE + new_block->frame,
+ ut_d(mtr.write<2>(*new_block, FIL_PAGE_TYPE + new_block->page.frame,
FIL_PAGE_TYPE_SYS));
if (i == size / 2)
@@ -169,10 +182,10 @@ too_small:
ut_a(id.page_no() == size);
mtr.write<4>(*trx_sys_block,
TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_BLOCK1 +
- trx_sys_block->frame, id.page_no());
+ trx_sys_block->page.frame, id.page_no());
mtr.write<4>(*trx_sys_block,
TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_REPEAT +
- TRX_SYS_DOUBLEWRITE_BLOCK1 + trx_sys_block->frame,
+ TRX_SYS_DOUBLEWRITE_BLOCK1 + trx_sys_block->page.frame,
id.page_no());
}
else if (i == size / 2 + size)
@@ -180,10 +193,10 @@ too_small:
ut_a(id.page_no() == 2 * size);
mtr.write<4>(*trx_sys_block,
TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_BLOCK2 +
- trx_sys_block->frame, id.page_no());
+ trx_sys_block->page.frame, id.page_no());
mtr.write<4>(*trx_sys_block,
TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_REPEAT +
- TRX_SYS_DOUBLEWRITE_BLOCK2 + trx_sys_block->frame,
+ TRX_SYS_DOUBLEWRITE_BLOCK2 + trx_sys_block->page.frame,
id.page_no());
}
else if (i > size / 2)
@@ -200,7 +213,7 @@ too_small:
mtr.start();
trx_sys_block= buf_dblwr_trx_sys_get(&mtr);
fseg_header= TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_FSEG +
- trx_sys_block->frame;
+ trx_sys_block->page.frame;
}
prev_page_no= id.page_no();
@@ -208,15 +221,16 @@ too_small:
mtr.write<4>(*trx_sys_block,
TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_MAGIC +
- trx_sys_block->frame, TRX_SYS_DOUBLEWRITE_MAGIC_N);
+ trx_sys_block->page.frame, TRX_SYS_DOUBLEWRITE_MAGIC_N);
mtr.write<4>(*trx_sys_block,
TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_MAGIC +
- TRX_SYS_DOUBLEWRITE_REPEAT + trx_sys_block->frame,
+ TRX_SYS_DOUBLEWRITE_REPEAT + trx_sys_block->page.frame,
TRX_SYS_DOUBLEWRITE_MAGIC_N);
mtr.write<4>(*trx_sys_block,
TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED +
- trx_sys_block->frame, TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED_N);
+ trx_sys_block->page.frame,
+ TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED_N);
mtr.commit();
buf_flush_wait_flushed(mtr.commit_lsn());
@@ -247,7 +261,7 @@ dberr_t buf_dblwr_t::init_or_load_pages(pfs_os_file_t file, const char *path)
/* Read the TRX_SYS header to check if we are using the doublewrite buffer */
dberr_t err= os_file_read(IORequestRead, file, read_buf,
TRX_SYS_PAGE_NO << srv_page_size_shift,
- srv_page_size);
+ srv_page_size, nullptr);
if (err != DB_SUCCESS)
{
@@ -279,7 +293,7 @@ func_exit:
/* Read the pages from the doublewrite buffer to memory */
err= os_file_read(IORequestRead, file, write_buf,
block1.page_no() << srv_page_size_shift,
- size << srv_page_size_shift);
+ size << srv_page_size_shift, nullptr);
if (err != DB_SUCCESS)
{
@@ -290,7 +304,7 @@ func_exit:
err= os_file_read(IORequestRead, file,
write_buf + (size << srv_page_size_shift),
block2.page_no() << srv_page_size_shift,
- size << srv_page_size_shift);
+ size << srv_page_size_shift, nullptr);
if (err != DB_SUCCESS)
{
ib::error() << "Failed to read the second double write buffer extent";
@@ -306,7 +320,7 @@ func_exit:
for (ulint i= 0; i < size * 2; i++, page += srv_page_size)
{
memset(page + FIL_PAGE_SPACE_ID, 0, 4);
- /* For innodb_checksum_algorithm=innodb, we do not need to
+ /* For pre-MySQL-4.1 innodb_checksum_algorithm=innodb, we do not need to
calculate new checksums for the pages because the field
.._SPACE_ID does not affect them. Write the page back to where
we read it from. */
@@ -337,7 +351,7 @@ func_exit:
void buf_dblwr_t::recover()
{
ut_ad(recv_sys.parse_start_lsn);
- if (!is_initialised())
+ if (!is_created())
return;
uint32_t page_no_dblwr= 0;
@@ -380,7 +394,7 @@ void buf_dblwr_t::recover()
if (!srv_is_undo_tablespace(space_id))
ib::warn() << "A copy of page " << page_no
<< " in the doublewrite buffer slot " << page_no_dblwr
- << " is beyond the end of tablespace " << space->name
+ << " is beyond the end of " << space->chain.start->name
<< " (" << space->size << " pages)";
next_page:
space->release();
@@ -400,9 +414,12 @@ next_page:
physical_size, read_buf);
if (UNIV_UNLIKELY(fio.err != DB_SUCCESS))
+ {
ib::warn() << "Double write buffer recovery: " << page_id
- << " (tablespace '" << space->name
+ << " ('" << space->chain.start->name
<< "') read failed with error: " << fio.err;
+ continue;
+ }
if (buf_is_zeroes(span<const byte>(read_buf, physical_size)))
{
@@ -443,10 +460,9 @@ next_page:
/** Free the doublewrite buffer. */
void buf_dblwr_t::close()
{
- if (!is_initialised())
+ if (!active_slot)
return;
- /* Free the double write data structures. */
ut_ad(!active_slot->reserved);
ut_ad(!active_slot->first_free);
ut_ad(!batch_running);
@@ -460,19 +476,18 @@ void buf_dblwr_t::close()
mysql_mutex_destroy(&mutex);
memset((void*) this, 0, sizeof *this);
- active_slot= &slots[0];
}
/** Update the doublewrite buffer on write completion. */
void buf_dblwr_t::write_completed()
{
ut_ad(this == &buf_dblwr);
- ut_ad(srv_use_doublewrite_buf);
- ut_ad(is_initialised());
ut_ad(!srv_read_only_mode);
mysql_mutex_lock(&mutex);
+ ut_ad(is_created());
+ ut_ad(srv_use_doublewrite_buf);
ut_ad(batch_running);
slot *flush_slot= active_slot == &slots[0] ? &slots[1] : &slots[0];
ut_ad(flush_slot->reserved);
@@ -525,8 +540,9 @@ static void buf_dblwr_check_page_lsn(const buf_page_t &b, const byte *page)
/** Check the LSN values on the page with which this block is associated. */
static void buf_dblwr_check_block(const buf_page_t *bpage)
{
- ut_ad(bpage->state() == BUF_BLOCK_FILE_PAGE);
- const page_t *page= reinterpret_cast<const buf_block_t*>(bpage)->frame;
+ ut_ad(bpage->in_file());
+ const page_t *page= bpage->frame;
+ ut_ad(page);
switch (fil_page_get_type(page)) {
case FIL_PAGE_INDEX:
@@ -600,8 +616,8 @@ bool buf_dblwr_t::flush_buffered_writes(const ulint size)
ut_d(buf_dblwr_check_page_lsn(*bpage, write_buf + len2));
}
#endif /* UNIV_DEBUG */
- const IORequest request(nullptr, fil_system.sys_space->chain.start,
- IORequest::DBLWR_BATCH);
+ const IORequest request{nullptr, nullptr, fil_system.sys_space->chain.start,
+ IORequest::DBLWR_BATCH};
ut_a(fil_system.sys_space->acquire());
if (multi_batch)
{
@@ -620,11 +636,19 @@ bool buf_dblwr_t::flush_buffered_writes(const ulint size)
return true;
}
+static void *get_frame(const IORequest &request)
+{
+ if (request.slot)
+ return request.slot->out_buf;
+ const buf_page_t *bpage= request.bpage;
+ return bpage->zip.data ? bpage->zip.data : bpage->frame;
+}
+
void buf_dblwr_t::flush_buffered_writes_completed(const IORequest &request)
{
ut_ad(this == &buf_dblwr);
ut_ad(srv_use_doublewrite_buf);
- ut_ad(is_initialised());
+ ut_ad(is_created());
ut_ad(!srv_read_only_mode);
ut_ad(!request.bpage);
ut_ad(request.node == fil_system.sys_space->chain.start);
@@ -657,9 +681,8 @@ void buf_dblwr_t::flush_buffered_writes_completed(const IORequest &request)
buf_page_t* bpage= e.request.bpage;
ut_ad(bpage->in_file());
- /* We request frame here to get correct buffer in case of
- encryption and/or page compression */
- void *frame= buf_page_get_frame(bpage);
+ void *frame= get_frame(e.request);
+ ut_ad(frame);
auto e_size= e.size;
@@ -670,7 +693,6 @@ void buf_dblwr_t::flush_buffered_writes_completed(const IORequest &request)
}
else
{
- ut_ad(bpage->state() == BUF_BLOCK_FILE_PAGE);
ut_ad(!bpage->zip_size());
ut_d(buf_dblwr_check_page_lsn(*bpage, static_cast<const byte*>(frame)));
}
@@ -692,7 +714,7 @@ posted, and also when we may have to wait for a page latch!
Otherwise a deadlock of threads can occur. */
void buf_dblwr_t::flush_buffered_writes()
{
- if (!is_initialised() || !srv_use_doublewrite_buf)
+ if (!is_created() || !srv_use_doublewrite_buf)
{
fil_flush_file_spaces();
return;
@@ -717,6 +739,7 @@ void buf_dblwr_t::add_to_batch(const IORequest &request, size_t size)
ut_ad(request.bpage);
ut_ad(request.bpage->in_file());
ut_ad(request.node);
+ ut_ad(request.node->space->purpose == FIL_TYPE_TABLESPACE);
ut_ad(request.node->space->id == request.bpage->id().space());
ut_ad(request.node->space->referenced());
ut_ad(!srv_read_only_mode);
@@ -737,13 +760,9 @@ void buf_dblwr_t::add_to_batch(const IORequest &request, size_t size)
byte *p= active_slot->write_buf + srv_page_size * active_slot->first_free;
- /* We request frame here to get correct buffer in case of
- encryption and/or page compression */
- void *frame= buf_page_get_frame(request.bpage);
-
/* "frame" is at least 1024-byte aligned for ROW_FORMAT=COMPRESSED pages,
and at least srv_page_size (4096-byte) for everything else. */
- memcpy_aligned<UNIV_ZIP_SIZE_MIN>(p, frame, size);
+ memcpy_aligned<UNIV_ZIP_SIZE_MIN>(p, get_frame(request), size);
/* fil_page_compress() for page_compressed guarantees 256-byte alignment */
memset_aligned<256>(p + size, 0, srv_page_size - size);
/* FIXME: Inform the compiler that "size" and "srv_page_size - size"
diff --git a/storage/innobase/buf/buf0dump.cc b/storage/innobase/buf/buf0dump.cc
index 269ef448c31..05b18de1d5b 100644
--- a/storage/innobase/buf/buf0dump.cc
+++ b/storage/innobase/buf/buf0dump.cc
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 2011, 2017, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2017, 2021, MariaDB Corporation.
+Copyright (c) 2017, 2022, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -35,10 +35,8 @@ Created April 08, 2011 Vasil Dimov
#include "buf0dump.h"
#include "dict0dict.h"
#include "os0file.h"
-#include "os0thread.h"
#include "srv0srv.h"
#include "srv0start.h"
-#include "sync0rw.h"
#include "ut0byte.h"
#include <algorithm>
@@ -182,8 +180,8 @@ static void buf_dump_generate_path(char *path, size_t path_size)
char buf[FN_REFLEN];
mysql_mutex_lock(&LOCK_global_system_variables);
- snprintf(buf, sizeof(buf), "%s%c%s", get_buf_dump_dir(),
- OS_PATH_SEPARATOR, srv_buf_dump_filename);
+ snprintf(buf, sizeof buf, "%s/%s", get_buf_dump_dir(),
+ srv_buf_dump_filename);
mysql_mutex_unlock(&LOCK_global_system_variables);
os_file_type_t type;
@@ -206,22 +204,25 @@ static void buf_dump_generate_path(char *path, size_t path_size)
char srv_data_home_full[FN_REFLEN];
my_realpath(srv_data_home_full, get_buf_dump_dir(), 0);
+ const char *format;
- if (srv_data_home_full[strlen(srv_data_home_full) - 1]
- == OS_PATH_SEPARATOR) {
-
- snprintf(path, path_size, "%s%s",
- srv_data_home_full,
- srv_buf_dump_filename);
- } else {
- snprintf(path, path_size, "%s%c%s",
- srv_data_home_full,
- OS_PATH_SEPARATOR,
- srv_buf_dump_filename);
+ switch (srv_data_home_full[strlen(srv_data_home_full) - 1]) {
+#ifdef _WIN32
+ case '\\':
+#endif
+ case '/':
+ format = "%s%s";
+ break;
+ default:
+ format = "%s/%s";
}
+
+ snprintf(path, path_size, format,
+ srv_data_home_full, srv_buf_dump_filename);
}
}
+
/*****************************************************************//**
Perform a buffer pool dump into the file specified by
innodb_buffer_pool_filename. If any errors occur then the value of
@@ -251,7 +252,10 @@ buf_dump(
buf_dump_status(STATUS_INFO, "Dumping buffer pool(s) to %s",
full_filename);
-#if defined(__GLIBC__) || defined(__WIN__) || O_CLOEXEC == 0
+#ifdef _WIN32
+ /* use my_fopen() for correct permissions during bootstrap*/
+ f = my_fopen(tmp_filename, O_RDWR|O_TRUNC|O_CREAT, 0);
+#elif defined(__GLIBC__) || O_CLOEXEC == 0
f = fopen(tmp_filename, "w" STR_O_CLOEXEC);
#else
{
@@ -323,16 +327,15 @@ buf_dump(
for (bpage = UT_LIST_GET_FIRST(buf_pool.LRU), j = 0;
bpage != NULL && j < n_pages;
bpage = UT_LIST_GET_NEXT(LRU, bpage)) {
-
- ut_a(bpage->in_file());
- const page_id_t id(bpage->id());
-
- if (id.space() == SRV_TMP_SPACE_ID) {
- /* Ignore the innodb_temporary tablespace. */
+ const auto status = bpage->state();
+ if (status < buf_page_t::UNFIXED) {
+ ut_a(status >= buf_page_t::FREED);
continue;
}
+ const page_id_t id{bpage->id()};
- if (bpage->status == buf_page_t::FREED) {
+ if (id.space() == SRV_TMP_SPACE_ID) {
+ /* Ignore the innodb_temporary tablespace. */
continue;
}
@@ -367,7 +370,7 @@ buf_dump(
ut_free(dump);
done:
- ret = fclose(f);
+ ret = IF_WIN(my_fclose(f,0),fclose(f));
if (ret != 0) {
buf_dump_status(STATUS_ERR,
"Cannot close '%s': %s",
diff --git a/storage/innobase/buf/buf0flu.cc b/storage/innobase/buf/buf0flu.cc
index 36ea302a403..5a9e3cbb34e 100644
--- a/storage/innobase/buf/buf0flu.cc
+++ b/storage/innobase/buf/buf0flu.cc
@@ -47,14 +47,14 @@ Created 11/11/1995 Heikki Tuuri
#endif
/** Number of pages flushed via LRU. Protected by buf_pool.mutex.
-Also included in buf_flush_page_count. */
+Also included in buf_pool.stat.n_pages_written. */
ulint buf_lru_flush_page_count;
-/** Number of pages flushed. Protected by buf_pool.mutex. */
-ulint buf_flush_page_count;
+/** Number of pages freed without flushing. Protected by buf_pool.mutex. */
+ulint buf_lru_freed_page_count;
/** Flag indicating if the page_cleaner is in active state. */
-bool buf_page_cleaner_is_active;
+Atomic_relaxed<bool> buf_page_cleaner_is_active;
/** Factor for scan length to determine n_pages for intended oldest LSN
progress */
@@ -112,8 +112,7 @@ static void buf_flush_validate_skip()
}
#endif /* UNIV_DEBUG */
-/** Wake up the page cleaner if needed */
-void buf_pool_t::page_cleaner_wakeup()
+void buf_pool_t::page_cleaner_wakeup(bool for_LRU)
{
if (!page_cleaner_idle())
return;
@@ -146,11 +145,12 @@ void buf_pool_t::page_cleaner_wakeup()
- by allowing last_activity_count to updated when page-cleaner is made
active and has work to do. This ensures that the last_activity signal
is consumed by the page-cleaner before the next one is generated. */
- if ((pct_lwm != 0.0 && pct_lwm <= dirty_pct) ||
- (pct_lwm != 0.0 && last_activity_count == srv_get_activity_count()) ||
+ if (for_LRU ||
+ (pct_lwm != 0.0 && (pct_lwm <= dirty_pct ||
+ last_activity_count == srv_get_activity_count())) ||
srv_max_buf_pool_modified_pct <= dirty_pct)
{
- page_cleaner_is_idle= false;
+ page_cleaner_status-= PAGE_CLEANER_IDLE;
pthread_cond_signal(&do_flush_list);
}
}
@@ -180,12 +180,12 @@ void buf_pool_t::insert_into_flush_list(buf_block_t *block, lsn_t lsn)
delete_from_flush_list_low(&block->page);
}
else
- stat.flush_list_bytes+= block->physical_size();
- ut_ad(stat.flush_list_bytes <= curr_pool_size);
+ flush_list_bytes+= block->physical_size();
+ ut_ad(flush_list_bytes <= curr_pool_size);
block->page.set_oldest_modification(lsn);
MEM_CHECK_DEFINED(block->page.zip.data
- ? block->page.zip.data : block->frame,
+ ? block->page.zip.data : block->page.frame,
block->physical_size());
UT_LIST_ADD_FIRST(flush_list, &block->page);
ut_d(buf_flush_validate_skip());
@@ -194,14 +194,12 @@ void buf_pool_t::insert_into_flush_list(buf_block_t *block, lsn_t lsn)
}
/** Remove a block from flush_list.
-@param bpage buffer pool page
-@param clear whether to invoke buf_page_t::clear_oldest_modification() */
-void buf_pool_t::delete_from_flush_list(buf_page_t *bpage, bool clear)
+@param bpage buffer pool page */
+void buf_pool_t::delete_from_flush_list(buf_page_t *bpage)
{
delete_from_flush_list_low(bpage);
- stat.flush_list_bytes-= bpage->physical_size();
- if (clear)
- bpage->clear_oldest_modification();
+ flush_list_bytes-= bpage->physical_size();
+ bpage->clear_oldest_modification();
#ifdef UNIV_DEBUG
buf_flush_validate_skip();
#endif /* UNIV_DEBUG */
@@ -216,25 +214,25 @@ void buf_flush_remove_pages(ulint id)
{
const page_id_t first(id, 0), end(id + 1, 0);
ut_ad(id);
- mysql_mutex_lock(&buf_pool.mutex);
for (;;)
{
+ mysql_mutex_lock(&buf_pool.mutex);
bool deferred= false;
mysql_mutex_lock(&buf_pool.flush_list_mutex);
for (buf_page_t *bpage= UT_LIST_GET_LAST(buf_pool.flush_list); bpage; )
{
- ut_d(const auto s= bpage->state());
- ut_ad(s == BUF_BLOCK_ZIP_PAGE || s == BUF_BLOCK_FILE_PAGE ||
- s == BUF_BLOCK_REMOVE_HASH);
+ const auto s= bpage->state();
+ ut_ad(s >= buf_page_t::REMOVE_HASH);
+ ut_ad(s < buf_page_t::READ_FIX || s >= buf_page_t::WRITE_FIX);
buf_page_t *prev= UT_LIST_GET_PREV(list, bpage);
const page_id_t bpage_id(bpage->id());
if (bpage_id < first || bpage_id >= end);
- else if (bpage->io_fix() != BUF_IO_NONE)
+ else if (s >= buf_page_t::WRITE_FIX)
deferred= true;
else
buf_pool.delete_from_flush_list(bpage);
@@ -242,18 +240,14 @@ void buf_flush_remove_pages(ulint id)
bpage= prev;
}
+ mysql_mutex_unlock(&buf_pool.mutex);
mysql_mutex_unlock(&buf_pool.flush_list_mutex);
if (!deferred)
break;
- mysql_mutex_unlock(&buf_pool.mutex);
- os_thread_yield();
- mysql_mutex_lock(&buf_pool.mutex);
- buf_flush_wait_batch_end(false);
+ os_aio_wait_until_no_pending_writes();
}
-
- mysql_mutex_unlock(&buf_pool.mutex);
}
/*******************************************************************//**
@@ -298,7 +292,7 @@ buf_flush_relocate_on_flush_list(
bpage->clear_oldest_modification();
if (lsn == 1) {
- buf_pool.stat.flush_list_bytes -= dpage->physical_size();
+ buf_pool.flush_list_bytes -= dpage->physical_size();
dpage->list.prev = nullptr;
dpage->list.next = nullptr;
dpage->clear_oldest_modification();
@@ -312,83 +306,95 @@ buf_flush_relocate_on_flush_list(
ut_d(buf_flush_validate_low());
}
+/** Note that a block is no longer dirty, while not removing
+it from buf_pool.flush_list */
+inline void buf_page_t::write_complete(bool temporary)
+{
+ ut_ad(temporary == fsp_is_system_temporary(id().space()));
+ if (temporary)
+ {
+ ut_ad(oldest_modification() == 2);
+ oldest_modification_= 0;
+ }
+ else
+ {
+ /* We use release memory order to guarantee that callers of
+ oldest_modification_acquire() will observe the block as
+ being detached from buf_pool.flush_list, after reading the value 0. */
+ ut_ad(oldest_modification() > 2);
+ oldest_modification_.store(1, std::memory_order_release);
+ }
+ const auto s= state();
+ ut_ad(s >= WRITE_FIX);
+ zip.fix.fetch_sub((s >= WRITE_FIX_REINIT)
+ ? (WRITE_FIX_REINIT - UNFIXED)
+ : (WRITE_FIX - UNFIXED));
+ lock.u_unlock(true);
+}
+
+inline void buf_pool_t::n_flush_inc()
+{
+ mysql_mutex_assert_owner(&flush_list_mutex);
+ page_cleaner_status+= LRU_FLUSH;
+}
+
+inline void buf_pool_t::n_flush_dec()
+{
+ mysql_mutex_lock(&flush_list_mutex);
+ ut_ad(page_cleaner_status >= LRU_FLUSH);
+ if ((page_cleaner_status-= LRU_FLUSH) < LRU_FLUSH)
+ pthread_cond_broadcast(&done_flush_LRU);
+ mysql_mutex_unlock(&flush_list_mutex);
+}
+
/** Complete write of a file page from buf_pool.
@param request write request */
void buf_page_write_complete(const IORequest &request)
{
ut_ad(request.is_write());
- ut_ad(!srv_read_only_mode/* ||
- request.node->space->purpose == FIL_TYPE_TEMPORARY*/);
+ ut_ad(!srv_read_only_mode);
buf_page_t *bpage= request.bpage;
ut_ad(bpage);
- ut_ad(bpage->in_file());
- /* bpage->io_fix() can only be changed by buf_page_write_complete()
- and buf_page_read_complete() from BUF_IO_READ or BUF_IO_WRITE */
- ut_ad(bpage->io_fix() == BUF_IO_WRITE);
+ const auto state= bpage->state();
+ /* io-fix can only be cleared by buf_page_t::write_complete()
+ and buf_page_t::read_complete() */
+ ut_ad(state >= buf_page_t::WRITE_FIX);
ut_ad(!buf_dblwr.is_inside(bpage->id()));
ut_ad(request.node->space->id == bpage->id().space());
- if (bpage->status == buf_page_t::INIT_ON_FLUSH)
- bpage->status= buf_page_t::NORMAL;
- else
- {
- ut_ad(bpage->status == buf_page_t::NORMAL);
- if (request.node->space->use_doublewrite())
- {
- ut_ad(request.node->space != fil_system.temp_space);
- buf_dblwr.write_completed();
- }
- }
-
- if (bpage->slot)
- {
- bpage->slot->release();
- bpage->slot= nullptr;
- }
+ if (request.slot)
+ request.slot->release();
if (UNIV_UNLIKELY(MONITOR_IS_ON(MONITOR_MODULE_BUF_PAGE)))
- buf_page_monitor(bpage, BUF_IO_WRITE);
+ buf_page_monitor(*bpage, false);
DBUG_PRINT("ib_buf", ("write page %u:%u",
bpage->id().space(), bpage->id().page_no()));
- const bool temp= fsp_is_system_temporary(bpage->id().space());
- mysql_mutex_lock(&buf_pool.mutex);
+ mysql_mutex_assert_not_owner(&buf_pool.mutex);
mysql_mutex_assert_not_owner(&buf_pool.flush_list_mutex);
- buf_pool.stat.n_pages_written++;
- /* While we do not need any mutex for clearing oldest_modification
- here, we hope that it will be in the same cache line with io_fix,
- whose changes must be protected by buf_pool.mutex. */
- ut_ad(temp || bpage->oldest_modification() > 2);
- bpage->clear_oldest_modification(temp);
- ut_ad(bpage->io_fix() == BUF_IO_WRITE);
- bpage->set_io_fix(BUF_IO_NONE);
-
- /* Because this thread which does the unlocking might not be the same that
- did the locking, we use a pass value != 0 in unlock, which simply
- removes the newest lock debug record, without checking the thread id. */
- if (bpage->state() == BUF_BLOCK_FILE_PAGE)
- rw_lock_sx_unlock_gen(&((buf_block_t*) bpage)->lock, BUF_IO_WRITE);
if (request.is_LRU())
{
+ const bool temp= bpage->oldest_modification() == 2;
+ if (!temp && state < buf_page_t::WRITE_FIX_REINIT &&
+ request.node->space->use_doublewrite())
+ buf_dblwr.write_completed();
+ /* We must hold buf_pool.mutex while releasing the block, so that
+ no other thread can access it before we have freed it. */
+ mysql_mutex_lock(&buf_pool.mutex);
+ bpage->write_complete(temp);
buf_LRU_free_page(bpage, true);
+ mysql_mutex_unlock(&buf_pool.mutex);
- ut_ad(buf_pool.n_flush_LRU_);
- if (!--buf_pool.n_flush_LRU_)
- {
- pthread_cond_broadcast(&buf_pool.done_flush_LRU);
- pthread_cond_signal(&buf_pool.done_free);
- }
+ buf_pool.n_flush_dec();
}
else
{
- ut_ad(!temp);
- ut_ad(buf_pool.n_flush_list_);
- if (!--buf_pool.n_flush_list_)
- pthread_cond_broadcast(&buf_pool.done_flush_list);
+ if (state < buf_page_t::WRITE_FIX_REINIT &&
+ request.node->space->use_doublewrite())
+ buf_dblwr.write_completed();
+ bpage->write_complete(false);
}
-
- mysql_mutex_unlock(&buf_pool.mutex);
}
/** Calculate a ROW_FORMAT=COMPRESSED page checksum and update the page.
@@ -398,9 +404,7 @@ void buf_flush_update_zip_checksum(buf_frame_t *page, ulint size)
{
ut_ad(size > 0);
mach_write_to_4(page + FIL_PAGE_SPACE_OR_CHKSUM,
- page_zip_calc_checksum(page, size,
- static_cast<srv_checksum_algorithm_t>
- (srv_checksum_algorithm)));
+ page_zip_calc_checksum(page, size, false));
}
/** Assign the full crc32 checksum for non-compressed page.
@@ -432,16 +436,14 @@ buf_flush_init_for_writing(
void* page_zip_,
bool use_full_checksum)
{
- if (block != NULL && block->frame != page) {
+ if (block && block->page.frame != page) {
/* If page is encrypted in full crc32 format then
checksum stored already as a part of fil_encrypt_buf() */
ut_ad(use_full_checksum);
return;
}
- ut_ad(block == NULL || block->frame == page);
- ut_ad(block == NULL || page_zip_ == NULL
- || &block->page.zip == page_zip_);
+ ut_ad(!block || block->page.frame == page);
ut_ad(page);
if (page_zip_) {
@@ -449,6 +451,7 @@ buf_flush_init_for_writing(
ulint size;
page_zip = static_cast<page_zip_des_t*>(page_zip_);
+ ut_ad(!block || &block->page.zip == page_zip);
size = page_zip_get_size(page_zip);
ut_ad(size);
@@ -555,38 +558,8 @@ buf_flush_init_for_writing(
}
}
- uint32_t checksum = BUF_NO_CHECKSUM_MAGIC;
-
- switch (srv_checksum_algorithm_t(srv_checksum_algorithm)) {
- case SRV_CHECKSUM_ALGORITHM_INNODB:
- case SRV_CHECKSUM_ALGORITHM_STRICT_INNODB:
- checksum = buf_calc_page_new_checksum(page);
- mach_write_to_4(page + FIL_PAGE_SPACE_OR_CHKSUM,
- checksum);
- /* With the InnoDB checksum, we overwrite the first 4 bytes of
- the end lsn field to store the old formula checksum. Since it
- depends also on the field FIL_PAGE_SPACE_OR_CHKSUM, it has to
- be calculated after storing the new formula checksum. */
- checksum = buf_calc_page_old_checksum(page);
- break;
- case SRV_CHECKSUM_ALGORITHM_FULL_CRC32:
- case SRV_CHECKSUM_ALGORITHM_STRICT_FULL_CRC32:
- case SRV_CHECKSUM_ALGORITHM_CRC32:
- case SRV_CHECKSUM_ALGORITHM_STRICT_CRC32:
- /* In other cases we write the same checksum to both fields. */
- checksum = buf_calc_page_crc32(page);
- mach_write_to_4(page + FIL_PAGE_SPACE_OR_CHKSUM,
- checksum);
- break;
- case SRV_CHECKSUM_ALGORITHM_NONE:
- case SRV_CHECKSUM_ALGORITHM_STRICT_NONE:
- mach_write_to_4(page + FIL_PAGE_SPACE_OR_CHKSUM,
- checksum);
- break;
- /* no default so the compiler will emit a warning if
- new enum is added and not handled here */
- }
-
+ const uint32_t checksum = buf_calc_page_crc32(page);
+ mach_write_to_4(page + FIL_PAGE_SPACE_OR_CHKSUM, checksum);
mach_write_to_4(page + srv_page_size - FIL_PAGE_END_LSN_OLD_CHKSUM,
checksum);
}
@@ -601,7 +574,7 @@ static void buf_tmp_reserve_compression_buf(buf_tmp_buffer_t* slot)
buffer be bigger than input buffer. Adjust the allocated size. */
ulint size= srv_page_size;
#ifdef HAVE_LZO
- size+= LZO1X_1_15_MEM_COMPRESS;
+ size= size + LZO1X_1_15_MEM_COMPRESS;
#elif defined HAVE_SNAPPY
size= snappy_max_compressed_length(size);
#endif
@@ -644,12 +617,12 @@ a page is written to disk.
@return page frame to be written to file
(may be src_frame or an encrypted/compressed copy of it) */
static byte *buf_page_encrypt(fil_space_t* space, buf_page_t* bpage, byte* s,
- size_t *size)
+ buf_tmp_buffer_t **slot, size_t *size)
{
- ut_ad(bpage->status != buf_page_t::FREED);
+ ut_ad(!bpage->is_freed());
ut_ad(space->id == bpage->id().space());
+ ut_ad(!*slot);
- ut_d(fil_page_type_validate(space, s));
const uint32_t page_no= bpage->id().page_no();
switch (page_no) {
@@ -703,31 +676,25 @@ static byte *buf_page_encrypt(fil_space_t* space, buf_page_t* bpage, byte* s,
ut_ad(!bpage->zip_size() || !page_compressed);
/* Find free slot from temporary memory array */
- buf_tmp_buffer_t *slot= buf_pool.io_buf_reserve();
- ut_a(slot);
- slot->allocate();
- slot->out_buf= NULL;
- bpage->slot= slot;
+ *slot= buf_pool.io_buf_reserve();
+ ut_a(*slot);
+ (*slot)->allocate();
- byte *d= slot->crypt_buf;
+ byte *d= (*slot)->crypt_buf;
if (!page_compressed)
{
not_compressed:
- byte *tmp= space->purpose == FIL_TYPE_TEMPORARY
+ d= space->purpose == FIL_TYPE_TEMPORARY
? buf_tmp_page_encrypt(page_no, s, d)
: fil_space_encrypt(space, page_no, s, d);
-
- slot->out_buf= d= tmp;
-
- ut_d(fil_page_type_validate(space, tmp));
}
else
{
ut_ad(space->purpose != FIL_TYPE_TEMPORARY);
/* First we compress the page content */
- buf_tmp_reserve_compression_buf(slot);
- byte *tmp= slot->comp_buf;
+ buf_tmp_reserve_compression_buf(*slot);
+ byte *tmp= (*slot)->comp_buf;
ulint len= fil_page_compress(s, tmp, space->flags,
fil_space_get_block_size(space, page_no),
encrypted);
@@ -752,10 +719,9 @@ not_compressed:
/* Workaround for MDEV-15527. */
memset(tmp + len, 0 , srv_page_size - len);
- ut_d(fil_page_type_validate(space, tmp));
if (encrypted)
- tmp = fil_space_encrypt(space, page_no, tmp, d);
+ tmp= fil_space_encrypt(space, page_no, tmp, d);
if (full_crc32)
{
@@ -764,228 +730,207 @@ not_compressed:
ut_ad(!buf_page_is_corrupted(true, tmp, space->flags));
}
- slot->out_buf= d= tmp;
+ d= tmp;
}
- ut_d(fil_page_type_validate(space, d));
+ (*slot)->out_buf= d;
return d;
}
/** Free a page whose underlying file page has been freed. */
-inline void buf_pool_t::release_freed_page(buf_page_t *bpage)
+ATTRIBUTE_COLD void buf_pool_t::release_freed_page(buf_page_t *bpage)
{
- ut_ad(bpage->in_file());
- const bool uncompressed= bpage->state() == BUF_BLOCK_FILE_PAGE;
- mysql_mutex_lock(&mutex);
- bpage->set_io_fix(BUF_IO_NONE);
- bpage->status= buf_page_t::NORMAL;
- mysql_mutex_lock(&flush_list_mutex);
+ mysql_mutex_assert_owner(&mutex);
ut_d(const lsn_t oldest_modification= bpage->oldest_modification();)
if (fsp_is_system_temporary(bpage->id().space()))
{
- ut_ad(uncompressed);
+ ut_ad(bpage->frame);
ut_ad(oldest_modification == 2);
+ bpage->clear_oldest_modification();
}
else
{
+ mysql_mutex_lock(&flush_list_mutex);
ut_ad(oldest_modification > 2);
- delete_from_flush_list(bpage, false);
+ delete_from_flush_list(bpage);
+ mysql_mutex_unlock(&flush_list_mutex);
}
- bpage->clear_oldest_modification();
- mysql_mutex_unlock(&flush_list_mutex);
-
- if (uncompressed)
- rw_lock_sx_unlock_gen(&reinterpret_cast<buf_block_t*>(bpage)->lock,
- BUF_IO_WRITE);
+ bpage->lock.u_unlock(true);
buf_LRU_free_page(bpage, true);
- mysql_mutex_unlock(&mutex);
}
-/** Write a flushable page from buf_pool to a file.
-buf_pool.mutex must be held.
-@param bpage buffer control block
-@param lru true=buf_pool.LRU; false=buf_pool.flush_list
+/** Write a flushable page to a file or free a freeable block.
+@param evict whether to evict the page on write completion
@param space tablespace
-@return whether the page was flushed and buf_pool.mutex was released */
-static bool buf_flush_page(buf_page_t *bpage, bool lru, fil_space_t *space)
+@return whether a page write was initiated and buf_pool.mutex released */
+bool buf_page_t::flush(bool evict, fil_space_t *space)
{
- ut_ad(bpage->in_file());
- ut_ad(bpage->ready_for_flush());
+ mysql_mutex_assert_not_owner(&buf_pool.flush_list_mutex);
+ ut_ad(in_file());
+ ut_ad(in_LRU_list);
ut_ad((space->purpose == FIL_TYPE_TEMPORARY) ==
(space == fil_system.temp_space));
- ut_ad(space->purpose == FIL_TYPE_TABLESPACE ||
- space->atomic_write_supported);
+ ut_ad(evict || space != fil_system.temp_space);
ut_ad(space->referenced());
- ut_ad(lru || space != fil_system.temp_space);
- rw_lock_t *rw_lock;
+ const auto s= state();
+ ut_a(s >= FREED);
- if (bpage->state() != BUF_BLOCK_FILE_PAGE)
- rw_lock= nullptr;
- else
+ if (s < UNFIXED)
{
- rw_lock= &reinterpret_cast<buf_block_t*>(bpage)->lock;
- if (!rw_lock_sx_lock_nowait(rw_lock, BUF_IO_WRITE))
- return false;
+ buf_pool.release_freed_page(this);
+ return false;
}
- bpage->set_io_fix(BUF_IO_WRITE);
- /* Because bpage->status can only be changed while buf_block_t
- exists, it cannot be modified for ROW_FORMAT=COMPRESSED pages
- without first allocating the uncompressed page frame. Such
- allocation cannot be completed due to our io_fix. So, bpage->status
- is protected even if !rw_lock. */
- const auto status= bpage->status;
+ ut_d(const auto f=) zip.fix.fetch_add(WRITE_FIX - UNFIXED);
+ ut_ad(f >= UNFIXED);
+ ut_ad(f < READ_FIX);
+ ut_ad((space == fil_system.temp_space)
+ ? oldest_modification() == 2
+ : oldest_modification() > 2);
- if (status != buf_page_t::FREED)
+ /* Increment the I/O operation count used for selecting LRU policy. */
+ buf_LRU_stat_inc_io();
+ mysql_mutex_unlock(&buf_pool.mutex);
+
+ IORequest::Type type= IORequest::WRITE_ASYNC;
+ if (UNIV_UNLIKELY(evict))
{
- if (lru)
- buf_pool.n_flush_LRU_++;
- else
- buf_pool.n_flush_list_++;
- buf_flush_page_count++;
+ type= IORequest::WRITE_LRU;
+ mysql_mutex_lock(&buf_pool.flush_list_mutex);
+ buf_pool.n_flush_inc();
+ mysql_mutex_unlock(&buf_pool.flush_list_mutex);
}
- mysql_mutex_assert_not_owner(&buf_pool.flush_list_mutex);
-
- /* We are holding rw_lock = buf_block_t::lock in SX mode except if
- this is a ROW_FORMAT=COMPRESSED page whose uncompressed page frame
- has been evicted from the buffer pool.
-
- Apart from possible rw_lock protection, bpage is also protected by
- io_fix and oldest_modification()!=0. Thus, it cannot be relocated in
- the buffer pool or removed from flush_list or LRU_list. */
+ /* Apart from the U-lock, this block will also be protected by
+ is_write_fixed() and oldest_modification()>1.
+ Thus, it cannot be relocated or removed. */
DBUG_PRINT("ib_buf", ("%s %u page %u:%u",
- lru ? "LRU" : "flush_list",
- bpage->id().space(), bpage->id().page_no()));
- ut_ad(bpage->io_fix() == BUF_IO_WRITE);
- ut_d(const lsn_t oldest_modification= bpage->oldest_modification());
- ut_ad(space == fil_system.temp_space
- ? oldest_modification == 2
- : oldest_modification > 2);
- ut_ad(bpage->state() ==
- (rw_lock ? BUF_BLOCK_FILE_PAGE : BUF_BLOCK_ZIP_PAGE));
- ut_ad(ULINT_UNDEFINED >
- (lru ? buf_pool.n_flush_LRU_ : buf_pool.n_flush_list_));
- mysql_mutex_unlock(&buf_pool.mutex);
+ evict ? "LRU" : "flush_list",
+ id().space(), id().page_no()));
+
+ buf_block_t *block= reinterpret_cast<buf_block_t*>(this);
+ page_t *write_frame= zip.data;
- buf_block_t *block= reinterpret_cast<buf_block_t*>(bpage);
- page_t *frame= bpage->zip.data;
+ space->reacquire();
+ size_t size;
+#if defined HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE || defined _WIN32
+ size_t orig_size;
+#endif
+ buf_tmp_buffer_t *slot= nullptr;
- if (status == buf_page_t::FREED)
- buf_pool.release_freed_page(&block->page);
+ if (UNIV_UNLIKELY(!frame)) /* ROW_FORMAT=COMPRESSED */
+ {
+ ut_ad(!space->full_crc32());
+ ut_ad(!space->is_compressed()); /* not page_compressed */
+ size= zip_size();
+#if defined HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE || defined _WIN32
+ orig_size= size;
+#endif
+ buf_flush_update_zip_checksum(write_frame, size);
+ write_frame= buf_page_encrypt(space, this, write_frame, &slot, &size);
+ ut_ad(size == zip_size());
+ }
else
{
- space->reacquire();
- ut_ad(status == buf_page_t::NORMAL || status == buf_page_t::INIT_ON_FLUSH);
- size_t size;
+ byte *page= frame;
+ size= block->physical_size();
#if defined HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE || defined _WIN32
- size_t orig_size;
+ orig_size= size;
#endif
- IORequest::Type type= lru ? IORequest::WRITE_LRU : IORequest::WRITE_ASYNC;
- if (UNIV_UNLIKELY(!rw_lock)) /* ROW_FORMAT=COMPRESSED */
+ if (space->full_crc32())
{
- ut_ad(!space->full_crc32());
- ut_ad(!space->is_compressed()); /* not page_compressed */
- size= bpage->zip_size();
-#if defined HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE || defined _WIN32
- orig_size= size;
-#endif
- buf_flush_update_zip_checksum(frame, size);
- frame= buf_page_encrypt(space, bpage, frame, &size);
- ut_ad(size == bpage->zip_size());
+ /* innodb_checksum_algorithm=full_crc32 is not implemented for
+ ROW_FORMAT=COMPRESSED pages. */
+ ut_ad(!write_frame);
+ page= buf_page_encrypt(space, this, page, &slot, &size);
+ buf_flush_init_for_writing(block, page, nullptr, true);
}
else
{
- byte *page= block->frame;
- size= block->physical_size();
-#if defined HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE || defined _WIN32
- orig_size= size;
-#endif
-
- if (space->full_crc32())
- {
- /* innodb_checksum_algorithm=full_crc32 is not implemented for
- ROW_FORMAT=COMPRESSED pages. */
- ut_ad(!frame);
- page= buf_page_encrypt(space, bpage, page, &size);
- buf_flush_init_for_writing(block, page, nullptr, true);
- }
- else
- {
- buf_flush_init_for_writing(block, page, frame ? &bpage->zip : nullptr,
- false);
- page= buf_page_encrypt(space, bpage, frame ? frame : page, &size);
- }
-
-#if defined HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE || defined _WIN32
- if (size != orig_size && space->punch_hole)
- type= lru ? IORequest::PUNCH_LRU : IORequest::PUNCH;
-#endif
- frame=page;
+ buf_flush_init_for_writing(block, page, write_frame ? &zip : nullptr,
+ false);
+ page= buf_page_encrypt(space, this, write_frame ? write_frame : page,
+ &slot, &size);
}
- ut_ad(status == bpage->status);
- ut_ad(oldest_modification == bpage->oldest_modification());
-
- if (status != buf_page_t::NORMAL || !space->use_doublewrite())
+#if defined HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE || defined _WIN32
+ if (size != orig_size)
{
- if (UNIV_LIKELY(space->purpose == FIL_TYPE_TABLESPACE))
- {
- const lsn_t lsn= mach_read_from_8(my_assume_aligned<8>
- (FIL_PAGE_LSN + (frame ? frame
- : block->frame)));
- ut_ad(lsn >= oldest_modification);
- if (lsn > log_sys.get_flushed_lsn())
- log_write_up_to(lsn, true);
+ switch (space->chain.start->punch_hole) {
+ case 1:
+ static_assert(IORequest::PUNCH_LRU - IORequest::PUNCH ==
+ IORequest::WRITE_LRU - IORequest::WRITE_ASYNC, "");
+ type=
+ IORequest::Type(type + (IORequest::PUNCH - IORequest::WRITE_ASYNC));
+ break;
+ case 2:
+ size= orig_size;
}
- space->io(IORequest(type, bpage),
- bpage->physical_offset(), size, frame, bpage);
}
- else
- buf_dblwr.add_to_batch(IORequest(bpage, space->chain.start, type), size);
+#endif
+ write_frame= page;
}
- /* Increment the I/O operation count used for selecting LRU policy. */
- buf_LRU_stat_inc_io();
+ if ((s & LRU_MASK) == REINIT || !space->use_doublewrite())
+ {
+ if (UNIV_LIKELY(space->purpose == FIL_TYPE_TABLESPACE))
+ {
+ const lsn_t lsn=
+ mach_read_from_8(my_assume_aligned<8>(FIL_PAGE_LSN +
+ (write_frame ? write_frame
+ : frame)));
+ ut_ad(lsn >= oldest_modification());
+ if (lsn > log_sys.get_flushed_lsn())
+ log_write_up_to(lsn, true);
+ }
+ space->io(IORequest{type, this, slot}, physical_offset(), size,
+ write_frame, this);
+ }
+ else
+ buf_dblwr.add_to_batch(IORequest{this, slot, space->chain.start, type},
+ size);
return true;
}
/** Check whether a page can be flushed from the buf_pool.
@param id page identifier
@param fold id.fold()
-@param lru true=buf_pool.LRU; false=buf_pool.flush_list
+@param evict true=buf_pool.LRU; false=buf_pool.flush_list
@return whether the page can be flushed */
-static bool buf_flush_check_neighbor(const page_id_t id, ulint fold, bool lru)
+static bool buf_flush_check_neighbor(const page_id_t id, ulint fold,
+ bool evict)
{
mysql_mutex_assert_owner(&buf_pool.mutex);
ut_ad(fold == id.fold());
- buf_page_t *bpage= buf_pool.page_hash_get_low(id, fold);
+ /* FIXME: cell_get() is being invoked while holding buf_pool.mutex */
+ const buf_page_t *bpage=
+ buf_pool.page_hash.get(id, buf_pool.page_hash.cell_get(fold));
if (!bpage || buf_pool.watch_is_sentinel(*bpage))
return false;
- /* We avoid flushing 'non-old' blocks in an LRU flush, because the
+ /* We avoid flushing 'non-old' blocks in an eviction flush, because the
flushed blocks are soon freed */
- if (lru && !bpage->is_old())
+ if (evict && !bpage->is_old())
return false;
- return bpage->oldest_modification() > 1 && bpage->ready_for_flush();
+ return bpage->oldest_modification() > 1 && !bpage->is_io_fixed();
}
/** Check which neighbors of a page can be flushed from the buf_pool.
@param space tablespace
@param id page identifier of a dirty page
@param contiguous whether to consider contiguous areas of pages
-@param lru true=buf_pool.LRU; false=buf_pool.flush_list
+@param evict true=buf_pool.LRU; false=buf_pool.flush_list
@return last page number that can be flushed */
static page_id_t buf_flush_check_neighbors(const fil_space_t &space,
page_id_t &id, bool contiguous,
- bool lru)
+ bool evict)
{
ut_ad(id.page_no() < space.size +
(space.physical_size() == 2048 ? 1
@@ -1018,7 +963,7 @@ static page_id_t buf_flush_check_neighbors(const fil_space_t &space,
for (page_id_t i= id - 1;; --i)
{
fold--;
- if (!buf_flush_check_neighbor(i, fold, lru))
+ if (!buf_flush_check_neighbor(i, fold, evict))
{
low= i + 1;
break;
@@ -1034,7 +979,7 @@ static page_id_t buf_flush_check_neighbors(const fil_space_t &space,
while (++i < high)
{
++fold;
- if (!buf_flush_check_neighbor(i, fold, lru))
+ if (!buf_flush_check_neighbor(i, fold, evict))
break;
}
@@ -1042,63 +987,58 @@ static page_id_t buf_flush_check_neighbors(const fil_space_t &space,
return i;
}
-MY_ATTRIBUTE((nonnull, warn_unused_result))
-/** Write punch-hole or zeroes of the freed ranges when
-innodb_immediate_scrub_data_uncompressed from the freed ranges.
-@param space tablespace which may contain ranges of freed pages
-@param writable whether the tablespace is writable
+MY_ATTRIBUTE((warn_unused_result))
+/** Apply freed_ranges to the file.
+@param writable whether the file is writable
@return number of pages written or hole-punched */
-static uint32_t buf_flush_freed_pages(fil_space_t *space, bool writable)
+uint32_t fil_space_t::flush_freed(bool writable)
{
- const bool punch_hole= space->punch_hole;
+ const bool punch_hole= chain.start->punch_hole == 1;
if (!punch_hole && !srv_immediate_scrub_data_uncompressed)
return 0;
mysql_mutex_assert_not_owner(&buf_pool.flush_list_mutex);
mysql_mutex_assert_not_owner(&buf_pool.mutex);
- space->freed_range_mutex.lock();
- if (space->freed_ranges.empty() ||
- log_sys.get_flushed_lsn() < space->get_last_freed_lsn())
+ freed_range_mutex.lock();
+ if (freed_ranges.empty() || log_sys.get_flushed_lsn() < get_last_freed_lsn())
{
- space->freed_range_mutex.unlock();
+ freed_range_mutex.unlock();
return 0;
}
- const unsigned physical_size{space->physical_size()};
+ const unsigned physical{physical_size()};
- range_set freed_ranges= std::move(space->freed_ranges);
+ range_set freed= std::move(freed_ranges);
uint32_t written= 0;
if (!writable);
else if (punch_hole)
{
- for (const auto &range : freed_ranges)
+ for (const auto &range : freed)
{
written+= range.last - range.first + 1;
- space->reacquire();
- space->io(IORequest(IORequest::PUNCH_RANGE),
- os_offset_t{range.first} * physical_size,
- (range.last - range.first + 1) * physical_size,
- nullptr);
+ reacquire();
+ io(IORequest(IORequest::PUNCH_RANGE),
+ os_offset_t{range.first} * physical,
+ (range.last - range.first + 1) * physical, nullptr);
}
}
else
{
- for (const auto &range : freed_ranges)
+ for (const auto &range : freed)
{
written+= range.last - range.first + 1;
for (os_offset_t i= range.first; i <= range.last; i++)
{
- space->reacquire();
- space->io(IORequest(IORequest::WRITE_ASYNC),
- i * physical_size, physical_size,
- const_cast<byte*>(field_ref_zero));
+ reacquire();
+ io(IORequest(IORequest::WRITE_ASYNC), i * physical, physical,
+ const_cast<byte*>(field_ref_zero));
}
}
}
- space->freed_range_mutex.unlock();
+ freed_range_mutex.unlock();
return written;
}
@@ -1106,28 +1046,37 @@ static uint32_t buf_flush_freed_pages(fil_space_t *space, bool writable)
and also write zeroes or punch the hole for the freed ranges of pages.
@param space tablespace
@param page_id page identifier
+@param bpage buffer page
@param contiguous whether to consider contiguous areas of pages
-@param lru true=buf_pool.LRU; false=buf_pool.flush_list
+@param evict true=buf_pool.LRU; false=buf_pool.flush_list
@param n_flushed number of pages flushed so far in this batch
@param n_to_flush maximum number of pages we are allowed to flush
@return number of pages flushed */
static ulint buf_flush_try_neighbors(fil_space_t *space,
const page_id_t page_id,
- bool contiguous, bool lru,
+ buf_page_t *bpage,
+ bool contiguous, bool evict,
ulint n_flushed, ulint n_to_flush)
{
ut_ad(space->id == page_id.space());
+ ut_ad(bpage->id() == page_id);
ulint count= 0;
page_id_t id= page_id;
- page_id_t high= buf_flush_check_neighbors(*space, id, contiguous, lru);
+ page_id_t high= buf_flush_check_neighbors(*space, id, contiguous, evict);
ut_ad(page_id >= id);
ut_ad(page_id < high);
- for (ulint id_fold= id.fold(); id < high && !space->is_stopping();
- ++id, ++id_fold)
+ for (ulint id_fold= id.fold(); id < high; ++id, ++id_fold)
{
+ if (UNIV_UNLIKELY(space->is_stopping()))
+ {
+ if (bpage)
+ bpage->lock.u_unlock(true);
+ break;
+ }
+
if (count + n_flushed >= n_to_flush)
{
if (id > page_id)
@@ -1138,33 +1087,45 @@ static ulint buf_flush_try_neighbors(fil_space_t *space,
id_fold= id.fold();
}
+ const buf_pool_t::hash_chain &chain= buf_pool.page_hash.cell_get(id_fold);
mysql_mutex_lock(&buf_pool.mutex);
- if (buf_page_t *bpage= buf_pool.page_hash_get_low(id, id_fold))
+ if (buf_page_t *b= buf_pool.page_hash.get(id, chain))
{
- ut_ad(bpage->in_file());
- /* We avoid flushing 'non-old' blocks in an LRU flush,
- because the flushed blocks are soon freed */
- if (!lru || id == page_id || bpage->is_old())
+ ut_ad(b->in_file());
+ if (id == page_id)
{
- if (!buf_pool.watch_is_sentinel(*bpage) &&
- bpage->oldest_modification() > 1 &&
- bpage->ready_for_flush() && buf_flush_page(bpage, lru, space))
+ ut_ad(bpage == b);
+ bpage= nullptr;
+ ut_ad(!buf_pool.watch_is_sentinel(*b));
+ ut_ad(b->oldest_modification() > 1);
+ flush:
+ if (b->flush(evict, space))
{
++count;
continue;
}
}
+ /* We avoid flushing 'non-old' blocks in an eviction flush,
+ because the flushed blocks are soon freed */
+ else if ((!evict || b->is_old()) && !buf_pool.watch_is_sentinel(*b) &&
+ b->oldest_modification() > 1 && b->lock.u_lock_try(true))
+ {
+ if (b->oldest_modification() < 2)
+ b->lock.u_unlock(true);
+ else
+ goto flush;
+ }
}
mysql_mutex_unlock(&buf_pool.mutex);
}
- if (auto n= count - 1)
+ if (count > 1)
{
MONITOR_INC_VALUE_CUMULATIVE(MONITOR_FLUSH_NEIGHBOR_TOTAL_PAGE,
MONITOR_FLUSH_NEIGHBOR_COUNT,
- MONITOR_FLUSH_NEIGHBOR_PAGES, n);
+ MONITOR_FLUSH_NEIGHBOR_PAGES, count - 1);
}
return count;
@@ -1175,12 +1136,8 @@ This utility moves the uncompressed frames of pages to the free list.
Note that this function does not actually flush any data to disk. It
just detaches the uncompressed frames from the compressed pages at the
tail of the unzip_LRU and puts those freed frames in the free list.
-Note that it is a best effort attempt and it is not guaranteed that
-after a call to this function there will be 'max' blocks in the free
-list.
-@param[in] max desired number of blocks in the free_list
@return number of blocks moved to the free list. */
-static ulint buf_free_from_unzip_LRU_list_batch(ulint max)
+static ulint buf_free_from_unzip_LRU_list_batch()
{
ulint scanned = 0;
ulint count = 0;
@@ -1190,7 +1147,6 @@ static ulint buf_free_from_unzip_LRU_list_batch(ulint max)
buf_block_t* block = UT_LIST_GET_LAST(buf_pool.unzip_LRU);
while (block
- && count < max
&& UT_LIST_GET_LEN(buf_pool.free) < srv_LRU_scan_depth
&& UT_LIST_GET_LEN(buf_pool.unzip_LRU)
> UT_LIST_GET_LEN(buf_pool.LRU) / 10) {
@@ -1225,7 +1181,7 @@ static ulint buf_free_from_unzip_LRU_list_batch(ulint max)
static std::pair<fil_space_t*, uint32_t> buf_flush_space(const uint32_t id)
{
if (fil_space_t *space= fil_space_t::get(id))
- return {space, buf_flush_freed_pages(space, true)};
+ return {space, space->flush_freed(true)};
return {nullptr, 0};
}
@@ -1237,47 +1193,36 @@ struct flush_counters_t
ulint evicted;
};
-/** Try to discard a dirty page.
+/** Discard a dirty page, and release buf_pool.flush_list_mutex.
@param bpage dirty page whose tablespace is not accessible */
static void buf_flush_discard_page(buf_page_t *bpage)
{
- mysql_mutex_assert_owner(&buf_pool.mutex);
- mysql_mutex_assert_not_owner(&buf_pool.flush_list_mutex);
ut_ad(bpage->in_file());
ut_ad(bpage->oldest_modification());
- rw_lock_t *rw_lock;
-
- if (bpage->state() != BUF_BLOCK_FILE_PAGE)
- rw_lock= nullptr;
- else
- {
- rw_lock= &reinterpret_cast<buf_block_t*>(bpage)->lock;
- if (!rw_lock_sx_lock_nowait(rw_lock, 0))
- return;
- }
-
- bpage->status= buf_page_t::NORMAL;
- mysql_mutex_lock(&buf_pool.flush_list_mutex);
buf_pool.delete_from_flush_list(bpage);
mysql_mutex_unlock(&buf_pool.flush_list_mutex);
- if (rw_lock)
- rw_lock_sx_unlock(rw_lock);
-
+ ut_d(const auto state= bpage->state());
+ ut_ad(state == buf_page_t::FREED || state == buf_page_t::UNFIXED ||
+ state == buf_page_t::IBUF_EXIST || state == buf_page_t::REINIT);
+ bpage->lock.u_unlock(true);
buf_LRU_free_page(bpage, true);
}
-/** Flush dirty blocks from the end of the LRU list.
-@param max maximum number of blocks to make available in buf_pool.free
-@param n counts of flushed and evicted pages */
-static void buf_flush_LRU_list_batch(ulint max, flush_counters_t *n)
+/** Flush dirty blocks from the end buf_pool.LRU,
+and move clean blocks to buf_pool.free.
+@param max maximum number of blocks to flush
+@param evict whether dirty pages are to be evicted after flushing them
+@param n counts of flushed and evicted pages */
+static void buf_flush_LRU_list_batch(ulint max, bool evict,
+ flush_counters_t *n)
{
ulint scanned= 0;
ulint free_limit= srv_LRU_scan_depth;
mysql_mutex_assert_owner(&buf_pool.mutex);
- if (buf_pool.withdraw_target && buf_pool.curr_size < buf_pool.old_size)
+ if (buf_pool.withdraw_target && buf_pool.is_shrinking())
free_limit+= buf_pool.withdraw_target - UT_LIST_GET_LEN(buf_pool.withdraw);
const auto neighbors= UT_LIST_GET_LEN(buf_pool.LRU) < BUF_LRU_OLD_MIN_LEN
@@ -1290,25 +1235,48 @@ static void buf_flush_LRU_list_batch(ulint max, flush_counters_t *n)
for (buf_page_t *bpage= UT_LIST_GET_LAST(buf_pool.LRU);
bpage &&
((UT_LIST_GET_LEN(buf_pool.LRU) > BUF_LRU_MIN_LEN &&
- UT_LIST_GET_LEN(buf_pool.free) < free_limit &&
- n->flushed + n->evicted < max) ||
- recv_recovery_is_on()); ++scanned)
+ UT_LIST_GET_LEN(buf_pool.free) < free_limit) ||
+ recv_recovery_is_on());
+ ++scanned, bpage= buf_pool.lru_hp.get())
{
buf_page_t *prev= UT_LIST_GET_PREV(LRU, bpage);
- const lsn_t oldest_modification= bpage->oldest_modification();
buf_pool.lru_hp.set(prev);
-
- if (oldest_modification <= 1 && bpage->can_relocate())
- {
- /* block is ready for eviction i.e., it is clean and is not
- IO-fixed or buffer fixed. */
- if (buf_LRU_free_page(bpage, true))
- ++n->evicted;
+ auto state= bpage->state();
+ ut_ad(state >= buf_page_t::FREED);
+ ut_ad(bpage->in_LRU_list);
+
+ switch (bpage->oldest_modification()) {
+ case 0:
+ evict:
+ if (state != buf_page_t::FREED &&
+ (state >= buf_page_t::READ_FIX || (~buf_page_t::LRU_MASK & state)))
+ continue;
+ buf_LRU_free_page(bpage, true);
+ ++n->evicted;
+ /* fall through */
+ case 1:
+ continue;
}
- else if (oldest_modification > 1 && bpage->ready_for_flush())
+
+ if (state < buf_page_t::READ_FIX && bpage->lock.u_lock_try(true))
{
- /* Block is ready for flush. Dispatch an IO request. The IO
- helper thread will put it on free list in IO completion routine. */
+ ut_ad(!bpage->is_io_fixed());
+ bool do_evict= evict;
+ switch (bpage->oldest_modification()) {
+ case 1:
+ mysql_mutex_lock(&buf_pool.flush_list_mutex);
+ buf_pool.delete_from_flush_list(bpage);
+ mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+ /* fall through */
+ case 0:
+ bpage->lock.u_unlock(true);
+ goto evict;
+ case 2:
+ /* LRU flushing will always evict pages of the temporary tablespace. */
+ do_evict= true;
+ }
+ /* Block is ready for flush. Dispatch an IO request.
+ If do_evict, the page may be evicted by buf_page_write_complete(). */
const page_id_t page_id(bpage->id());
const uint32_t space_id= page_id.space();
if (!space || space->id != space_id)
@@ -1325,7 +1293,6 @@ static void buf_flush_LRU_list_batch(ulint max, flush_counters_t *n)
mysql_mutex_lock(&buf_pool.mutex);
if (p.second)
buf_pool.stat.n_pages_written+= p.second;
- goto retry;
}
else
ut_ad(!space);
@@ -1337,16 +1304,25 @@ static void buf_flush_LRU_list_batch(ulint max, flush_counters_t *n)
}
if (!space)
+ {
+ mysql_mutex_lock(&buf_pool.flush_list_mutex);
buf_flush_discard_page(bpage);
+ }
else if (neighbors && space->is_rotational())
{
mysql_mutex_unlock(&buf_pool.mutex);
- n->flushed+= buf_flush_try_neighbors(space, page_id, neighbors == 1,
- true, n->flushed, max);
+ n->flushed+= buf_flush_try_neighbors(space, page_id, bpage,
+ neighbors == 1,
+ do_evict, n->flushed, max);
reacquire_mutex:
mysql_mutex_lock(&buf_pool.mutex);
}
- else if (buf_flush_page(bpage, true, space))
+ else if (n->flushed >= max && !recv_recovery_is_on())
+ {
+ bpage->lock.u_unlock(true);
+ break;
+ }
+ else if (bpage->flush(do_evict, space))
{
++n->flushed;
goto reacquire_mutex;
@@ -1355,8 +1331,6 @@ reacquire_mutex:
else
/* Can't evict or dispatch this block. Go to previous. */
ut_ad(buf_pool.lru_hp.is_hp(prev));
- retry:
- bpage= buf_pool.lru_hp.get();
}
buf_pool.lru_hp.set(nullptr);
@@ -1364,13 +1338,6 @@ reacquire_mutex:
if (space)
space->release();
- /* We keep track of all flushes happening as part of LRU flush. When
- estimating the desired rate at which flush_list should be flushed,
- we factor in this value. */
- buf_lru_flush_page_count+= n->flushed;
-
- mysql_mutex_assert_owner(&buf_pool.mutex);
-
if (scanned)
MONITOR_INC_VALUE_CUMULATIVE(MONITOR_LRU_BATCH_SCANNED,
MONITOR_LRU_BATCH_SCANNED_NUM_CALL,
@@ -1380,27 +1347,21 @@ reacquire_mutex:
/** Flush and move pages from LRU or unzip_LRU list to the free list.
Whether LRU or unzip_LRU is used depends on the state of the system.
-@param max maximum number of blocks to make available in buf_pool.free
-@return number of flushed pages */
-static ulint buf_do_LRU_batch(ulint max)
+@param max maximum number of blocks to flush
+@param evict whether dirty pages are to be evicted after flushing them
+@param n counts of flushed and evicted pages */
+static void buf_do_LRU_batch(ulint max, bool evict, flush_counters_t *n)
{
- const ulint n_unzip_LRU_evicted= buf_LRU_evict_from_unzip_LRU()
- ? buf_free_from_unzip_LRU_list_batch(max)
- : 0;
- flush_counters_t n;
- n.flushed= 0;
- n.evicted= n_unzip_LRU_evicted;
- buf_flush_LRU_list_batch(max, &n);
-
- if (const ulint evicted= n.evicted - n_unzip_LRU_evicted)
- {
- MONITOR_INC_VALUE_CUMULATIVE(MONITOR_LRU_BATCH_EVICT_TOTAL_PAGE,
- MONITOR_LRU_BATCH_EVICT_COUNT,
- MONITOR_LRU_BATCH_EVICT_PAGES,
- evicted);
- }
+ if (buf_LRU_evict_from_unzip_LRU())
+ buf_free_from_unzip_LRU_list_batch();
+ n->evicted= 0;
+ n->flushed= 0;
+ buf_flush_LRU_list_batch(max, evict, n);
- return n.flushed;
+ mysql_mutex_assert_owner(&buf_pool.mutex);
+ buf_lru_freed_page_count+= n->evicted;
+ buf_lru_flush_page_count+= n->flushed;
+ buf_pool.stat.n_pages_written+= n->flushed;
}
/** This utility flushes dirty blocks from the end of the flush_list.
@@ -1414,6 +1375,7 @@ static ulint buf_do_flush_list_batch(ulint max_n, lsn_t lsn)
ulint scanned= 0;
mysql_mutex_assert_owner(&buf_pool.mutex);
+ mysql_mutex_assert_owner(&buf_pool.flush_list_mutex);
const auto neighbors= UT_LIST_GET_LEN(buf_pool.LRU) < BUF_LRU_OLD_MIN_LEN
? 0 : srv_flush_neighbors;
@@ -1424,7 +1386,6 @@ static ulint buf_do_flush_list_batch(ulint max_n, lsn_t lsn)
/* Start from the end of the list looking for a suitable block to be
flushed. */
- mysql_mutex_lock(&buf_pool.flush_list_mutex);
ulint len= UT_LIST_GET_LEN(buf_pool.flush_list);
for (buf_page_t *bpage= UT_LIST_GET_LAST(buf_pool.flush_list);
@@ -1435,33 +1396,42 @@ static ulint buf_do_flush_list_batch(ulint max_n, lsn_t lsn)
break;
ut_ad(bpage->in_file());
- buf_page_t *prev= UT_LIST_GET_PREV(list, bpage);
-
- if (oldest_modification == 1)
{
- buf_pool.delete_from_flush_list(bpage);
- skip:
- bpage= prev;
- continue;
- }
+ buf_page_t *prev= UT_LIST_GET_PREV(list, bpage);
- ut_ad(oldest_modification > 2);
- ut_ad(bpage->in_file());
+ if (oldest_modification == 1)
+ {
+ clear:
+ buf_pool.delete_from_flush_list(bpage);
+ skip:
+ bpage= prev;
+ continue;
+ }
- if (!bpage->ready_for_flush())
- goto skip;
+ ut_ad(oldest_modification > 2);
- /* In order not to degenerate this scan to O(n*n) we attempt to
- preserve the pointer position. Any thread that would remove 'prev'
- from buf_pool.flush_list must adjust the hazard pointer.
+ if (!bpage->lock.u_lock_try(true))
+ goto skip;
- Note: A concurrent execution of buf_flush_list_space() may
- terminate this scan prematurely. The buf_pool.n_flush_list()
- should prevent multiple threads from executing
- buf_do_flush_list_batch() concurrently,
- but buf_flush_list_space() is ignoring that. */
- buf_pool.flush_hp.set(prev);
- mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+ ut_ad(!bpage->is_io_fixed());
+
+ if (bpage->oldest_modification() == 1)
+ {
+ bpage->lock.u_unlock(true);
+ goto clear;
+ }
+
+ /* In order not to degenerate this scan to O(n*n) we attempt to
+ preserve the pointer position. Any thread that would remove 'prev'
+ from buf_pool.flush_list must adjust the hazard pointer.
+
+ Note: A concurrent execution of buf_flush_list_space() may
+ terminate this scan prematurely. The buf_pool.flush_list_active
+ should prevent multiple threads from executing
+ buf_do_flush_list_batch() concurrently,
+ but buf_flush_list_space() is ignoring that. */
+ buf_pool.flush_hp.set(prev);
+ }
const page_id_t page_id(bpage->id());
const uint32_t space_id= page_id.space();
@@ -1469,8 +1439,6 @@ static ulint buf_do_flush_list_batch(ulint max_n, lsn_t lsn)
{
if (last_space_id != space_id)
{
- mysql_mutex_lock(&buf_pool.flush_list_mutex);
- buf_pool.flush_hp.set(bpage);
mysql_mutex_unlock(&buf_pool.flush_list_mutex);
mysql_mutex_unlock(&buf_pool.mutex);
if (space)
@@ -1479,18 +1447,8 @@ static ulint buf_do_flush_list_batch(ulint max_n, lsn_t lsn)
space= p.first;
last_space_id= space_id;
mysql_mutex_lock(&buf_pool.mutex);
- if (p.second)
- buf_pool.stat.n_pages_written+= p.second;
+ buf_pool.stat.n_pages_written+= p.second;
mysql_mutex_lock(&buf_pool.flush_list_mutex);
- bpage= buf_pool.flush_hp.get();
- if (!bpage)
- break;
- if (bpage->id() != page_id)
- continue;
- buf_pool.flush_hp.set(UT_LIST_GET_PREV(list, bpage));
- if (bpage->oldest_modification() <= 1 || !bpage->ready_for_flush())
- goto next;
- mysql_mutex_unlock(&buf_pool.flush_list_mutex);
}
else
ut_ad(!space);
@@ -1503,27 +1461,29 @@ static ulint buf_do_flush_list_batch(ulint max_n, lsn_t lsn)
if (!space)
buf_flush_discard_page(bpage);
- else if (neighbors && space->is_rotational())
- {
- mysql_mutex_unlock(&buf_pool.mutex);
- count+= buf_flush_try_neighbors(space, page_id, neighbors == 1,
- false, count, max_n);
- reacquire_mutex:
- mysql_mutex_lock(&buf_pool.mutex);
- }
- else if (buf_flush_page(bpage, false, space))
+ else
{
- ++count;
- goto reacquire_mutex;
+ mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+ if (neighbors && space->is_rotational())
+ {
+ mysql_mutex_unlock(&buf_pool.mutex);
+ count+= buf_flush_try_neighbors(space, page_id, bpage, neighbors == 1,
+ false, count, max_n);
+ reacquire_mutex:
+ mysql_mutex_lock(&buf_pool.mutex);
+ }
+ else if (bpage->flush(false, space))
+ {
+ ++count;
+ goto reacquire_mutex;
+ }
}
mysql_mutex_lock(&buf_pool.flush_list_mutex);
- next:
bpage= buf_pool.flush_hp.get();
}
buf_pool.flush_hp.set(nullptr);
- mysql_mutex_unlock(&buf_pool.flush_list_mutex);
if (space)
space->release();
@@ -1533,76 +1493,86 @@ static ulint buf_do_flush_list_batch(ulint max_n, lsn_t lsn)
MONITOR_FLUSH_BATCH_SCANNED_NUM_CALL,
MONITOR_FLUSH_BATCH_SCANNED_PER_CALL,
scanned);
- if (count)
- MONITOR_INC_VALUE_CUMULATIVE(MONITOR_FLUSH_BATCH_TOTAL_PAGE,
- MONITOR_FLUSH_BATCH_COUNT,
- MONITOR_FLUSH_BATCH_PAGES,
- count);
- mysql_mutex_assert_owner(&buf_pool.mutex);
return count;
}
-/** Wait until a flush batch ends.
-@param lru true=buf_pool.LRU; false=buf_pool.flush_list */
-void buf_flush_wait_batch_end(bool lru)
+/** Wait until a LRU flush batch ends. */
+void buf_flush_wait_LRU_batch_end()
{
- const auto &n_flush= lru ? buf_pool.n_flush_LRU_ : buf_pool.n_flush_list_;
+ mysql_mutex_assert_owner(&buf_pool.flush_list_mutex);
+ mysql_mutex_assert_not_owner(&buf_pool.mutex);
- if (n_flush)
+ if (buf_pool.n_flush())
{
- auto cond= lru ? &buf_pool.done_flush_LRU : &buf_pool.done_flush_list;
tpool::tpool_wait_begin();
thd_wait_begin(nullptr, THD_WAIT_DISKIO);
do
- my_cond_wait(cond, &buf_pool.mutex.m_mutex);
- while (n_flush);
+ my_cond_wait(&buf_pool.done_flush_LRU,
+ &buf_pool.flush_list_mutex.m_mutex);
+ while (buf_pool.n_flush());
tpool::tpool_wait_end();
thd_wait_end(nullptr);
- pthread_cond_broadcast(cond);
}
}
/** Write out dirty blocks from buf_pool.flush_list.
+The caller must invoke buf_dblwr.flush_buffered_writes()
+after releasing buf_pool.mutex.
+@param max_n    wished maximum number of blocks flushed
@param lsn buf_pool.get_oldest_modification(LSN_MAX) target
@return the number of processed pages
@retval 0 if a buf_pool.flush_list batch is already running */
-static ulint buf_flush_list(ulint max_n= ULINT_UNDEFINED, lsn_t lsn= LSN_MAX)
+static ulint buf_flush_list_holding_mutex(ulint max_n= ULINT_UNDEFINED,
+ lsn_t lsn= LSN_MAX)
{
ut_ad(lsn);
+ mysql_mutex_assert_owner(&buf_pool.mutex);
- if (buf_pool.n_flush_list())
- return 0;
-
- mysql_mutex_lock(&buf_pool.mutex);
- const bool running= buf_pool.n_flush_list_ != 0;
- /* FIXME: we are performing a dirty read of buf_pool.flush_list.count
- while not holding buf_pool.flush_list_mutex */
- if (running || !UT_LIST_GET_LEN(buf_pool.flush_list))
+ mysql_mutex_lock(&buf_pool.flush_list_mutex);
+ if (buf_pool.flush_list_active())
{
- if (!running)
- pthread_cond_broadcast(&buf_pool.done_flush_list);
- mysql_mutex_unlock(&buf_pool.mutex);
+nothing_to_do:
+ mysql_mutex_unlock(&buf_pool.flush_list_mutex);
return 0;
}
-
- buf_pool.n_flush_list_++;
- const ulint n_flushed= buf_do_flush_list_batch(max_n, lsn);
- const ulint n_flushing= --buf_pool.n_flush_list_;
-
- buf_pool.try_LRU_scan= true;
-
- mysql_mutex_unlock(&buf_pool.mutex);
-
- if (!n_flushing)
+ if (!buf_pool.get_oldest_modification(0))
+ {
pthread_cond_broadcast(&buf_pool.done_flush_list);
+ goto nothing_to_do;
+ }
+ buf_pool.flush_list_set_active();
+ const ulint n_flushed= buf_do_flush_list_batch(max_n, lsn);
+ if (n_flushed)
+ buf_pool.stat.n_pages_written+= n_flushed;
+ buf_pool.flush_list_set_inactive();
+ pthread_cond_broadcast(&buf_pool.done_flush_list);
+ mysql_mutex_unlock(&buf_pool.flush_list_mutex);
- buf_dblwr.flush_buffered_writes();
+ if (n_flushed)
+ MONITOR_INC_VALUE_CUMULATIVE(MONITOR_FLUSH_BATCH_TOTAL_PAGE,
+ MONITOR_FLUSH_BATCH_COUNT,
+ MONITOR_FLUSH_BATCH_PAGES,
+ n_flushed);
DBUG_PRINT("ib_buf", ("flush_list completed, " ULINTPF " pages", n_flushed));
return n_flushed;
}
+/** Write out dirty blocks from buf_pool.flush_list.
+@param max_n    wished maximum number of blocks flushed
+@param lsn buf_pool.get_oldest_modification(LSN_MAX) target
+@return the number of processed pages
+@retval 0 if a buf_pool.flush_list batch is already running */
+static ulint buf_flush_list(ulint max_n= ULINT_UNDEFINED,
+ lsn_t lsn= LSN_MAX)
+{
+ mysql_mutex_lock(&buf_pool.mutex);
+ ulint n= buf_flush_list_holding_mutex(max_n, lsn);
+ mysql_mutex_unlock(&buf_pool.mutex);
+ buf_dblwr.flush_buffered_writes();
+ return n;
+}
+
/** Try to flush all the dirty pages that belong to a given tablespace.
@param space tablespace
@param n_flushed number of pages written
@@ -1614,10 +1584,11 @@ bool buf_flush_list_space(fil_space_t *space, ulint *n_flushed)
bool may_have_skipped= false;
ulint max_n_flush= srv_io_capacity;
+ ulint n_flush= 0;
bool acquired= space->acquire();
{
- const uint32_t written{buf_flush_freed_pages(space, acquired)};
+ const uint32_t written{space->flush_freed(acquired)};
mysql_mutex_lock(&buf_pool.mutex);
if (written)
buf_pool.stat.n_pages_written+= written;
@@ -1626,18 +1597,21 @@ bool buf_flush_list_space(fil_space_t *space, ulint *n_flushed)
for (buf_page_t *bpage= UT_LIST_GET_LAST(buf_pool.flush_list); bpage; )
{
- ut_d(const auto s= bpage->state());
- ut_ad(s == BUF_BLOCK_ZIP_PAGE || s == BUF_BLOCK_FILE_PAGE ||
- s == BUF_BLOCK_REMOVE_HASH);
ut_ad(bpage->oldest_modification());
ut_ad(bpage->in_file());
buf_page_t *prev= UT_LIST_GET_PREV(list, bpage);
- if (bpage->id().space() != space_id);
- else if (bpage->oldest_modification() == 1)
+ if (bpage->oldest_modification() == 1)
+ clear:
buf_pool.delete_from_flush_list(bpage);
- else if (!bpage->ready_for_flush())
+ else if (bpage->id().space() != space_id);
+ else if (!bpage->lock.u_lock_try(true))
may_have_skipped= true;
+ else if (bpage->oldest_modification() == 1)
+ {
+ bpage->lock.u_unlock(true);
+ goto clear;
+ }
else
{
/* In order not to degenerate this scan to O(n*n) we attempt to
@@ -1649,13 +1623,10 @@ bool buf_flush_list_space(fil_space_t *space, ulint *n_flushed)
concurrently. This may terminate our iteration prematurely,
leading us to return may_have_skipped=true. */
buf_pool.flush_hp.set(prev);
- mysql_mutex_unlock(&buf_pool.flush_list_mutex);
if (!acquired)
- {
was_freed:
buf_flush_discard_page(bpage);
- }
else
{
if (space->is_stopping())
@@ -1664,28 +1635,24 @@ bool buf_flush_list_space(fil_space_t *space, ulint *n_flushed)
acquired= false;
goto was_freed;
}
- if (!buf_flush_page(bpage, false, space))
- {
- may_have_skipped= true;
- mysql_mutex_lock(&buf_pool.flush_list_mutex);
- goto next_after_skip;
- }
- if (n_flushed)
- ++*n_flushed;
- if (!--max_n_flush)
+ mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+ if (bpage->flush(false, space))
{
+ ++n_flush;
+ if (!--max_n_flush)
+ {
+ mysql_mutex_lock(&buf_pool.mutex);
+ mysql_mutex_lock(&buf_pool.flush_list_mutex);
+ may_have_skipped= true;
+ goto done;
+ }
mysql_mutex_lock(&buf_pool.mutex);
- mysql_mutex_lock(&buf_pool.flush_list_mutex);
- may_have_skipped= true;
- break;
}
- mysql_mutex_lock(&buf_pool.mutex);
}
mysql_mutex_lock(&buf_pool.flush_list_mutex);
if (!buf_pool.flush_hp.is_hp(prev))
may_have_skipped= true;
- next_after_skip:
bpage= buf_pool.flush_hp.get();
continue;
}
@@ -1698,14 +1665,19 @@ bool buf_flush_list_space(fil_space_t *space, ulint *n_flushed)
buf_flush_list_space(). We should always return true from
buf_flush_list_space() if that should be the case; in
buf_do_flush_list_batch() we will simply perform less work. */
-
+done:
buf_pool.flush_hp.set(nullptr);
mysql_mutex_unlock(&buf_pool.flush_list_mutex);
buf_pool.try_LRU_scan= true;
+ pthread_cond_broadcast(&buf_pool.done_free);
+ buf_pool.stat.n_pages_written+= n_flush;
mysql_mutex_unlock(&buf_pool.mutex);
+ if (n_flushed)
+ *n_flushed= n_flush;
+
if (acquired)
space->release();
@@ -1717,43 +1689,32 @@ bool buf_flush_list_space(fil_space_t *space, ulint *n_flushed)
return may_have_skipped;
}
-/** Write out dirty blocks from buf_pool.LRU.
+/** Write out dirty blocks from buf_pool.LRU,
+and move clean blocks to buf_pool.free.
+The caller must invoke buf_dblwr.flush_buffered_writes()
+after releasing buf_pool.mutex.
@param max_n wished maximum mumber of blocks flushed
-@return the number of processed pages
+@param evict whether to evict pages after flushing
+@return evict ? number of processed pages : number of pages written
@retval 0 if a buf_pool.LRU batch is already running */
-ulint buf_flush_LRU(ulint max_n)
+ulint buf_flush_LRU(ulint max_n, bool evict)
{
- if (buf_pool.n_flush_LRU())
- return 0;
-
- log_buffer_flush_to_disk(true);
-
- mysql_mutex_lock(&buf_pool.mutex);
- if (buf_pool.n_flush_LRU_)
- {
- mysql_mutex_unlock(&buf_pool.mutex);
- return 0;
- }
- buf_pool.n_flush_LRU_++;
-
- ulint n_flushed= buf_do_LRU_batch(max_n);
-
- const ulint n_flushing= --buf_pool.n_flush_LRU_;
+ mysql_mutex_assert_owner(&buf_pool.mutex);
- buf_pool.try_LRU_scan= true;
+ flush_counters_t n;
+ buf_do_LRU_batch(max_n, evict, &n);
- mysql_mutex_unlock(&buf_pool.mutex);
+ ulint pages= n.flushed;
- if (!n_flushing)
+ if (n.evicted)
{
- pthread_cond_broadcast(&buf_pool.done_flush_LRU);
- pthread_cond_signal(&buf_pool.done_free);
+ if (evict)
+ pages+= n.evicted;
+ buf_pool.try_LRU_scan= true;
+ pthread_cond_broadcast(&buf_pool.done_free);
}
- buf_dblwr.flush_buffered_writes();
-
- DBUG_PRINT("ib_buf", ("LRU flush completed, " ULINTPF " pages", n_flushed));
- return n_flushed;
+ return pages;
}
/** Initiate a log checkpoint, discarding the start of the log.
@@ -1885,9 +1846,14 @@ static void buf_flush_wait(lsn_t lsn)
buf_flush_sync_lsn= lsn;
buf_pool.page_cleaner_set_idle(false);
pthread_cond_signal(&buf_pool.do_flush_list);
+ my_cond_wait(&buf_pool.done_flush_list,
+ &buf_pool.flush_list_mutex.m_mutex);
+ if (buf_pool.get_oldest_modification(lsn) >= lsn)
+ break;
}
- my_cond_wait(&buf_pool.done_flush_list,
- &buf_pool.flush_list_mutex.m_mutex);
+ mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+ os_aio_wait_until_no_pending_writes();
+ mysql_mutex_lock(&buf_pool.flush_list_mutex);
}
}
@@ -1908,6 +1874,7 @@ ATTRIBUTE_COLD void buf_flush_wait_flushed(lsn_t sync_lsn)
if (buf_pool.get_oldest_modification(sync_lsn) < sync_lsn)
{
MONITOR_INC(MONITOR_FLUSH_SYNC_WAITS);
+
#if 1 /* FIXME: remove this, and guarantee that the page cleaner serves us */
if (UNIV_UNLIKELY(!buf_page_cleaner_is_active))
{
@@ -1915,26 +1882,22 @@ ATTRIBUTE_COLD void buf_flush_wait_flushed(lsn_t sync_lsn)
{
mysql_mutex_unlock(&buf_pool.flush_list_mutex);
ulint n_pages= buf_flush_list(srv_max_io_capacity, sync_lsn);
- buf_flush_wait_batch_end_acquiring_mutex(false);
if (n_pages)
{
MONITOR_INC_VALUE_CUMULATIVE(MONITOR_FLUSH_SYNC_TOTAL_PAGE,
MONITOR_FLUSH_SYNC_COUNT,
MONITOR_FLUSH_SYNC_PAGES, n_pages);
}
+ os_aio_wait_until_no_pending_writes();
mysql_mutex_lock(&buf_pool.flush_list_mutex);
}
while (buf_pool.get_oldest_modification(sync_lsn) < sync_lsn);
}
else
#endif
- {
- thd_wait_begin(nullptr, THD_WAIT_DISKIO);
- tpool::tpool_wait_begin();
buf_flush_wait(sync_lsn);
- tpool::tpool_wait_end();
- thd_wait_end(nullptr);
- }
+
+ thd_wait_end(nullptr);
}
mysql_mutex_unlock(&buf_pool.flush_list_mutex);
@@ -1981,28 +1944,16 @@ ATTRIBUTE_COLD void buf_flush_ahead(lsn_t lsn, bool furious)
}
}
-/** Wait for pending flushes to complete. */
-void buf_flush_wait_batch_end_acquiring_mutex(bool lru)
-{
- if (lru ? buf_pool.n_flush_LRU() : buf_pool.n_flush_list())
- {
- mysql_mutex_lock(&buf_pool.mutex);
- buf_flush_wait_batch_end(lru);
- mysql_mutex_unlock(&buf_pool.mutex);
- }
-}
-
/** Conduct checkpoint-related flushing for innodb_flush_sync=ON,
and try to initiate checkpoints until the target is met.
@param lsn minimum value of buf_pool.get_oldest_modification(LSN_MAX) */
ATTRIBUTE_COLD static void buf_flush_sync_for_checkpoint(lsn_t lsn)
{
ut_ad(!srv_read_only_mode);
+ mysql_mutex_assert_not_owner(&buf_pool.flush_list_mutex);
for (;;)
{
- mysql_mutex_unlock(&buf_pool.flush_list_mutex);
-
if (ulint n_flushed= buf_flush_list(srv_max_io_capacity, lsn))
{
MONITOR_INC_VALUE_CUMULATIVE(MONITOR_FLUSH_SYNC_TOTAL_PAGE,
@@ -2053,6 +2004,7 @@ ATTRIBUTE_COLD static void buf_flush_sync_for_checkpoint(lsn_t lsn)
/* wake up buf_flush_wait() */
pthread_cond_broadcast(&buf_pool.done_flush_list);
+ mysql_mutex_unlock(&buf_pool.flush_list_mutex);
lsn= std::max(lsn, target);
@@ -2103,8 +2055,9 @@ af_get_pct_for_lsn(
/ 7.5));
}
-/** This function is called approximately once every second by the
-page_cleaner thread if innodb_adaptive_flushing=ON.
+/** This function is called approximately once every second by
+buf_flush_page_cleaner() if innodb_max_dirty_pages_pct_lwm>0
+and innodb_adaptive_flushing=ON.
Based on various factors it decides if there is a need to do flushing.
@return number of pages recommended to be flushed
@param last_pages_in number of pages flushed in previous batch
@@ -2142,52 +2095,43 @@ static ulint page_cleaner_flush_pages_recommendation(ulint last_pages_in,
n_pages= std::min<ulint>(srv_io_capacity, dirty_blocks);
}
+func_exit:
+ page_cleaner.flush_pass++;
return n_pages;
}
sum_pages += last_pages_in;
- double time_elapsed = difftime(curr_time, prev_time);
+ const ulint time_elapsed = std::max<ulint>(curr_time - prev_time, 1);
- /* We update our variables every srv_flushing_avg_loops
+ /* We update our variables every innodb_flushing_avg_loops
iterations to smooth out transition in workload. */
if (++n_iterations >= srv_flushing_avg_loops
- || time_elapsed >= static_cast<double>(srv_flushing_avg_loops)) {
-
- if (time_elapsed < 1) {
- time_elapsed = 1;
- }
+ || time_elapsed >= srv_flushing_avg_loops) {
- avg_page_rate = static_cast<ulint>(
- ((static_cast<double>(sum_pages)
- / time_elapsed)
- + static_cast<double>(avg_page_rate)) / 2);
+ avg_page_rate = (sum_pages / time_elapsed + avg_page_rate) / 2;
/* How much LSN we have generated since last call. */
- lsn_rate = static_cast<lsn_t>(
- static_cast<double>(cur_lsn - prev_lsn)
- / time_elapsed);
+ lsn_rate = (cur_lsn - prev_lsn) / time_elapsed;
lsn_avg_rate = (lsn_avg_rate + lsn_rate) / 2;
- ulint flush_tm = page_cleaner.flush_time;
- ulint flush_pass = page_cleaner.flush_pass;
-
- page_cleaner.flush_time = 0;
- page_cleaner.flush_pass = 0;
-
- if (flush_pass) {
- flush_tm /= flush_pass;
+ if (page_cleaner.flush_pass) {
+ page_cleaner.flush_time /= page_cleaner.flush_pass;
}
- MONITOR_SET(MONITOR_FLUSH_ADAPTIVE_AVG_TIME, flush_tm);
- MONITOR_SET(MONITOR_FLUSH_ADAPTIVE_AVG_PASS, flush_pass);
-
prev_lsn = cur_lsn;
prev_time = curr_time;
- n_iterations = 0;
+ MONITOR_SET(MONITOR_FLUSH_ADAPTIVE_AVG_TIME,
+ page_cleaner.flush_time);
+ MONITOR_SET(MONITOR_FLUSH_ADAPTIVE_AVG_PASS,
+ page_cleaner.flush_pass);
+
+ page_cleaner.flush_time = 0;
+ page_cleaner.flush_pass = 0;
+ n_iterations = 0;
sum_pages = 0;
}
@@ -2237,14 +2181,12 @@ static ulint page_cleaner_flush_pages_recommendation(ulint last_pages_in,
MONITOR_SET(MONITOR_FLUSH_PCT_FOR_DIRTY, pct_for_dirty);
MONITOR_SET(MONITOR_FLUSH_PCT_FOR_LSN, pct_for_lsn);
- return(n_pages);
+ goto func_exit;
}
-/******************************************************************//**
-page_cleaner thread tasked with flushing dirty pages from the buffer
-pools. As of now we'll have only one coordinator.
-@return a dummy parameter */
-static os_thread_ret_t DECLARE_THREAD(buf_flush_page_cleaner)(void*)
+/** page_cleaner thread tasked with flushing dirty pages from the buffer
+pools. As of now we'll have only one coordinator. */
+static void buf_flush_page_cleaner()
{
my_thread_init();
#ifdef UNIV_PFS_THREAD
@@ -2257,8 +2199,6 @@ static os_thread_ret_t DECLARE_THREAD(buf_flush_page_cleaner)(void*)
timespec abstime;
set_timespec(abstime, 1);
- mysql_mutex_lock(&buf_pool.flush_list_mutex);
-
lsn_t lsn_limit;
ulint last_activity_count= srv_get_activity_count();
@@ -2266,44 +2206,34 @@ static os_thread_ret_t DECLARE_THREAD(buf_flush_page_cleaner)(void*)
{
lsn_limit= buf_flush_sync_lsn;
- if (UNIV_UNLIKELY(lsn_limit != 0))
+ if (UNIV_UNLIKELY(lsn_limit != 0) && UNIV_LIKELY(srv_flush_sync))
{
-furious_flush:
- if (UNIV_LIKELY(srv_flush_sync))
- {
- buf_flush_sync_for_checkpoint(lsn_limit);
- last_pages= 0;
- set_timespec(abstime, 1);
- continue;
- }
+ furious_flush:
+ buf_flush_sync_for_checkpoint(lsn_limit);
+ last_pages= 0;
+ set_timespec(abstime, 1);
+ continue;
}
+
+ mysql_mutex_lock(&buf_pool.flush_list_mutex);
+ if (buf_pool.ran_out())
+ goto no_wait;
else if (srv_shutdown_state > SRV_SHUTDOWN_INITIATED)
break;
- /* If buf pager cleaner is idle and there is no work
- (either dirty pages are all flushed or adaptive flushing
- is not enabled) then opt for non-timed wait */
if (buf_pool.page_cleaner_idle() &&
(!UT_LIST_GET_LEN(buf_pool.flush_list) ||
srv_max_dirty_pages_pct_lwm == 0.0))
- my_cond_wait(&buf_pool.do_flush_list, &buf_pool.flush_list_mutex.m_mutex);
+ /* We are idle; wait for buf_pool.page_cleaner_wakeup() */
+ my_cond_wait(&buf_pool.do_flush_list,
+ &buf_pool.flush_list_mutex.m_mutex);
else
my_cond_timedwait(&buf_pool.do_flush_list,
&buf_pool.flush_list_mutex.m_mutex, &abstime);
-
+ no_wait:
set_timespec(abstime, 1);
- lsn_t soft_lsn_limit= buf_flush_async_lsn;
lsn_limit= buf_flush_sync_lsn;
-
- if (UNIV_UNLIKELY(lsn_limit != 0))
- {
- if (UNIV_LIKELY(srv_flush_sync))
- goto furious_flush;
- }
- else if (srv_shutdown_state > SRV_SHUTDOWN_INITIATED)
- break;
-
const lsn_t oldest_lsn= buf_pool.get_oldest_modification(0);
if (!oldest_lsn)
@@ -2314,120 +2244,150 @@ furious_flush:
/* wake up buf_flush_wait() */
pthread_cond_broadcast(&buf_pool.done_flush_list);
}
-unemployed:
+ unemployed:
buf_flush_async_lsn= 0;
+ set_idle:
buf_pool.page_cleaner_set_idle(true);
+ if (UNIV_UNLIKELY(srv_shutdown_state > SRV_SHUTDOWN_INITIATED))
+ break;
+ mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+ end_of_batch:
+ buf_dblwr.flush_buffered_writes();
+
+ do
+ {
+ DBUG_EXECUTE_IF("ib_log_checkpoint_avoid", continue;);
+ DBUG_EXECUTE_IF("ib_log_checkpoint_avoid_hard", continue;);
+
+ if (!recv_recovery_is_on() &&
+ !srv_startup_is_before_trx_rollback_phase &&
+ srv_operation <= SRV_OPERATION_EXPORT_RESTORED)
+ log_checkpoint();
+ }
+ while (false);
+
+ if (!buf_pool.ran_out())
+ continue;
+ mysql_mutex_lock(&buf_pool.flush_list_mutex);
+ }
- DBUG_EXECUTE_IF("ib_log_checkpoint_avoid", continue;);
- DBUG_EXECUTE_IF("ib_log_checkpoint_avoid_hard", continue;);
+ lsn_t soft_lsn_limit= buf_flush_async_lsn;
+ if (UNIV_UNLIKELY(lsn_limit != 0))
+ {
+ if (srv_flush_sync)
+ goto do_furious_flush;
+ if (oldest_lsn >= lsn_limit)
+ {
+ buf_flush_sync_lsn= 0;
+ pthread_cond_broadcast(&buf_pool.done_flush_list);
+ }
+ else if (lsn_limit > soft_lsn_limit)
+ soft_lsn_limit= lsn_limit;
+ }
+
+ bool idle_flush= false;
+ ulint n_flushed= 0, n;
+
+ if (UNIV_UNLIKELY(soft_lsn_limit != 0))
+ {
+ if (oldest_lsn >= soft_lsn_limit)
+ buf_flush_async_lsn= soft_lsn_limit= 0;
+ }
+ else if (buf_pool.ran_out())
+ {
+ buf_pool.page_cleaner_set_idle(false);
mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+ n= srv_max_io_capacity;
+ mysql_mutex_lock(&buf_pool.mutex);
+ LRU_flush:
+ n= buf_flush_LRU(n, false);
+ mysql_mutex_unlock(&buf_pool.mutex);
+ last_pages+= n;
- if (!recv_recovery_is_on() &&
- !srv_startup_is_before_trx_rollback_phase &&
- srv_operation <= SRV_OPERATION_EXPORT_RESTORED)
- log_checkpoint();
+ if (!idle_flush)
+ goto end_of_batch;
+      /* When idle flushing kicks in, page_cleaner is marked active.
+      Reset it back to idle, since it was made active only as part of
+      the idle flushing stage. */
mysql_mutex_lock(&buf_pool.flush_list_mutex);
- continue;
+ goto set_idle;
}
+ else if (UNIV_UNLIKELY(srv_shutdown_state > SRV_SHUTDOWN_INITIATED))
+ break;
const ulint dirty_blocks= UT_LIST_GET_LEN(buf_pool.flush_list);
- ut_ad(dirty_blocks);
/* We perform dirty reads of the LRU+free list lengths here.
Division by zero is not possible, because buf_pool.flush_list is
guaranteed to be nonempty, and it is a subset of buf_pool.LRU. */
const double dirty_pct= double(dirty_blocks) * 100.0 /
double(UT_LIST_GET_LEN(buf_pool.LRU) + UT_LIST_GET_LEN(buf_pool.free));
-
- bool idle_flush= false;
-
- if (lsn_limit || soft_lsn_limit);
- else if (af_needed_for_redo(oldest_lsn));
- else if (srv_max_dirty_pages_pct_lwm != 0.0)
+ if (srv_max_dirty_pages_pct_lwm != 0.0)
{
const ulint activity_count= srv_get_activity_count();
if (activity_count != last_activity_count)
+ {
last_activity_count= activity_count;
- else if (buf_pool.page_cleaner_idle() && buf_pool.n_pend_reads == 0)
+ goto maybe_unemployed;
+ }
+ else if (buf_pool.page_cleaner_idle() && !os_aio_pending_reads())
{
- /* reaching here means 3 things:
- - last_activity_count == activity_count: suggesting server is idle
- (no trx_t::commit activity)
- - page cleaner is idle (dirty_pct < srv_max_dirty_pages_pct_lwm)
- - there are no pending reads but there are dirty pages to flush */
- idle_flush= true;
+ /* reaching here means 3 things:
+ - last_activity_count == activity_count: suggesting server is idle
+ (no trx_t::commit() activity)
+ - page cleaner is idle (dirty_pct < srv_max_dirty_pages_pct_lwm)
+ - there are no pending reads but there are dirty pages to flush */
buf_pool.update_last_activity_count(activity_count);
+ mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+ idle_flush= true;
+ goto idle_flush;
}
-
- if (!idle_flush && dirty_pct < srv_max_dirty_pages_pct_lwm)
- goto unemployed;
+ else
+ maybe_unemployed:
+ if (dirty_pct < srv_max_dirty_pages_pct_lwm)
+ goto possibly_unemployed;
}
else if (dirty_pct < srv_max_buf_pool_modified_pct)
- goto unemployed;
-
- if (UNIV_UNLIKELY(lsn_limit != 0) && oldest_lsn >= lsn_limit)
- lsn_limit= buf_flush_sync_lsn= 0;
- if (UNIV_UNLIKELY(soft_lsn_limit != 0) && oldest_lsn >= soft_lsn_limit)
- soft_lsn_limit= buf_flush_async_lsn= 0;
+ possibly_unemployed:
+ if (!soft_lsn_limit && !af_needed_for_redo(oldest_lsn))
+ goto unemployed;
buf_pool.page_cleaner_set_idle(false);
mysql_mutex_unlock(&buf_pool.flush_list_mutex);
- if (!lsn_limit)
- lsn_limit= soft_lsn_limit;
-
- ulint n_flushed;
-
- if (UNIV_UNLIKELY(lsn_limit != 0))
+ if (UNIV_UNLIKELY(soft_lsn_limit != 0))
{
- n_flushed= buf_flush_list(srv_max_io_capacity, lsn_limit);
- /* wake up buf_flush_wait() */
- pthread_cond_broadcast(&buf_pool.done_flush_list);
- goto try_checkpoint;
+ n= srv_max_io_capacity;
+ goto background_flush;
}
- else if (idle_flush || !srv_adaptive_flushing)
+
+ if (!srv_adaptive_flushing)
{
- n_flushed= buf_flush_list(srv_io_capacity);
-try_checkpoint:
- if (n_flushed)
- {
- MONITOR_INC_VALUE_CUMULATIVE(MONITOR_FLUSH_BACKGROUND_TOTAL_PAGE,
- MONITOR_FLUSH_BACKGROUND_COUNT,
- MONITOR_FLUSH_BACKGROUND_PAGES,
- n_flushed);
-do_checkpoint:
- /* The periodic log_checkpoint() call here makes it harder to
- reproduce bugs in crash recovery or mariabackup --prepare, or
- in code that writes the redo log records. Omitting the call
- here should not affect correctness, because log_free_check()
- should still be invoking checkpoints when needed. */
- DBUG_EXECUTE_IF("ib_log_checkpoint_avoid", goto next;);
- DBUG_EXECUTE_IF("ib_log_checkpoint_avoid_hard", goto next;);
-
- if (!recv_recovery_is_on()
- && srv_operation <= SRV_OPERATION_EXPORT_RESTORED)
- log_checkpoint();
- }
+ idle_flush:
+ n= srv_io_capacity;
+ soft_lsn_limit= LSN_MAX;
+ background_flush:
+ mysql_mutex_lock(&buf_pool.mutex);
+ n_flushed= buf_flush_list_holding_mutex(n, soft_lsn_limit);
+ MONITOR_INC_VALUE_CUMULATIVE(MONITOR_FLUSH_BACKGROUND_TOTAL_PAGE,
+ MONITOR_FLUSH_BACKGROUND_COUNT,
+ MONITOR_FLUSH_BACKGROUND_PAGES,
+ n_flushed);
}
- else if (ulint n= page_cleaner_flush_pages_recommendation(last_pages,
- oldest_lsn,
- dirty_blocks,
- dirty_pct))
+ else if ((n= page_cleaner_flush_pages_recommendation(last_pages,
+ oldest_lsn,
+ dirty_blocks,
+ dirty_pct)) != 0)
{
- page_cleaner.flush_pass++;
const ulint tm= ut_time_ms();
- last_pages= n_flushed= buf_flush_list(n);
+ mysql_mutex_lock(&buf_pool.mutex);
+ last_pages= n_flushed= buf_flush_list_holding_mutex(n);
page_cleaner.flush_time+= ut_time_ms() - tm;
-
- if (n_flushed)
- {
- MONITOR_INC_VALUE_CUMULATIVE(MONITOR_FLUSH_ADAPTIVE_TOTAL_PAGE,
- MONITOR_FLUSH_ADAPTIVE_COUNT,
- MONITOR_FLUSH_ADAPTIVE_PAGES,
- n_flushed);
- goto do_checkpoint;
- }
+ MONITOR_INC_VALUE_CUMULATIVE(MONITOR_FLUSH_ADAPTIVE_TOTAL_PAGE,
+ MONITOR_FLUSH_ADAPTIVE_COUNT,
+ MONITOR_FLUSH_ADAPTIVE_PAGES,
+ n_flushed);
}
else if (buf_flush_async_lsn <= oldest_lsn)
{
@@ -2435,38 +2395,38 @@ do_checkpoint:
goto unemployed;
}
-#ifndef DBUG_OFF
-next:
-#endif /* !DBUG_OFF */
- mysql_mutex_lock(&buf_pool.flush_list_mutex);
-
- /* when idle flushing kicks in page_cleaner is marked active.
- reset it back to idle since the it was made active as part of
- idle flushing stage. */
- if (idle_flush)
- buf_pool.page_cleaner_set_idle(true);
+ n= n >= n_flushed ? n - n_flushed : 0;
+ goto LRU_flush;
}
mysql_mutex_unlock(&buf_pool.flush_list_mutex);
if (srv_fast_shutdown != 2)
{
- buf_flush_wait_batch_end_acquiring_mutex(true);
- buf_flush_wait_batch_end_acquiring_mutex(false);
+ buf_dblwr.flush_buffered_writes();
+ mysql_mutex_lock(&buf_pool.flush_list_mutex);
+ buf_flush_wait_LRU_batch_end();
+ mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+ os_aio_wait_until_no_pending_writes();
}
mysql_mutex_lock(&buf_pool.flush_list_mutex);
lsn_limit= buf_flush_sync_lsn;
if (UNIV_UNLIKELY(lsn_limit != 0))
+ {
+ do_furious_flush:
+ mysql_mutex_unlock(&buf_pool.flush_list_mutex);
goto furious_flush;
+ }
buf_page_cleaner_is_active= false;
pthread_cond_broadcast(&buf_pool.done_flush_list);
mysql_mutex_unlock(&buf_pool.flush_list_mutex);
my_thread_end();
- /* We count the number of threads in os_thread_exit(). A created
- thread should always use that to exit and not use return() to exit. */
- return os_thread_exit();
+
+#ifdef UNIV_PFS_THREAD
+ pfs_delete_thread();
+#endif
}
/** Initialize page_cleaner. */
@@ -2479,20 +2439,9 @@ ATTRIBUTE_COLD void buf_flush_page_cleaner_init()
buf_flush_async_lsn= 0;
buf_flush_sync_lsn= 0;
buf_page_cleaner_is_active= true;
- os_thread_create(buf_flush_page_cleaner);
+ std::thread(buf_flush_page_cleaner).detach();
}
-#if defined(HAVE_SYSTEMD) && !defined(EMBEDDED_LIBRARY)
-/** @return the number of dirty pages in the buffer pool */
-static ulint buf_flush_list_length()
-{
- mysql_mutex_lock(&buf_pool.flush_list_mutex);
- const ulint len= UT_LIST_GET_LEN(buf_pool.flush_list);
- mysql_mutex_unlock(&buf_pool.flush_list_mutex);
- return len;
-}
-#endif
-
/** Flush the buffer pool on shutdown. */
ATTRIBUTE_COLD void buf_flush_buffer_pool()
{
@@ -2508,45 +2457,32 @@ ATTRIBUTE_COLD void buf_flush_buffer_pool()
{
mysql_mutex_unlock(&buf_pool.flush_list_mutex);
buf_flush_list(srv_max_io_capacity);
- if (buf_pool.n_flush_list())
- {
- timespec abstime;
- service_manager_extend_timeout(INNODB_EXTEND_TIMEOUT_INTERVAL,
- "Waiting to flush " ULINTPF " pages",
- buf_flush_list_length());
- set_timespec(abstime, INNODB_EXTEND_TIMEOUT_INTERVAL / 2);
- mysql_mutex_lock(&buf_pool.mutex);
- while (buf_pool.n_flush_list_)
- my_cond_timedwait(&buf_pool.done_flush_list, &buf_pool.mutex.m_mutex,
- &abstime);
- mysql_mutex_unlock(&buf_pool.mutex);
- }
+ os_aio_wait_until_no_pending_writes();
mysql_mutex_lock(&buf_pool.flush_list_mutex);
+ service_manager_extend_timeout(INNODB_EXTEND_TIMEOUT_INTERVAL,
+ "Waiting to flush " ULINTPF " pages",
+ UT_LIST_GET_LEN(buf_pool.flush_list));
}
mysql_mutex_unlock(&buf_pool.flush_list_mutex);
- ut_ad(!buf_pool.any_io_pending());
+ ut_ad(!os_aio_pending_writes());
+ ut_ad(!os_aio_pending_reads());
}
/** Synchronously flush dirty blocks during recv_sys_t::apply().
NOTE: The calling thread is not allowed to hold any buffer page latches! */
void buf_flush_sync_batch(lsn_t lsn)
{
- thd_wait_begin(nullptr, THD_WAIT_DISKIO);
- tpool::tpool_wait_begin();
+ lsn= std::max(lsn, log_sys.get_lsn());
mysql_mutex_lock(&buf_pool.flush_list_mutex);
buf_flush_wait(lsn);
mysql_mutex_unlock(&buf_pool.flush_list_mutex);
- tpool::tpool_wait_end();
- thd_wait_end(nullptr);
}
/** Synchronously flush dirty blocks.
NOTE: The calling thread is not allowed to hold any buffer page latches! */
void buf_flush_sync()
{
- ut_ad(!sync_check_iterate(dict_sync_check()));
-
if (recv_recovery_is_on())
recv_sys.apply(true);
@@ -2557,9 +2493,14 @@ void buf_flush_sync()
{
const lsn_t lsn= log_sys.get_lsn();
buf_flush_wait(lsn);
+ /* Wait for the page cleaner to be idle (for log resizing at startup) */
+ while (buf_flush_sync_lsn)
+ my_cond_wait(&buf_pool.done_flush_list,
+ &buf_pool.flush_list_mutex.m_mutex);
if (lsn == log_sys.get_lsn())
break;
}
+
mysql_mutex_unlock(&buf_pool.flush_list_mutex);
tpool::tpool_wait_end();
thd_wait_end(nullptr);
@@ -2595,8 +2536,7 @@ static void buf_flush_validate_low()
in the flush list waiting to acquire the
buf_pool.flush_list_mutex to complete the relocation. */
ut_d(const auto s= bpage->state());
- ut_ad(s == BUF_BLOCK_ZIP_PAGE || s == BUF_BLOCK_FILE_PAGE
- || s == BUF_BLOCK_REMOVE_HASH);
+ ut_ad(s >= buf_page_t::REMOVE_HASH);
ut_ad(om == 1 || om > 2);
bpage = UT_LIST_GET_NEXT(list, bpage);
diff --git a/storage/innobase/buf/buf0lru.cc b/storage/innobase/buf/buf0lru.cc
index 19a0b5a3eb5..844e288843b 100644
--- a/storage/innobase/buf/buf0lru.cc
+++ b/storage/innobase/buf/buf0lru.cc
@@ -25,7 +25,6 @@ Created 11/5/1995 Heikki Tuuri
*******************************************************/
#include "buf0lru.h"
-#include "sync0rw.h"
#include "fil0fil.h"
#include "btr0btr.h"
#include "buf0buddy.h"
@@ -38,6 +37,7 @@ Created 11/5/1995 Heikki Tuuri
#include "log0recv.h"
#include "srv0srv.h"
#include "srv0mon.h"
+#include "my_cpu.h"
/** Flush this many pages in buf_LRU_get_free_block() */
size_t innodb_lru_flush_size;
@@ -108,21 +108,22 @@ uint buf_LRU_old_threshold_ms;
/** Remove bpage from buf_pool.LRU and buf_pool.page_hash.
-If bpage->state() == BUF_BLOCK_ZIP_PAGE && bpage->oldest_modification() <= 1,
+If !bpage->frame && bpage->oldest_modification() <= 1,
the object will be freed.
@param bpage buffer block
@param id page identifier
-@param hash_lock buf_pool.page_hash latch (will be released here)
+@param chain locked buf_pool.page_hash chain (will be released here)
@param zip whether bpage->zip of BUF_BLOCK_FILE_PAGE should be freed
If a compressed page is freed other compressed pages may be relocated.
-@retval true if BUF_BLOCK_FILE_PAGE was removed from page_hash. The
+@retval true if bpage with bpage->frame was removed from page_hash. The
caller needs to free the page to the free list
-@retval false if BUF_BLOCK_ZIP_PAGE was removed from page_hash. In
+@retval false if block without bpage->frame was removed from page_hash. In
this case the block is already returned to the buddy allocator. */
static bool buf_LRU_block_remove_hashed(buf_page_t *bpage, const page_id_t id,
- page_hash_latch *hash_lock, bool zip);
+ buf_pool_t::hash_chain &chain,
+ bool zip);
/** Free a block to buf_pool */
static void buf_LRU_block_free_hashed_page(buf_block_t *block)
@@ -135,7 +136,6 @@ static void buf_LRU_block_free_hashed_page(buf_block_t *block)
@param[in] bpage control block */
static inline void incr_LRU_size_in_bytes(const buf_page_t* bpage)
{
- /* FIXME: use atomics, not mutex */
mysql_mutex_assert_owner(&buf_pool.mutex);
buf_pool.stat.LRU_bytes += bpage->physical_size();
@@ -200,7 +200,8 @@ static bool buf_LRU_free_from_unzip_LRU_list(ulint limit)
block && scanned < limit; ++scanned) {
buf_block_t* prev_block = UT_LIST_GET_PREV(unzip_LRU, block);
- ut_ad(block->page.state() == BUF_BLOCK_FILE_PAGE);
+ ut_ad(block->page.in_file());
+ ut_ad(block->page.belongs_to_unzip_LRU());
ut_ad(block->in_unzip_LRU_list);
ut_ad(block->page.in_LRU_list);
@@ -265,17 +266,6 @@ static bool buf_LRU_free_from_common_LRU_list(ulint limit)
return(freed);
}
-/** Try to free a replaceable block.
-@param limit maximum number of blocks to scan
-@return true if found and freed */
-bool buf_LRU_scan_and_free_block(ulint limit)
-{
- mysql_mutex_assert_owner(&buf_pool.mutex);
-
- return buf_LRU_free_from_unzip_LRU_list(limit) ||
- buf_LRU_free_from_common_LRU_list(limit);
-}
-
/** @return a buffer block from the buf_pool.free list
@retval NULL if the free list is empty */
buf_block_t* buf_LRU_get_free_only()
@@ -295,7 +285,7 @@ buf_block_t* buf_LRU_get_free_only()
ut_a(!block->page.in_file());
UT_LIST_REMOVE(buf_pool.free, &block->page);
- if (buf_pool.curr_size >= buf_pool.old_size
+ if (!buf_pool.is_shrinking()
|| UT_LIST_GET_LEN(buf_pool.withdraw)
>= buf_pool.withdraw_target
|| !buf_pool.will_be_withdrawn(block->page)) {
@@ -303,15 +293,13 @@ buf_block_t* buf_LRU_get_free_only()
a free block. */
assert_block_ahi_empty(block);
- block->page.set_state(BUF_BLOCK_MEMORY);
- MEM_MAKE_ADDRESSABLE(block->frame, srv_page_size);
+ block->page.set_state(buf_page_t::MEMORY);
+ MEM_MAKE_ADDRESSABLE(block->page.frame, srv_page_size);
break;
}
/* This should be withdrawn */
- UT_LIST_ADD_LAST(
- buf_pool.withdraw,
- &block->page);
+ UT_LIST_ADD_LAST(buf_pool.withdraw, &block->page);
ut_d(block->in_withdraw_list = true);
block = reinterpret_cast<buf_block_t*>(
@@ -330,7 +318,7 @@ static void buf_LRU_check_size_of_non_data_objects()
{
mysql_mutex_assert_owner(&buf_pool.mutex);
- if (recv_recovery_is_on() || buf_pool.curr_size != buf_pool.old_size)
+ if (recv_recovery_is_on() || buf_pool.n_chunks_new != buf_pool.n_chunks)
return;
const auto s= UT_LIST_GET_LEN(buf_pool.free) + UT_LIST_GET_LEN(buf_pool.LRU);
@@ -411,8 +399,10 @@ buf_block_t *buf_LRU_get_free_block(bool have_mutex)
DBUG_EXECUTE_IF("recv_ran_out_of_buffer",
if (recv_recovery_is_on()
&& recv_sys.apply_log_recs) {
+ mysql_mutex_lock(&buf_pool.mutex);
goto flush_lru;
});
+get_mutex:
mysql_mutex_lock(&buf_pool.mutex);
got_mutex:
buf_LRU_check_size_of_non_data_objects();
@@ -430,7 +420,7 @@ got_block:
if (!have_mutex) {
mysql_mutex_unlock(&buf_pool.mutex);
}
- memset(&block->page.zip, 0, sizeof block->page.zip);
+ block->page.zip.clear();
return block;
}
@@ -455,20 +445,32 @@ got_block:
if ((block = buf_LRU_get_free_only()) != nullptr) {
goto got_block;
}
- if (!buf_pool.n_flush_LRU_) {
- break;
+ mysql_mutex_unlock(&buf_pool.mutex);
+ mysql_mutex_lock(&buf_pool.flush_list_mutex);
+ const auto n_flush = buf_pool.n_flush();
+ mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+ mysql_mutex_lock(&buf_pool.mutex);
+ if (!n_flush) {
+ goto not_found;
+ }
+ if (!buf_pool.try_LRU_scan) {
+ mysql_mutex_lock(&buf_pool.flush_list_mutex);
+ buf_pool.page_cleaner_wakeup(true);
+ mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+ my_cond_wait(&buf_pool.done_free,
+ &buf_pool.mutex.m_mutex);
}
- my_cond_wait(&buf_pool.done_free, &buf_pool.mutex.m_mutex);
}
-#ifndef DBUG_OFF
not_found:
-#endif
- mysql_mutex_unlock(&buf_pool.mutex);
+ if (n_iterations > 1) {
+ MONITOR_INC( MONITOR_LRU_GET_FREE_WAITS );
+ }
- if (n_iterations > 20 && !buf_lru_free_blocks_error_printed
+ if (n_iterations == 21 && !buf_lru_free_blocks_error_printed
&& srv_buf_pool_old_size == srv_buf_pool_size) {
-
+ buf_lru_free_blocks_error_printed = true;
+ mysql_mutex_unlock(&buf_pool.mutex);
ib::warn() << "Difficult to find free blocks in the buffer pool"
" (" << n_iterations << " search iterations)! "
<< flush_failures << " failed attempts to"
@@ -482,12 +484,7 @@ not_found:
<< os_n_file_writes << " OS file writes, "
<< os_n_fsyncs
<< " OS fsyncs.";
-
- buf_lru_free_blocks_error_printed = true;
- }
-
- if (n_iterations > 1) {
- MONITOR_INC( MONITOR_LRU_GET_FREE_WAITS );
+ mysql_mutex_lock(&buf_pool.mutex);
}
/* No free block was found: try to flush the LRU list.
@@ -501,15 +498,16 @@ not_found:
#ifndef DBUG_OFF
flush_lru:
#endif
- if (!buf_flush_LRU(innodb_lru_flush_size)) {
+ if (!buf_flush_LRU(innodb_lru_flush_size, true)) {
MONITOR_INC(MONITOR_LRU_SINGLE_FLUSH_FAILURE_COUNT);
++flush_failures;
}
n_iterations++;
- mysql_mutex_lock(&buf_pool.mutex);
buf_pool.stat.LRU_waits++;
- goto got_mutex;
+ mysql_mutex_unlock(&buf_pool.mutex);
+ buf_dblwr.flush_buffered_writes();
+ goto get_mutex;
}
/** Move the LRU_old pointer so that the length of the old blocks list
@@ -775,6 +773,9 @@ buf_LRU_add_block(
/** Move a block to the start of the LRU list. */
void buf_page_make_young(buf_page_t *bpage)
{
+ if (bpage->is_read_fixed())
+ return;
+
ut_ad(bpage->in_file());
mysql_mutex_lock(&buf_pool.mutex);
@@ -797,12 +798,10 @@ The caller must hold buf_pool.mutex.
@retval false if the page was not freed */
bool buf_LRU_free_page(buf_page_t *bpage, bool zip)
{
- const page_id_t id(bpage->id());
+ const page_id_t id{bpage->id()};
buf_page_t* b = nullptr;
mysql_mutex_assert_owner(&buf_pool.mutex);
- ut_ad(bpage->in_file());
- ut_ad(bpage->in_LRU_list);
/* First, perform a quick check before we acquire hash_lock. */
if (!bpage->can_relocate()) {
@@ -812,59 +811,66 @@ bool buf_LRU_free_page(buf_page_t *bpage, bool zip)
/* We must hold an exclusive hash_lock to prevent
bpage->can_relocate() from changing due to a concurrent
execution of buf_page_get_low(). */
- const ulint fold = id.fold();
- page_hash_latch* hash_lock = buf_pool.page_hash.lock_get(fold);
- hash_lock->write_lock();
- lsn_t oldest_modification = bpage->oldest_modification_acquire();
+ buf_pool_t::hash_chain& chain= buf_pool.page_hash.cell_get(id.fold());
+ page_hash_latch& hash_lock = buf_pool.page_hash.lock_get(chain);
+ /* We cannot use transactional_lock_guard here,
+ because buf_buddy_relocate() in buf_buddy_free() could get stuck. */
+ hash_lock.lock();
+ const lsn_t oldest_modification = bpage->oldest_modification_acquire();
if (UNIV_UNLIKELY(!bpage->can_relocate())) {
/* Do not free buffer fixed and I/O-fixed blocks. */
goto func_exit;
}
- if (oldest_modification == 1) {
+ switch (oldest_modification) {
+ case 2:
+ ut_ad(id.space() == SRV_TMP_SPACE_ID);
+ ut_ad(!bpage->zip.data);
+ if (!bpage->is_freed()) {
+ goto func_exit;
+ }
+ bpage->clear_oldest_modification();
+ break;
+ case 1:
mysql_mutex_lock(&buf_pool.flush_list_mutex);
- oldest_modification = bpage->oldest_modification();
- if (oldest_modification) {
- ut_ad(oldest_modification == 1);
+ if (const lsn_t om = bpage->oldest_modification()) {
+ ut_ad(om == 1);
buf_pool.delete_from_flush_list(bpage);
}
mysql_mutex_unlock(&buf_pool.flush_list_mutex);
ut_ad(!bpage->oldest_modification());
- oldest_modification = 0;
- }
-
- if (zip || !bpage->zip.data) {
- /* This would completely free the block. */
- /* Do not completely free dirty blocks. */
-
- if (oldest_modification) {
- goto func_exit;
+ /* fall through */
+ case 0:
+ if (zip || !bpage->zip.data || !bpage->frame) {
+ break;
}
- } else if (oldest_modification
- && bpage->state() != BUF_BLOCK_FILE_PAGE) {
-func_exit:
- hash_lock->write_unlock();
- return(false);
-
- } else if (bpage->state() == BUF_BLOCK_FILE_PAGE) {
- b = buf_page_alloc_descriptor();
+relocate_compressed:
+ b = static_cast<buf_page_t*>(ut_zalloc_nokey(sizeof *b));
ut_a(b);
mysql_mutex_lock(&buf_pool.flush_list_mutex);
new (b) buf_page_t(*bpage);
- b->set_state(BUF_BLOCK_ZIP_PAGE);
+ b->frame = nullptr;
+ b->set_state(buf_page_t::UNFIXED + 1);
+ break;
+ default:
+ if (zip || !bpage->zip.data || !bpage->frame) {
+ /* This would completely free the block. */
+ /* Do not completely free dirty blocks. */
+func_exit:
+ hash_lock.unlock();
+ return(false);
+ }
+ goto relocate_compressed;
}
mysql_mutex_assert_owner(&buf_pool.mutex);
- ut_ad(bpage->in_file());
- ut_ad(bpage->in_LRU_list);
- DBUG_PRINT("ib_buf", ("free page %u:%u",
- id.space(), id.page_no()));
+ DBUG_PRINT("ib_buf", ("free page %u:%u", id.space(), id.page_no()));
ut_ad(bpage->can_relocate());
- if (!buf_LRU_block_remove_hashed(bpage, id, hash_lock, zip)) {
+ if (!buf_LRU_block_remove_hashed(bpage, id, chain, zip)) {
ut_ad(!b);
mysql_mutex_assert_not_owner(&buf_pool.flush_list_mutex);
return(true);
@@ -880,7 +886,7 @@ func_exit:
if (UNIV_LIKELY_NULL(b)) {
buf_page_t* prev_b = UT_LIST_GET_PREV(LRU, b);
- ut_ad(!buf_pool.page_hash_get_low(id, fold));
+ ut_ad(!buf_pool.page_hash.get(id, chain));
ut_ad(b->zip_size());
/* The field in_LRU_list of
@@ -889,9 +895,7 @@ func_exit:
buf_LRU_block_remove_hashed(), which
invokes buf_LRU_remove_block(). */
ut_ad(!bpage->in_LRU_list);
-
- /* bpage->state was BUF_BLOCK_FILE_PAGE because
- b != nullptr. The type cast below is thus valid. */
+ ut_ad(bpage->frame);
ut_ad(!((buf_block_t*) bpage)->in_unzip_LRU_list);
/* The fields of bpage were copied to b before
@@ -899,8 +903,10 @@ func_exit:
ut_ad(!b->in_zip_hash);
ut_ad(b->in_LRU_list);
ut_ad(b->in_page_hash);
+ ut_d(b->in_page_hash = false);
+ b->hash = nullptr;
- HASH_INSERT(buf_page_t, hash, &buf_pool.page_hash, fold, b);
+ buf_pool.page_hash.append(chain, b);
/* Insert b where bpage was in the LRU list. */
if (prev_b) {
@@ -952,13 +958,10 @@ func_exit:
page_zip_set_size(&bpage->zip, 0);
- /* Prevent buf_page_get_gen() from
- decompressing the block while we release
- hash_lock. */
- b->set_io_fix(BUF_IO_PIN);
- hash_lock->write_unlock();
+ b->lock.x_lock();
+ hash_lock.unlock();
} else if (!zip) {
- hash_lock->write_unlock();
+ hash_lock.unlock();
}
buf_block_t* block = reinterpret_cast<buf_block_t*>(bpage);
@@ -973,21 +976,16 @@ func_exit:
the contents of the page valid (which it still is) in
order to avoid bogus Valgrind or MSAN warnings.*/
- MEM_MAKE_DEFINED(block->frame, srv_page_size);
+ MEM_MAKE_DEFINED(block->page.frame, srv_page_size);
btr_search_drop_page_hash_index(block, false);
- MEM_UNDEFINED(block->frame, srv_page_size);
-
- if (UNIV_LIKELY_NULL(b)) {
- ut_ad(b->zip_size());
- b->io_unfix();
- }
-
+ MEM_UNDEFINED(block->page.frame, srv_page_size);
mysql_mutex_lock(&buf_pool.mutex);
- } else
+ }
#endif
if (UNIV_LIKELY_NULL(b)) {
ut_ad(b->zip_size());
- b->io_unfix();
+ b->lock.x_unlock();
+ b->unfix();
}
buf_LRU_block_free_hashed_page(block);
@@ -1004,21 +1002,22 @@ buf_LRU_block_free_non_file_page(
{
void* data;
- ut_ad(block->page.state() == BUF_BLOCK_MEMORY);
+ ut_ad(block->page.state() == buf_page_t::MEMORY);
assert_block_ahi_empty(block);
ut_ad(!block->page.in_free_list);
ut_ad(!block->page.oldest_modification());
ut_ad(!block->page.in_LRU_list);
+ ut_ad(!block->page.hash);
- block->page.set_state(BUF_BLOCK_NOT_USED);
+ block->page.set_state(buf_page_t::NOT_USED);
- MEM_UNDEFINED(block->frame, srv_page_size);
+ MEM_UNDEFINED(block->page.frame, srv_page_size);
/* Wipe page_no and space_id */
static_assert(FIL_PAGE_OFFSET % 4 == 0, "alignment");
- memset_aligned<4>(block->frame + FIL_PAGE_OFFSET, 0xfe, 4);
+ memset_aligned<4>(block->page.frame + FIL_PAGE_OFFSET, 0xfe, 4);
static_assert(FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID % 4 == 2,
"not perfect alignment");
- memset_aligned<2>(block->frame + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID,
+ memset_aligned<2>(block->page.frame + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID,
0xfe, 4);
data = block->page.zip.data;
@@ -1034,7 +1033,7 @@ buf_LRU_block_free_non_file_page(
page_zip_set_size(&block->page.zip, 0);
}
- if (buf_pool.curr_size < buf_pool.old_size
+ if (buf_pool.is_shrinking()
&& UT_LIST_GET_LEN(buf_pool.withdraw) < buf_pool.withdraw_target
&& buf_pool.will_be_withdrawn(block->page)) {
/* This should be withdrawn */
@@ -1045,10 +1044,11 @@ buf_LRU_block_free_non_file_page(
} else {
UT_LIST_ADD_FIRST(buf_pool.free, &block->page);
ut_d(block->page.in_free_list = true);
- pthread_cond_signal(&buf_pool.done_free);
+ buf_pool.try_LRU_scan= true;
+ pthread_cond_broadcast(&buf_pool.done_free);
}
- MEM_NOACCESS(block->frame, srv_page_size);
+ MEM_NOACCESS(block->page.frame, srv_page_size);
}
/** Release a memory block to the buffer pool. */
@@ -1063,12 +1063,11 @@ ATTRIBUTE_COLD void buf_pool_t::free_block(buf_block_t *block)
/** Remove bpage from buf_pool.LRU and buf_pool.page_hash.
-If bpage->state() == BUF_BLOCK_ZIP_PAGE && !bpage->oldest_modification(),
-the object will be freed.
+If !bpage->frame && !bpage->oldest_modification(), the object will be freed.
@param bpage buffer block
@param id page identifier
-@param hash_lock buf_pool.page_hash latch (will be released here)
+@param chain locked buf_pool.page_hash chain (will be released here)
@param zip whether bpage->zip of BUF_BLOCK_FILE_PAGE should be freed
If a compressed page is freed other compressed pages may be relocated.
@@ -1077,26 +1076,22 @@ caller needs to free the page to the free list
@retval false if BUF_BLOCK_ZIP_PAGE was removed from page_hash. In
this case the block is already returned to the buddy allocator. */
static bool buf_LRU_block_remove_hashed(buf_page_t *bpage, const page_id_t id,
- page_hash_latch *hash_lock, bool zip)
+ buf_pool_t::hash_chain &chain,
+ bool zip)
{
- mysql_mutex_assert_owner(&buf_pool.mutex);
- ut_ad(hash_lock->is_write_locked());
-
- ut_a(bpage->io_fix() == BUF_IO_NONE);
- ut_a(!bpage->buf_fix_count());
+ ut_a(bpage->can_relocate());
+ ut_ad(buf_pool.page_hash.lock_get(chain).is_write_locked());
buf_LRU_remove_block(bpage);
buf_pool.freed_page_clock += 1;
- switch (bpage->state()) {
- case BUF_BLOCK_FILE_PAGE:
+ if (UNIV_LIKELY(bpage->frame != nullptr)) {
MEM_CHECK_ADDRESSABLE(bpage, sizeof(buf_block_t));
- MEM_CHECK_ADDRESSABLE(((buf_block_t*) bpage)->frame,
- srv_page_size);
+ MEM_CHECK_ADDRESSABLE(bpage->frame, srv_page_size);
buf_block_modify_clock_inc((buf_block_t*) bpage);
- if (bpage->zip.data) {
- const page_t* page = ((buf_block_t*) bpage)->frame;
+ if (UNIV_LIKELY_NULL(bpage->zip.data)) {
+ const page_t* page = bpage->frame;
ut_a(!zip || !bpage->oldest_modification());
ut_ad(bpage->zip_size());
@@ -1135,52 +1130,45 @@ static bool buf_LRU_block_remove_hashed(buf_page_t *bpage, const page_id_t id,
putc('\n', stderr);
ut_error;
}
-
- break;
+ } else {
+ goto evict_zip;
}
- /* fall through */
- case BUF_BLOCK_ZIP_PAGE:
+ } else {
+evict_zip:
ut_a(!bpage->oldest_modification());
MEM_CHECK_ADDRESSABLE(bpage->zip.data, bpage->zip_size());
- break;
- case BUF_BLOCK_NOT_USED:
- case BUF_BLOCK_MEMORY:
- case BUF_BLOCK_REMOVE_HASH:
- ut_error;
- break;
}
ut_ad(!bpage->in_zip_hash);
- HASH_DELETE(buf_page_t, hash, &buf_pool.page_hash, id.fold(), bpage);
+ buf_pool.page_hash.remove(chain, bpage);
+ page_hash_latch& hash_lock = buf_pool.page_hash.lock_get(chain);
- switch (bpage->state()) {
- case BUF_BLOCK_ZIP_PAGE:
+ if (UNIV_UNLIKELY(!bpage->frame)) {
ut_ad(!bpage->in_free_list);
ut_ad(!bpage->in_LRU_list);
ut_a(bpage->zip.data);
ut_a(bpage->zip.ssize);
ut_ad(!bpage->oldest_modification());
- hash_lock->write_unlock();
+ hash_lock.unlock();
buf_pool_mutex_exit_forbid();
buf_buddy_free(bpage->zip.data, bpage->zip_size());
buf_pool_mutex_exit_allow();
- buf_page_free_descriptor(bpage);
- return(false);
-
- case BUF_BLOCK_FILE_PAGE:
+ bpage->lock.free();
+ ut_free(bpage);
+ return false;
+ } else {
static_assert(FIL_NULL == 0xffffffffU, "fill pattern");
static_assert(FIL_PAGE_OFFSET % 4 == 0, "alignment");
- memset_aligned<4>(reinterpret_cast<buf_block_t*>(bpage)->frame
- + FIL_PAGE_OFFSET, 0xff, 4);
+ memset_aligned<4>(bpage->frame + FIL_PAGE_OFFSET, 0xff, 4);
static_assert(FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID % 4 == 2,
"not perfect alignment");
- memset_aligned<2>(reinterpret_cast<buf_block_t*>(bpage)->frame
+ memset_aligned<2>(bpage->frame
+ FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, 0xff, 4);
- MEM_UNDEFINED(((buf_block_t*) bpage)->frame, srv_page_size);
- bpage->set_state(BUF_BLOCK_REMOVE_HASH);
+ MEM_UNDEFINED(bpage->frame, srv_page_size);
+ bpage->set_state(buf_page_t::REMOVE_HASH);
if (!zip) {
return true;
@@ -1205,7 +1193,7 @@ static bool buf_LRU_block_remove_hashed(buf_page_t *bpage, const page_id_t id,
and by the time we'll release it in the caller we'd
have inserted the compressed only descriptor in the
page_hash. */
- hash_lock->write_unlock();
+ hash_lock.unlock();
if (bpage->zip.data) {
/* Free the compressed page. */
@@ -1224,32 +1212,46 @@ static bool buf_LRU_block_remove_hashed(buf_page_t *bpage, const page_id_t id,
page_zip_set_size(&bpage->zip, 0);
}
- return(true);
-
- case BUF_BLOCK_NOT_USED:
- case BUF_BLOCK_MEMORY:
- case BUF_BLOCK_REMOVE_HASH:
- break;
+ return true;
}
-
- ut_error;
- return(false);
}
-/** Remove one page from LRU list and put it to free list.
-@param bpage file page to be freed
-@param id page identifier
-@param hash_lock buf_pool.page_hash latch (will be released here) */
-void buf_LRU_free_one_page(buf_page_t *bpage, const page_id_t id,
- page_hash_latch *hash_lock)
+/** Release and evict a corrupted page.
+@param bpage x-latched page that was found corrupted
+@param state expected current state of the page */
+ATTRIBUTE_COLD
+void buf_pool_t::corrupted_evict(buf_page_t *bpage, uint32_t state)
{
- while (bpage->buf_fix_count())
+ const page_id_t id{bpage->id()};
+ buf_pool_t::hash_chain &chain= buf_pool.page_hash.cell_get(id.fold());
+ page_hash_latch &hash_lock= buf_pool.page_hash.lock_get(chain);
+
+ mysql_mutex_lock(&mutex);
+ hash_lock.lock();
+
+ ut_ad(!bpage->oldest_modification());
+ bpage->set_corrupt_id();
+ auto unfix= state - buf_page_t::FREED;
+ auto s= bpage->zip.fix.fetch_sub(unfix) - unfix;
+ bpage->lock.x_unlock(true);
+
+ while (s != buf_page_t::FREED || bpage->lock.is_locked_or_waiting())
+ {
+ ut_ad(s >= buf_page_t::FREED);
+ ut_ad(s < buf_page_t::UNFIXED);
/* Wait for other threads to release the fix count
before releasing the bpage from LRU list. */
(void) LF_BACKOFF();
+ s= bpage->state();
+ }
- if (buf_LRU_block_remove_hashed(bpage, id, hash_lock, true))
+ /* remove from LRU and page_hash */
+ if (buf_LRU_block_remove_hashed(bpage, id, chain, true))
buf_LRU_block_free_hashed_page(reinterpret_cast<buf_block_t*>(bpage));
+
+ mysql_mutex_unlock(&mutex);
+
+ recv_sys.free_corrupted_page(id);
}
/** Update buf_pool.LRU_old_ratio.
@@ -1326,6 +1328,23 @@ func_exit:
memset(&buf_LRU_stat_cur, 0, sizeof buf_LRU_stat_cur);
}
+#if defined __aarch64__&&defined __GNUC__&&__GNUC__==4&&!defined __clang__
+/* Avoid GCC 4.8.5 internal compiler error "could not split insn".
+We would only need this for buf_LRU_scan_and_free_block(),
+but GCC 4.8.5 does not support pop_options. */
+# pragma GCC optimize ("O0")
+#endif
+/** Try to free a replaceable block.
+@param limit maximum number of blocks to scan
+@return true if found and freed */
+bool buf_LRU_scan_and_free_block(ulint limit)
+{
+ mysql_mutex_assert_owner(&buf_pool.mutex);
+
+ return buf_LRU_free_from_unzip_LRU_list(limit) ||
+ buf_LRU_free_from_common_LRU_list(limit);
+}
+
#ifdef UNIV_DEBUG
/** Validate the LRU list. */
void buf_LRU_validate()
@@ -1358,20 +1377,11 @@ void buf_LRU_validate()
for (buf_page_t* bpage = UT_LIST_GET_FIRST(buf_pool.LRU);
bpage != NULL;
bpage = UT_LIST_GET_NEXT(LRU, bpage)) {
-
- switch (bpage->state()) {
- case BUF_BLOCK_NOT_USED:
- case BUF_BLOCK_MEMORY:
- case BUF_BLOCK_REMOVE_HASH:
- ut_error;
- break;
- case BUF_BLOCK_FILE_PAGE:
- ut_ad(reinterpret_cast<buf_block_t*>(bpage)
- ->in_unzip_LRU_list
- == bpage->belongs_to_unzip_LRU());
- case BUF_BLOCK_ZIP_PAGE:
- break;
- }
+ ut_ad(bpage->in_file());
+ ut_ad(!bpage->frame
+ || reinterpret_cast<buf_block_t*>(bpage)
+ ->in_unzip_LRU_list
+ == bpage->belongs_to_unzip_LRU());
if (bpage->is_old()) {
const buf_page_t* prev
@@ -1397,7 +1407,7 @@ void buf_LRU_validate()
bpage != NULL;
bpage = UT_LIST_GET_NEXT(list, bpage)) {
- ut_a(bpage->state() == BUF_BLOCK_NOT_USED);
+ ut_a(bpage->state() == buf_page_t::NOT_USED);
}
CheckUnzipLRUAndLRUList::validate();
@@ -1433,38 +1443,28 @@ void buf_LRU_print()
fputs("old ", stderr);
}
- if (const uint32_t buf_fix_count = bpage->buf_fix_count()) {
- fprintf(stderr, "buffix count %u ", buf_fix_count);
- }
-
- if (const auto io_fix = bpage->io_fix()) {
- fprintf(stderr, "io_fix %d ", io_fix);
+ const unsigned s = bpage->state();
+ if (s > buf_page_t::UNFIXED) {
+ fprintf(stderr, "fix %u ", s - buf_page_t::UNFIXED);
+ } else {
+ ut_ad(s == buf_page_t::UNFIXED
+ || s == buf_page_t::REMOVE_HASH);
}
if (bpage->oldest_modification()) {
fputs("modif. ", stderr);
}
- switch (const auto state = bpage->state()) {
- const byte* frame;
- case BUF_BLOCK_FILE_PAGE:
- frame = buf_block_get_frame((buf_block_t*) bpage);
- fprintf(stderr, "\ntype %u index id " IB_ID_FMT "\n",
- fil_page_get_type(frame),
- btr_page_get_index_id(frame));
- break;
- case BUF_BLOCK_ZIP_PAGE:
- frame = bpage->zip.data;
+ if (const byte* frame = bpage->zip.data) {
fprintf(stderr, "\ntype %u size " ULINTPF
" index id " IB_ID_FMT "\n",
fil_page_get_type(frame),
bpage->zip_size(),
btr_page_get_index_id(frame));
- break;
-
- default:
- fprintf(stderr, "\n!state %d!\n", state);
- break;
+ } else {
+ fprintf(stderr, "\ntype %u index id " IB_ID_FMT "\n",
+ fil_page_get_type(bpage->frame),
+ btr_page_get_index_id(bpage->frame));
}
}
diff --git a/storage/innobase/buf/buf0rea.cc b/storage/innobase/buf/buf0rea.cc
index ff163f74b08..b8fa3055adf 100644
--- a/storage/innobase/buf/buf0rea.cc
+++ b/storage/innobase/buf/buf0rea.cc
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 1995, 2017, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2015, 2021, MariaDB Corporation.
+Copyright (c) 2015, 2022, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -41,6 +41,7 @@ Created 11/5/1995 Heikki Tuuri
#include "os0file.h"
#include "srv0start.h"
#include "srv0srv.h"
+#include "log.h"
/** If there are buf_pool.curr_size per the number below pending reads, then
read-ahead is not done: this is to prevent flooding the buffer pool with
@@ -50,22 +51,30 @@ i/o-fixed buffer blocks */
/** Remove the sentinel block for the watch before replacing it with a
real block. watch_unset() or watch_occurred() will notice
that the block has been replaced with the real block.
-@param watch sentinel */
-inline void buf_pool_t::watch_remove(buf_page_t *watch)
+@param w sentinel
+@param chain locked hash table chain
+@return w->state() */
+inline uint32_t buf_pool_t::watch_remove(buf_page_t *w,
+ buf_pool_t::hash_chain &chain)
{
mysql_mutex_assert_owner(&buf_pool.mutex);
- ut_ad(hash_lock_get(watch->id())->is_write_locked());
- ut_a(watch_is_sentinel(*watch));
- if (watch->buf_fix_count())
- {
- ut_ad(watch->in_page_hash);
- ut_d(watch->in_page_hash= false);
- HASH_DELETE(buf_page_t, hash, &page_hash, watch->id().fold(), watch);
- watch->set_buf_fix_count(0);
- }
- ut_ad(!watch->in_page_hash);
- watch->set_state(BUF_BLOCK_NOT_USED);
- watch->id_= page_id_t(~0ULL);
+ ut_ad(xtest() || page_hash.lock_get(chain).is_write_locked());
+ ut_ad(w >= &watch[0]);
+ ut_ad(w < &watch[array_elements(watch)]);
+ ut_ad(!w->in_zip_hash);
+ ut_ad(!w->zip.data);
+
+ uint32_t s{w->state()};
+ w->set_state(buf_page_t::NOT_USED);
+ ut_ad(s >= buf_page_t::UNFIXED);
+ ut_ad(s < buf_page_t::READ_FIX);
+
+ if (~buf_page_t::LRU_MASK & s)
+ page_hash.remove(chain, w);
+
+ ut_ad(!w->in_page_hash);
+ w->id_= page_id_t(~0ULL);
+ return s;
}
/** Initialize a page for read to the buffer buf_pool. If the page is
@@ -83,6 +92,7 @@ and the lock released later.
requested (for ROW_FORMAT=COMPRESSED)
@return pointer to the block
@retval NULL in case of an error */
+TRANSACTIONAL_TARGET
static buf_page_t* buf_page_init_for_read(ulint mode, const page_id_t page_id,
ulint zip_size, bool unzip)
{
@@ -108,29 +118,24 @@ static buf_page_t* buf_page_init_for_read(ulint mode, const page_id_t page_id,
if (!zip_size || unzip || recv_recovery_is_on())
{
block= buf_LRU_get_free_block(false);
- block->initialise(page_id, zip_size);
- /* We set a pass-type x-lock on the frame because then
- the same thread which called for the read operation
- (and is running now at this point of code) can wait
- for the read to complete by waiting for the x-lock on
- the frame; if the x-lock were recursive, the same
- thread would illegally get the x-lock before the page
- read is completed. The x-lock will be released
- in buf_page_read_complete() by the io-handler thread. */
- rw_lock_x_lock_gen(&block->lock, BUF_IO_READ);
+ block->initialise(page_id, zip_size, buf_page_t::READ_FIX);
+ /* x_unlock() will be invoked
+ in buf_page_t::read_complete() by the io-handler thread. */
+ block->page.lock.x_lock(true);
}
- const ulint fold= page_id.fold();
+ buf_pool_t::hash_chain &chain= buf_pool.page_hash.cell_get(page_id.fold());
mysql_mutex_lock(&buf_pool.mutex);
- buf_page_t *hash_page= buf_pool.page_hash_get_low(page_id, fold);
+ buf_page_t *hash_page= buf_pool.page_hash.get(page_id, chain);
if (hash_page && !buf_pool.watch_is_sentinel(*hash_page))
{
/* The page is already in the buffer pool. */
if (block)
{
- rw_lock_x_unlock_gen(&block->lock, BUF_IO_READ);
+ block->page.lock.x_unlock(true);
+ ut_d(block->page.set_state(buf_page_t::MEMORY));
buf_LRU_block_free_non_file_page(block);
}
goto func_exit;
@@ -141,27 +146,19 @@ static buf_page_t* buf_page_init_for_read(ulint mode, const page_id_t page_id,
bpage= &block->page;
/* Insert into the hash table of file pages */
- page_hash_latch *hash_lock= buf_pool.page_hash.lock_get(fold);
- hash_lock->write_lock();
-
- if (hash_page)
{
- /* Preserve the reference count. */
- auto buf_fix_count= hash_page->buf_fix_count();
- ut_a(buf_fix_count > 0);
- block->page.add_buf_fix_count(buf_fix_count);
- buf_pool.watch_remove(hash_page);
- }
+ transactional_lock_guard<page_hash_latch> g
+ {buf_pool.page_hash.lock_get(chain)};
- block->page.set_io_fix(BUF_IO_READ);
- block->page.set_state(BUF_BLOCK_FILE_PAGE);
- ut_ad(!block->page.in_page_hash);
- ut_d(block->page.in_page_hash= true);
- HASH_INSERT(buf_page_t, hash, &buf_pool.page_hash, fold, bpage);
- hash_lock->write_unlock();
+ if (hash_page)
+ bpage->set_state(buf_pool.watch_remove(hash_page, chain) +
+ (buf_page_t::READ_FIX - buf_page_t::UNFIXED));
+
+ buf_pool.page_hash.append(chain, &block->page);
+ }
/* The block must be put to the LRU list, to the old blocks */
- buf_LRU_add_block(bpage, true/* to old blocks */);
+ buf_LRU_add_block(&block->page, true/* to old blocks */);
if (UNIV_UNLIKELY(zip_size))
{
@@ -194,7 +191,7 @@ static buf_page_t* buf_page_init_for_read(ulint mode, const page_id_t page_id,
check the page_hash again, as it may have been modified. */
if (UNIV_UNLIKELY(lru))
{
- hash_page= buf_pool.page_hash_get_low(page_id, fold);
+ hash_page= buf_pool.page_hash.get(page_id, chain);
if (UNIV_UNLIKELY(hash_page && !buf_pool.watch_is_sentinel(*hash_page)))
{
@@ -204,43 +201,35 @@ static buf_page_t* buf_page_init_for_read(ulint mode, const page_id_t page_id,
}
}
- bpage= buf_page_alloc_descriptor();
+ bpage= static_cast<buf_page_t*>(ut_zalloc_nokey(sizeof *bpage));
page_zip_des_init(&bpage->zip);
page_zip_set_size(&bpage->zip, zip_size);
bpage->zip.data = (page_zip_t*) data;
- bpage->init(BUF_BLOCK_ZIP_PAGE, page_id);
+ bpage->init(buf_page_t::READ_FIX, page_id);
+ bpage->lock.x_lock(true);
- page_hash_latch *hash_lock= buf_pool.page_hash.lock_get(fold);
- hash_lock->write_lock();
-
- if (hash_page)
{
- /* Preserve the reference count. It can be 0 if
- buf_pool_t::watch_unset() is executing concurrently,
- waiting for buf_pool.mutex, which we are holding. */
- bpage->add_buf_fix_count(hash_page->buf_fix_count());
- buf_pool.watch_remove(hash_page);
- }
+ transactional_lock_guard<page_hash_latch> g
+ {buf_pool.page_hash.lock_get(chain)};
- ut_ad(!bpage->in_page_hash);
- ut_d(bpage->in_page_hash= true);
- HASH_INSERT(buf_page_t, hash, &buf_pool.page_hash, fold, bpage);
- bpage->set_io_fix(BUF_IO_READ);
- hash_lock->write_unlock();
+ if (hash_page)
+ bpage->set_state(buf_pool.watch_remove(hash_page, chain) +
+ (buf_page_t::READ_FIX - buf_page_t::UNFIXED));
+
+ buf_pool.page_hash.append(chain, bpage);
+ }
/* The block must be put to the LRU list, to the old blocks.
The zip size is already set into the page zip */
buf_LRU_add_block(bpage, true/* to old blocks */);
}
- mysql_mutex_unlock(&buf_pool.mutex);
- buf_pool.n_pend_reads++;
- goto func_exit_no_mutex;
+ buf_pool.stat.n_pages_read++;
func_exit:
mysql_mutex_unlock(&buf_pool.mutex);
-func_exit_no_mutex:
+
if (mode == BUF_READ_IBUF_PAGES_ONLY)
ibuf_mtr_commit(&mtr);
@@ -254,20 +243,18 @@ buffer buf_pool if it is not already there, in which case does nothing.
Sets the io_fix flag and sets an exclusive lock on the buffer frame. The
flag is cleared and the x-lock released by an i/o-handler thread.
-@param[out] err DB_SUCCESS or DB_TABLESPACE_DELETED
- if we are trying
- to read from a non-existent tablespace
@param[in,out] space tablespace
@param[in] sync true if synchronous aio is desired
@param[in] mode BUF_READ_IBUF_PAGES_ONLY, ...,
@param[in] page_id page id
@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
@param[in] unzip true=request uncompressed page
-@return whether a read request was queued */
+@return error code
+@retval DB_SUCCESS if the page was read
+@retval DB_SUCCESS_LOCKED_REC if the page exists in the buffer pool already */
static
-bool
+dberr_t
buf_read_page_low(
- dberr_t* err,
fil_space_t* space,
bool sync,
ulint mode,
@@ -277,15 +264,12 @@ buf_read_page_low(
{
buf_page_t* bpage;
- *err = DB_SUCCESS;
-
if (buf_dblwr.is_inside(page_id)) {
ib::error() << "Trying to read doublewrite buffer page "
<< page_id;
ut_ad(0);
-nothing_read:
space->release();
- return false;
+ return DB_PAGE_CORRUPTED;
}
if (sync) {
@@ -308,8 +292,9 @@ nothing_read:
completed */
bpage = buf_page_init_for_read(mode, page_id, zip_size, unzip);
- if (bpage == NULL) {
- goto nothing_read;
+ if (!bpage) {
+ space->release();
+ return DB_SUCCESS_LOCKED_REC;
}
ut_ad(bpage->in_file());
@@ -322,47 +307,27 @@ nothing_read:
"read page " << page_id << " zip_size=" << zip_size
<< " unzip=" << unzip << ',' << (sync ? "sync" : "async"));
- void* dst;
-
- if (zip_size) {
- dst = bpage->zip.data;
- } else {
- ut_a(bpage->state() == BUF_BLOCK_FILE_PAGE);
-
- dst = ((buf_block_t*) bpage)->frame;
- }
-
+ void* dst = zip_size ? bpage->zip.data : bpage->frame;
const ulint len = zip_size ? zip_size : srv_page_size;
auto fio = space->io(IORequest(sync
? IORequest::READ_SYNC
: IORequest::READ_ASYNC),
page_id.page_no() * len, len, dst, bpage);
- *err= fio.err;
if (UNIV_UNLIKELY(fio.err != DB_SUCCESS)) {
- if (!sync || fio.err == DB_TABLESPACE_DELETED
- || fio.err == DB_IO_ERROR) {
- buf_pool.corrupted_evict(bpage);
- return false;
- }
-
- ut_error;
- }
-
- if (sync) {
+ buf_pool.corrupted_evict(bpage, buf_page_t::READ_FIX);
+ } else if (sync) {
thd_wait_end(NULL);
-
/* The i/o was already completed in space->io() */
- *err = buf_page_read_complete(bpage, *fio.node);
+ fio.err = bpage->read_complete(*fio.node);
space->release();
-
- if (*err != DB_SUCCESS) {
- return false;
+ if (fio.err == DB_FAIL) {
+ fio.err = DB_PAGE_CORRUPTED;
}
}
- return true;
+ return fio.err;
}
/** Applies a random read-ahead in buf_pool if there are at least a threshold
@@ -381,6 +346,7 @@ wants to access
@return number of page read requests issued; NOTE that if we read ibuf
pages, it may happen that the page at the given page number does not
get read even if we return a positive value! */
+TRANSACTIONAL_TARGET
ulint
buf_read_ahead_random(const page_id_t page_id, ulint zip_size, bool ibuf)
{
@@ -396,7 +362,8 @@ buf_read_ahead_random(const page_id_t page_id, ulint zip_size, bool ibuf)
read-ahead, as that could break the ibuf page access order */
return 0;
- if (buf_pool.n_pend_reads > buf_pool.curr_size / BUF_READ_AHEAD_PEND_LIMIT)
+ if (os_aio_pending_reads_approx() >
+ buf_pool.curr_size / BUF_READ_AHEAD_PEND_LIMIT)
return 0;
fil_space_t* space= fil_space_t::get(page_id.space());
@@ -414,13 +381,12 @@ buf_read_ahead_random(const page_id_t page_id, ulint zip_size, bool ibuf)
for (page_id_t i= low; i < high; ++i)
{
- const ulint fold= i.fold();
- page_hash_latch *hash_lock= buf_pool.page_hash.lock<false>(fold);
- const buf_page_t *bpage= buf_pool.page_hash_get_low(i, fold);
- bool found= bpage && bpage->is_accessed() && buf_page_peek_if_young(bpage);
- hash_lock->read_unlock();
- if (found && !--count)
- goto read_ahead;
+ buf_pool_t::hash_chain &chain= buf_pool.page_hash.cell_get(i.fold());
+ transactional_shared_lock_guard<page_hash_latch> g
+ {buf_pool.page_hash.lock_get(chain)};
+ if (const buf_page_t *bpage= buf_pool.page_hash.get(i, chain))
+ if (bpage->is_accessed() && buf_page_peek_if_young(bpage) && !--count)
+ goto read_ahead;
}
no_read_ahead:
@@ -440,24 +406,26 @@ read_ahead:
continue;
if (space->is_stopping())
break;
- dberr_t err;
space->reacquire();
- if (buf_read_page_low(&err, space, false, ibuf_mode, i, zip_size, false))
+ if (buf_read_page_low(space, false, ibuf_mode, i, zip_size, false) ==
+ DB_SUCCESS)
count++;
}
if (count)
+ {
DBUG_PRINT("ib_buf", ("random read-ahead %zu pages from %s: %u",
count, space->chain.start->name,
low.page_no()));
- space->release();
-
- /* Read ahead is considered one I/O operation for the purpose of
- LRU policy decision. */
- buf_LRU_stat_inc_io();
+ mysql_mutex_lock(&buf_pool.mutex);
+ /* Read ahead is considered one I/O operation for the purpose of
+ LRU policy decision. */
+ buf_LRU_stat_inc_io();
+ buf_pool.stat.n_ra_pages_read_rnd+= count;
+ mysql_mutex_unlock(&buf_pool.mutex);
+ }
- buf_pool.stat.n_ra_pages_read_rnd+= count;
- srv_stats.buf_pool_reads.add(count);
+ space->release();
return count;
}
@@ -467,8 +435,9 @@ on the buffer frame. The flag is cleared and the x-lock
released by the i/o-handler thread.
@param[in] page_id page id
@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
-@retval DB_SUCCESS if the page was read and is not corrupted,
-@retval DB_PAGE_CORRUPTED if page based on checksum check is corrupted,
+@retval DB_SUCCESS if the page was read and is not corrupted
+@retval DB_SUCCESS_LOCKED_REC if the page was not read
+@retval DB_PAGE_CORRUPTED if page based on checksum check is corrupted
@retval DB_DECRYPTION_FAILED if page post encryption checksum matches but
after decryption normal page checksum does not match.
@retval DB_TABLESPACE_DELETED if tablespace .ibd file is missing */
@@ -482,13 +451,9 @@ dberr_t buf_read_page(const page_id_t page_id, ulint zip_size)
return DB_TABLESPACE_DELETED;
}
- dberr_t err;
- if (buf_read_page_low(&err, space, true, BUF_READ_ANY_PAGE,
- page_id, zip_size, false))
- srv_stats.buf_pool_reads.add(1);
-
- buf_LRU_stat_inc_io();
- return err;
+ buf_LRU_stat_inc_io(); /* NOT protected by buf_pool.mutex */
+ return buf_read_page_low(space, true, BUF_READ_ANY_PAGE,
+ page_id, zip_size, false);
}
/** High-level function which reads a page asynchronously from a file to the
@@ -501,32 +466,8 @@ released by the i/o-handler thread.
void buf_read_page_background(fil_space_t *space, const page_id_t page_id,
ulint zip_size)
{
- dberr_t err;
-
- if (buf_read_page_low(&err, space, false, BUF_READ_ANY_PAGE,
- page_id, zip_size, false)) {
- srv_stats.buf_pool_reads.add(1);
- }
-
- switch (err) {
- case DB_SUCCESS:
- case DB_ERROR:
- break;
- case DB_TABLESPACE_DELETED:
- ib::info() << "trying to read page " << page_id
- << " in the background"
- " in a non-existing or being-dropped tablespace";
- break;
- case DB_PAGE_CORRUPTED:
- case DB_DECRYPTION_FAILED:
- ib::error()
- << "Background Page read failed to "
- "read or decrypt " << page_id;
- break;
- default:
- ib::fatal() << "Error " << err << " in background read of "
- << page_id;
- }
+ buf_read_page_low(space, false, BUF_READ_ANY_PAGE,
+ page_id, zip_size, false);
/* We do not increment number of I/O operations used for LRU policy
here (buf_LRU_stat_inc_io()). We use this in heuristics to decide
@@ -562,6 +503,7 @@ which could result in a deadlock if the OS does not support asynchronous io.
@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
@param[in] ibuf whether if we are inside ibuf routine
@return number of page read requests issued */
+TRANSACTIONAL_TARGET
ulint
buf_read_ahead_linear(const page_id_t page_id, ulint zip_size, bool ibuf)
{
@@ -573,7 +515,8 @@ buf_read_ahead_linear(const page_id_t page_id, ulint zip_size, bool ibuf)
/* No read-ahead to avoid thread deadlocks */
return 0;
- if (buf_pool.n_pend_reads > buf_pool.curr_size / BUF_READ_AHEAD_PEND_LIMIT)
+ if (os_aio_pending_reads_approx() >
+ buf_pool.curr_size / BUF_READ_AHEAD_PEND_LIMIT)
return 0;
const uint32_t buf_read_ahead_area= buf_pool.read_ahead_area;
@@ -614,9 +557,19 @@ fail:
unsigned prev_accessed= 0;
for (page_id_t i= low; i != high_1; ++i)
{
- const ulint fold= i.fold();
- page_hash_latch *hash_lock= buf_pool.page_hash.lock<false>(fold);
- const buf_page_t* bpage= buf_pool.page_hash_get_low(i, fold);
+ buf_pool_t::hash_chain &chain= buf_pool.page_hash.cell_get(i.fold());
+ transactional_shared_lock_guard<page_hash_latch> g
+ {buf_pool.page_hash.lock_get(chain)};
+ const buf_page_t* bpage= buf_pool.page_hash.get(i, chain);
+ if (!bpage)
+ {
+ if (i == page_id)
+ goto fail;
+failed:
+ if (--count)
+ continue;
+ goto fail;
+ }
if (i == page_id)
{
/* Read the natural predecessor and successor page addresses from
@@ -624,53 +577,28 @@ fail:
on the page, we do not acquire an s-latch on the page, this is to
prevent deadlocks. The hash_lock is only protecting the
buf_pool.page_hash for page i, not the bpage contents itself. */
- if (!bpage)
- {
-hard_fail:
- hash_lock->read_unlock();
- goto fail;
- }
- const byte *f;
- switch (UNIV_EXPECT(bpage->state(), BUF_BLOCK_FILE_PAGE)) {
- case BUF_BLOCK_FILE_PAGE:
- f= reinterpret_cast<const buf_block_t*>(bpage)->frame;
- break;
- case BUF_BLOCK_ZIP_PAGE:
- f= bpage->zip.data;
- break;
- default:
- goto hard_fail;
- }
-
+ const byte *f= bpage->frame ? bpage->frame : bpage->zip.data;
uint32_t prev= mach_read_from_4(my_assume_aligned<4>(f + FIL_PAGE_PREV));
uint32_t next= mach_read_from_4(my_assume_aligned<4>(f + FIL_PAGE_NEXT));
if (prev == FIL_NULL || next == FIL_NULL)
- goto hard_fail;
+ goto fail;
page_id_t id= page_id;
if (descending && next - 1 == page_id.page_no())
id.set_page_no(prev);
else if (!descending && prev + 1 == page_id.page_no())
id.set_page_no(next);
else
- goto hard_fail; /* Successor or predecessor not in the right order */
+ goto fail; /* Successor or predecessor not in the right order */
new_low= id - (id.page_no() % buf_read_ahead_area);
new_high_1= new_low + (buf_read_ahead_area - 1);
if (id != new_low && id != new_high_1)
/* This is not a border page of the area: return */
- goto hard_fail;
+ goto fail;
if (new_high_1.page_no() > space->last_page_number())
/* The area is not whole */
- goto hard_fail;
- }
- else if (!bpage)
- {
-failed:
- hash_lock->read_unlock();
- if (--count)
- continue;
- goto fail;
+ goto fail;
}
const unsigned accessed= bpage->is_accessed();
@@ -687,7 +615,6 @@ failed:
prev_accessed= accessed;
if (fail)
goto failed;
- hash_lock->read_unlock();
}
/* If we got this far, read-ahead can be sensible: do it */
@@ -699,26 +626,36 @@ failed:
continue;
if (space->is_stopping())
break;
- dberr_t err;
space->reacquire();
- count+= buf_read_page_low(&err, space, false, ibuf_mode, new_low, zip_size,
- false);
+ if (buf_read_page_low(space, false, ibuf_mode, new_low, zip_size, false) ==
+ DB_SUCCESS)
+ count++;
}
if (count)
+ {
DBUG_PRINT("ib_buf", ("random read-ahead %zu pages from %s: %u",
count, space->chain.start->name,
new_low.page_no()));
- space->release();
-
- /* Read ahead is considered one I/O operation for the purpose of
- LRU policy decision. */
- buf_LRU_stat_inc_io();
+ mysql_mutex_lock(&buf_pool.mutex);
+ /* Read ahead is considered one I/O operation for the purpose of
+ LRU policy decision. */
+ buf_LRU_stat_inc_io();
+ buf_pool.stat.n_ra_pages_read+= count;
+ mysql_mutex_unlock(&buf_pool.mutex);
+ }
- buf_pool.stat.n_ra_pages_read+= count;
+ space->release();
return count;
}
+/** @return whether a page has been freed */
+inline bool fil_space_t::is_freed(uint32_t page)
+{
+ std::lock_guard<std::mutex> freed_lock(freed_range_mutex);
+ return freed_ranges.contains(page);
+}
+
/** Issues read requests for pages which recovery wants to read in.
@param[in] space_id tablespace id
@param[in] page_nos array of page numbers to read, with the
@@ -738,7 +675,7 @@ void buf_read_recv_pages(ulint space_id, const uint32_t* page_nos, ulint n)
for (ulint i = 0; i < n; i++) {
/* Ignore if the page already present in freed ranges. */
- if (space->freed_ranges.contains(page_nos[i])) {
+ if (space->is_freed(page_nos[i])) {
continue;
}
@@ -749,32 +686,23 @@ void buf_read_recv_pages(ulint space_id, const uint32_t* page_nos, ulint n)
limit += buf_pool.chunks[j].size / 2;
}
- for (ulint count = 0; buf_pool.n_pend_reads >= limit; ) {
- os_thread_sleep(10000);
-
- if (!(++count % 1000)) {
-
- ib::error()
- << "Waited for " << count / 100
- << " seconds for "
- << buf_pool.n_pend_reads
- << " pending reads";
- }
+ if (os_aio_pending_reads() >= limit) {
+ os_aio_wait_until_no_pending_reads();
}
- dberr_t err;
space->reacquire();
- buf_read_page_low(&err, space, false,
- BUF_READ_ANY_PAGE, cur_page_id, zip_size,
- true);
-
- if (err == DB_DECRYPTION_FAILED || err == DB_PAGE_CORRUPTED) {
- ib::error() << "Recovery failed to read or decrypt "
- << cur_page_id;
+ switch (buf_read_page_low(space, false, BUF_READ_ANY_PAGE,
+ cur_page_id, zip_size, true)) {
+ case DB_SUCCESS: case DB_SUCCESS_LOCKED_REC:
+ break;
+ default:
+ sql_print_error("InnoDB: Recovery failed to read page "
+ UINT32PF " from %s",
+ cur_page_id.page_no(),
+ space->chain.start->name);
}
}
-
DBUG_PRINT("ib_buf", ("recovery read (%u pages) for %s", n,
space->chain.start->name));
space->release();
diff --git a/storage/innobase/dict/dict0boot.cc b/storage/innobase/dict/dict0boot.cc
index 1414f21221f..e57121041d5 100644
--- a/storage/innobase/dict/dict0boot.cc
+++ b/storage/innobase/dict/dict0boot.cc
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 1996, 2017, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2016, 2020, MariaDB Corporation.
+Copyright (c) 2016, 2022, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -35,13 +35,14 @@ Created 4/18/1996 Heikki Tuuri
#include "log0recv.h"
#include "os0file.h"
+/** The DICT_HDR page identifier */
+static constexpr page_id_t hdr_page_id{DICT_HDR_SPACE, DICT_HDR_PAGE_NO};
+
/** @return the DICT_HDR block, x-latched */
-buf_block_t *dict_hdr_get(mtr_t* mtr)
+static buf_block_t *dict_hdr_get(mtr_t *mtr)
{
- buf_block_t *block= buf_page_get(page_id_t(DICT_HDR_SPACE, DICT_HDR_PAGE_NO),
- 0, RW_X_LATCH, mtr);
- buf_block_dbg_add_level(block, SYNC_DICT_HEADER);
- return block;
+ /* We assume that the DICT_HDR page is always readable and available. */
+ return buf_page_get_gen(hdr_page_id, 0, RW_X_LATCH, nullptr, BUF_GET, mtr);
}
/**********************************************************************//**
@@ -64,97 +65,83 @@ dict_hdr_get_new_id(
if (table_id) {
id = mach_read_from_8(DICT_HDR + DICT_HDR_TABLE_ID
- + dict_hdr->frame);
+ + dict_hdr->page.frame);
id++;
mtr.write<8>(*dict_hdr, DICT_HDR + DICT_HDR_TABLE_ID
- + dict_hdr->frame, id);
+ + dict_hdr->page.frame, id);
*table_id = id;
}
if (index_id) {
id = mach_read_from_8(DICT_HDR + DICT_HDR_INDEX_ID
- + dict_hdr->frame);
+ + dict_hdr->page.frame);
id++;
mtr.write<8>(*dict_hdr, DICT_HDR + DICT_HDR_INDEX_ID
- + dict_hdr->frame, id);
+ + dict_hdr->page.frame, id);
*index_id = id;
}
if (space_id) {
*space_id = mach_read_from_4(DICT_HDR + DICT_HDR_MAX_SPACE_ID
- + dict_hdr->frame);
+ + dict_hdr->page.frame);
if (fil_assign_new_space_id(space_id)) {
mtr.write<4>(*dict_hdr,
DICT_HDR + DICT_HDR_MAX_SPACE_ID
- + dict_hdr->frame, *space_id);
+ + dict_hdr->page.frame, *space_id);
}
}
mtr.commit();
}
-/**********************************************************************//**
-Writes the current value of the row id counter to the dictionary header file
-page. */
-void
-dict_hdr_flush_row_id(void)
-/*=======================*/
+/** Update dict_sys.row_id in the dictionary header file page. */
+void dict_hdr_flush_row_id(row_id_t id)
{
- row_id_t id;
- mtr_t mtr;
-
- ut_ad(mutex_own(&dict_sys.mutex));
-
- id = dict_sys.row_id;
-
- mtr.start();
-
- buf_block_t* d = dict_hdr_get(&mtr);
-
- mtr.write<8>(*d, DICT_HDR + DICT_HDR_ROW_ID + d->frame, id);
-
- mtr.commit();
+ mtr_t mtr;
+ mtr.start();
+ buf_block_t* d= dict_hdr_get(&mtr);
+ byte *row_id= DICT_HDR + DICT_HDR_ROW_ID + d->page.frame;
+ if (mach_read_from_8(row_id) < id)
+ mtr.write<8>(*d, row_id, id);
+ mtr.commit();
}
-/*****************************************************************//**
-Creates the file page for the dictionary header. This function is
-called only at the database creation.
-@return TRUE if succeed */
-static
-ibool
-dict_hdr_create(
-/*============*/
- mtr_t* mtr) /*!< in: mtr */
+/** Create the DICT_HDR page on database initialization.
+@return error code */
+dberr_t dict_create()
{
- buf_block_t* block;
ulint root_page_no;
- ut_ad(mtr);
+ dberr_t err;
+ mtr_t mtr;
+ mtr.start();
compile_time_assert(DICT_HDR_SPACE == 0);
/* Create the dictionary header file block in a new, allocated file
segment in the system tablespace */
- block = fseg_create(fil_system.sys_space,
- DICT_HDR + DICT_HDR_FSEG_HEADER, mtr);
-
- ut_a(block->page.id() == page_id_t(DICT_HDR_SPACE, DICT_HDR_PAGE_NO));
-
- buf_block_t* d = dict_hdr_get(mtr);
+ buf_block_t* d = fseg_create(fil_system.sys_space,
+ DICT_HDR + DICT_HDR_FSEG_HEADER, &mtr,
+ &err);
+ if (!d) {
+ goto func_exit;
+ }
+ ut_a(d->page.id() == hdr_page_id);
/* Start counting row, table, index, and tree ids from
DICT_HDR_FIRST_ID */
- mtr->write<8>(*d, DICT_HDR + DICT_HDR_ROW_ID + d->frame,
- DICT_HDR_FIRST_ID);
- mtr->write<8>(*d, DICT_HDR + DICT_HDR_TABLE_ID + d->frame,
- DICT_HDR_FIRST_ID);
- mtr->write<8>(*d, DICT_HDR + DICT_HDR_INDEX_ID + d->frame,
- DICT_HDR_FIRST_ID);
+ mtr.write<8>(*d, DICT_HDR + DICT_HDR_ROW_ID + d->page.frame,
+ DICT_HDR_FIRST_ID);
+ mtr.write<8>(*d, DICT_HDR + DICT_HDR_TABLE_ID + d->page.frame,
+ DICT_HDR_FIRST_ID);
+ mtr.write<8>(*d, DICT_HDR + DICT_HDR_INDEX_ID + d->page.frame,
+ DICT_HDR_FIRST_ID);
- ut_ad(!mach_read_from_4(DICT_HDR + DICT_HDR_MAX_SPACE_ID + d->frame));
+ ut_ad(!mach_read_from_4(DICT_HDR + DICT_HDR_MAX_SPACE_ID
+ + d->page.frame));
/* Obsolete, but we must initialize it anyway. */
- mtr->write<4>(*d, DICT_HDR + DICT_HDR_MIX_ID_LOW + d->frame,
- DICT_HDR_FIRST_ID);
+ mtr.write<4>(*d, DICT_HDR + DICT_HDR_MIX_ID_LOW + d->page.frame,
+ DICT_HDR_FIRST_ID);
/* Create the B-tree roots for the clustered indexes of the basic
system tables */
@@ -162,103 +149,102 @@ dict_hdr_create(
/*--------------------------*/
root_page_no = btr_create(DICT_CLUSTERED | DICT_UNIQUE,
fil_system.sys_space, DICT_TABLES_ID,
- nullptr, mtr);
+ nullptr, &mtr, &err);
if (root_page_no == FIL_NULL) {
-
- return(FALSE);
+ goto func_exit;
}
- mtr->write<4>(*d, DICT_HDR + DICT_HDR_TABLES + d->frame, root_page_no);
+ mtr.write<4>(*d, DICT_HDR + DICT_HDR_TABLES + d->page.frame,
+ root_page_no);
/*--------------------------*/
root_page_no = btr_create(DICT_UNIQUE,
fil_system.sys_space, DICT_TABLE_IDS_ID,
- nullptr, mtr);
+ nullptr, &mtr, &err);
if (root_page_no == FIL_NULL) {
-
- return(FALSE);
+ goto func_exit;
}
- mtr->write<4>(*d, DICT_HDR + DICT_HDR_TABLE_IDS + d->frame,
- root_page_no);
+ mtr.write<4>(*d, DICT_HDR + DICT_HDR_TABLE_IDS + d->page.frame,
+ root_page_no);
/*--------------------------*/
root_page_no = btr_create(DICT_CLUSTERED | DICT_UNIQUE,
fil_system.sys_space, DICT_COLUMNS_ID,
- nullptr, mtr);
+ nullptr, &mtr, &err);
if (root_page_no == FIL_NULL) {
-
- return(FALSE);
+ goto func_exit;
}
- mtr->write<4>(*d, DICT_HDR + DICT_HDR_COLUMNS + d->frame,
- root_page_no);
+ mtr.write<4>(*d, DICT_HDR + DICT_HDR_COLUMNS + d->page.frame,
+ root_page_no);
/*--------------------------*/
root_page_no = btr_create(DICT_CLUSTERED | DICT_UNIQUE,
fil_system.sys_space, DICT_INDEXES_ID,
- nullptr, mtr);
+ nullptr, &mtr, &err);
if (root_page_no == FIL_NULL) {
-
- return(FALSE);
+ goto func_exit;
}
- mtr->write<4>(*d, DICT_HDR + DICT_HDR_INDEXES + d->frame,
- root_page_no);
+ mtr.write<4>(*d, DICT_HDR + DICT_HDR_INDEXES + d->page.frame,
+ root_page_no);
/*--------------------------*/
root_page_no = btr_create(DICT_CLUSTERED | DICT_UNIQUE,
fil_system.sys_space, DICT_FIELDS_ID,
- nullptr, mtr);
+ nullptr, &mtr, &err);
if (root_page_no == FIL_NULL) {
-
- return(FALSE);
+ goto func_exit;
}
- mtr->write<4>(*d, DICT_HDR + DICT_HDR_FIELDS + d->frame, root_page_no);
- /*--------------------------*/
-
- return(TRUE);
+ mtr.write<4>(*d, DICT_HDR + DICT_HDR_FIELDS + d->page.frame,
+ root_page_no);
+func_exit:
+ mtr.commit();
+ return err ? err : dict_boot();
}
/*****************************************************************//**
Initializes the data dictionary memory structures when the database is
started. This function is also called when the data dictionary is created.
@return DB_SUCCESS or error code. */
-dberr_t
-dict_boot(void)
-/*===========*/
+dberr_t dict_boot()
{
dict_table_t* table;
dict_index_t* index;
mem_heap_t* heap;
mtr_t mtr;
- /* Be sure these constants do not ever change. To avoid bloat,
- only check the *NUM_FIELDS* in each table */
-
- ut_ad(DICT_NUM_COLS__SYS_TABLES == 8);
- ut_ad(DICT_NUM_FIELDS__SYS_TABLES == 10);
- ut_ad(DICT_NUM_FIELDS__SYS_TABLE_IDS == 2);
- ut_ad(DICT_NUM_COLS__SYS_COLUMNS == 7);
- ut_ad(DICT_NUM_FIELDS__SYS_COLUMNS == 9);
- ut_ad(DICT_NUM_COLS__SYS_INDEXES == 8);
- ut_ad(DICT_NUM_FIELDS__SYS_INDEXES == 10);
- ut_ad(DICT_NUM_COLS__SYS_FIELDS == 3);
- ut_ad(DICT_NUM_FIELDS__SYS_FIELDS == 5);
- ut_ad(DICT_NUM_COLS__SYS_FOREIGN == 4);
- ut_ad(DICT_NUM_FIELDS__SYS_FOREIGN == 6);
- ut_ad(DICT_NUM_FIELDS__SYS_FOREIGN_FOR_NAME == 2);
- ut_ad(DICT_NUM_COLS__SYS_FOREIGN_COLS == 4);
- ut_ad(DICT_NUM_FIELDS__SYS_FOREIGN_COLS == 6);
-
- mtr_start(&mtr);
+ static_assert(DICT_NUM_COLS__SYS_TABLES == 8, "compatibility");
+ static_assert(DICT_NUM_FIELDS__SYS_TABLES == 10, "compatibility");
+ static_assert(DICT_NUM_FIELDS__SYS_TABLE_IDS == 2, "compatibility");
+ static_assert(DICT_NUM_COLS__SYS_COLUMNS == 7, "compatibility");
+ static_assert(DICT_NUM_FIELDS__SYS_COLUMNS == 9, "compatibility");
+ static_assert(DICT_NUM_COLS__SYS_INDEXES == 8, "compatibility");
+ static_assert(DICT_NUM_FIELDS__SYS_INDEXES == 10, "compatibility");
+ static_assert(DICT_NUM_COLS__SYS_FIELDS == 3, "compatibility");
+ static_assert(DICT_NUM_FIELDS__SYS_FIELDS == 5, "compatibility");
+ static_assert(DICT_NUM_COLS__SYS_FOREIGN == 4, "compatibility");
+ static_assert(DICT_NUM_FIELDS__SYS_FOREIGN == 6, "compatibility");
+ static_assert(DICT_NUM_FIELDS__SYS_FOREIGN_FOR_NAME == 2,
+ "compatibility");
+ static_assert(DICT_NUM_COLS__SYS_FOREIGN_COLS == 4, "compatibility");
+ static_assert(DICT_NUM_FIELDS__SYS_FOREIGN_COLS == 6, "compatibility");
+ mtr.start();
/* Create the hash tables etc. */
dict_sys.create();
+ dberr_t err;
+ const buf_block_t *d = buf_page_get_gen(hdr_page_id, 0, RW_X_LATCH,
+ nullptr, BUF_GET, &mtr, &err);
+ if (!d) {
+ mtr.commit();
+ return err;
+ }
+
heap = mem_heap_create(450);
- mutex_enter(&dict_sys.mutex);
+ dict_sys.lock(SRW_LOCK_CALL);
- /* Get the dictionary header */
- const byte* dict_hdr = &dict_hdr_get(&mtr)->frame[DICT_HDR];
+ const byte* dict_hdr = &d->page.frame[DICT_HDR];
/* Because we only write new row ids to disk-based data structure
(dictionary header) when it is divisible by
@@ -270,9 +256,7 @@ dict_boot(void)
..._MARGIN, it will immediately be updated to the disk-based
header. */
- dict_sys.row_id = DICT_HDR_ROW_ID_WRITE_MARGIN
- + ut_uint64_align_up(mach_read_from_8(dict_hdr + DICT_HDR_ROW_ID),
- DICT_HDR_ROW_ID_WRITE_MARGIN);
+ dict_sys.recover_row_id(mach_read_from_8(dict_hdr + DICT_HDR_ROW_ID));
if (ulint max_space_id = mach_read_from_4(dict_hdr
+ DICT_HDR_MAX_SPACE_ID)) {
max_space_id--;
@@ -282,9 +266,9 @@ dict_boot(void)
/* Insert into the dictionary cache the descriptions of the basic
system tables */
/*-------------------------*/
- table = dict_mem_table_create("SYS_TABLES", fil_system.sys_space,
- 8, 0, 0, 0);
-
+ table = dict_table_t::create(dict_sys.SYS_TABLE[dict_sys.SYS_TABLES],
+ fil_system.sys_space,
+ DICT_NUM_COLS__SYS_TABLES, 0, 0, 0);
dict_mem_table_add_col(table, heap, "NAME", DATA_BINARY, 0,
MAX_FULL_NAME_LEN);
dict_mem_table_add_col(table, heap, "ID", DATA_BINARY, 0, 8);
@@ -313,7 +297,7 @@ dict_boot(void)
dict_mem_index_add_field(index, "NAME", 0);
index->id = DICT_TABLES_ID;
- dberr_t err = dict_index_add_to_cache(
+ err = dict_index_add_to_cache(
index, mach_read_from_4(dict_hdr + DICT_HDR_TABLES));
ut_a(err == DB_SUCCESS);
ut_ad(!table->is_instant());
@@ -330,9 +314,9 @@ dict_boot(void)
ut_a(err == DB_SUCCESS);
/*-------------------------*/
- table = dict_mem_table_create("SYS_COLUMNS", fil_system.sys_space,
- 7, 0, 0, 0);
-
+ table = dict_table_t::create(dict_sys.SYS_TABLE[dict_sys.SYS_COLUMNS],
+ fil_system.sys_space,
+ DICT_NUM_COLS__SYS_COLUMNS, 0, 0, 0);
dict_mem_table_add_col(table, heap, "TABLE_ID", DATA_BINARY, 0, 8);
dict_mem_table_add_col(table, heap, "POS", DATA_INT, 0, 4);
dict_mem_table_add_col(table, heap, "NAME", DATA_BINARY, 0, 0);
@@ -363,16 +347,16 @@ dict_boot(void)
UT_BITS_IN_BYTES(unsigned(table->indexes.start->n_nullable)));
/*-------------------------*/
- table = dict_mem_table_create("SYS_INDEXES", fil_system.sys_space,
- DICT_NUM_COLS__SYS_INDEXES, 0, 0, 0);
+ table = dict_table_t::create(dict_sys.SYS_TABLE[dict_sys.SYS_INDEXES],
+ fil_system.sys_space,
+ DICT_NUM_COLS__SYS_INDEXES, 0, 0, 0);
dict_mem_table_add_col(table, heap, "TABLE_ID", DATA_BINARY, 0, 8);
dict_mem_table_add_col(table, heap, "ID", DATA_BINARY, 0, 8);
dict_mem_table_add_col(table, heap, "NAME", DATA_BINARY, 0, 0);
dict_mem_table_add_col(table, heap, "N_FIELDS", DATA_INT, 0, 4);
dict_mem_table_add_col(table, heap, "TYPE", DATA_INT, 0, 4);
- /* SYS_INDEXES.SPACE is redundant and not being read;
- SYS_TABLES.SPACE is being used instead. */
+ /* SYS_INDEXES.SPACE is only read by in dict_drop_index_tree() */
dict_mem_table_add_col(table, heap, "SPACE", DATA_INT, 0, 4);
dict_mem_table_add_col(table, heap, "PAGE_NO", DATA_INT, 0, 4);
dict_mem_table_add_col(table, heap, "MERGE_THRESHOLD", DATA_INT, 0, 4);
@@ -406,9 +390,9 @@ dict_boot(void)
UT_BITS_IN_BYTES(unsigned(table->indexes.start->n_nullable)));
/*-------------------------*/
- table = dict_mem_table_create("SYS_FIELDS", fil_system.sys_space,
- 3, 0, 0, 0);
-
+ table = dict_table_t::create(dict_sys.SYS_TABLE[dict_sys.SYS_FIELDS],
+ fil_system.sys_space,
+ DICT_NUM_COLS__SYS_FIELDS, 0, 0, 0);
dict_mem_table_add_col(table, heap, "INDEX_ID", DATA_BINARY, 0, 8);
dict_mem_table_add_col(table, heap, "POS", DATA_INT, 0, 4);
dict_mem_table_add_col(table, heap, "COL_NAME", DATA_BINARY, 0, 0);
@@ -434,16 +418,11 @@ dict_boot(void)
table->indexes.start->n_core_null_bytes = static_cast<uint8_t>(
UT_BITS_IN_BYTES(unsigned(table->indexes.start->n_nullable)));
- mtr_commit(&mtr);
-
- /*-------------------------*/
-
- /* Initialize the insert buffer table and index for each tablespace */
+ mtr.commit();
err = ibuf_init_at_db_start();
- if (err == DB_SUCCESS
- || srv_force_recovery >= SRV_FORCE_NO_IBUF_MERGE) {
+ if (err == DB_SUCCESS || srv_force_recovery >= SRV_FORCE_NO_DDL_UNDO) {
err = DB_SUCCESS;
/* Load definitions of other indexes on system tables */
@@ -451,44 +430,11 @@ dict_boot(void)
dict_load_sys_table(dict_sys.sys_columns);
dict_load_sys_table(dict_sys.sys_indexes);
dict_load_sys_table(dict_sys.sys_fields);
+ dict_sys.unlock();
+ dict_sys.load_sys_tables();
+ } else {
+ dict_sys.unlock();
}
- mutex_exit(&dict_sys.mutex);
-
- return(err);
-}
-
-/*****************************************************************//**
-Inserts the basic system table data into themselves in the database
-creation. */
-static
-void
-dict_insert_initial_data(void)
-/*==========================*/
-{
- /* Does nothing yet */
-}
-
-/*****************************************************************//**
-Creates and initializes the data dictionary at the server bootstrap.
-@return DB_SUCCESS or error code. */
-dberr_t
-dict_create(void)
-/*=============*/
-{
- mtr_t mtr;
-
- mtr_start(&mtr);
-
- dict_hdr_create(&mtr);
-
- mtr_commit(&mtr);
-
- dberr_t err = dict_boot();
-
- if (err == DB_SUCCESS) {
- dict_insert_initial_data();
- }
-
- return(err);
+ return err;
}
diff --git a/storage/innobase/dict/dict0crea.cc b/storage/innobase/dict/dict0crea.cc
index 2e9b2aa5d57..614048b7ba0 100644
--- a/storage/innobase/dict/dict0crea.cc
+++ b/storage/innobase/dict/dict0crea.cc
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2017, 2020, MariaDB Corporation.
+Copyright (c) 2017, 2023, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -33,6 +33,7 @@ Created 1/8/1996 Heikki Tuuri
#include "mach0data.h"
#include "dict0boot.h"
#include "dict0dict.h"
+#include "lock0lock.h"
#include "que0que.h"
#include "row0ins.h"
#include "row0mysql.h"
@@ -41,9 +42,9 @@ Created 1/8/1996 Heikki Tuuri
#include "trx0rseg.h"
#include "trx0undo.h"
#include "ut0vec.h"
-#include "dict0priv.h"
#include "fts0priv.h"
#include "srv0start.h"
+#include "log.h"
/*****************************************************************//**
Based on a table object, this function builds the entry to be inserted
@@ -343,14 +344,12 @@ dict_build_table_def_step(
que_thr_t* thr, /*!< in: query thread */
tab_node_t* node) /*!< in: table create node */
{
- ut_ad(mutex_own(&dict_sys.mutex));
+ ut_ad(dict_sys.locked());
dict_table_t* table = node->table;
- trx_t* trx = thr_get_trx(thr);
ut_ad(!table->is_temporary());
ut_ad(!table->space);
ut_ad(table->space_id == ULINT_UNDEFINED);
dict_hdr_get_new_id(&table->id, NULL, NULL);
- trx->table_id = table->id;
/* Always set this bit for all new created tables */
DICT_TF2_FLAG_SET(table, DICT_TF2_FTS_AUX_HEX_NAME);
@@ -363,82 +362,17 @@ dict_build_table_def_step(
ut_ad(DICT_TF_GET_ZIP_SSIZE(table->flags) == 0
|| dict_table_has_atomic_blobs(table));
- mtr_t mtr;
- trx_undo_t* undo = trx->rsegs.m_redo.undo;
- if (undo && !undo->table_id
- && trx_get_dict_operation(trx) == TRX_DICT_OP_TABLE) {
- /* This must be a TRUNCATE operation where
- the empty table is created after the old table
- was renamed. Be sure to mark the transaction
- associated with the new empty table, so that
- we can remove it on recovery. */
- mtr.start();
- undo->table_id = trx->table_id;
- undo->dict_operation = TRUE;
- buf_block_t* block = trx_undo_page_get(
- page_id_t(trx->rsegs.m_redo.rseg->space->id,
- undo->hdr_page_no),
- &mtr);
- mtr.write<1,mtr_t::MAYBE_NOP>(
- *block,
- block->frame + undo->hdr_offset
- + TRX_UNDO_DICT_TRANS, 1U);
- mtr.write<8,mtr_t::MAYBE_NOP>(
- *block,
- block->frame + undo->hdr_offset
- + TRX_UNDO_TABLE_ID, trx->table_id);
- mtr.commit();
- log_write_up_to(mtr.commit_lsn(), true);
- }
/* Get a new tablespace ID */
- ulint space_id;
- dict_hdr_get_new_id(NULL, NULL, &space_id);
+ dict_hdr_get_new_id(NULL, NULL, &table->space_id);
DBUG_EXECUTE_IF(
"ib_create_table_fail_out_of_space_ids",
- space_id = ULINT_UNDEFINED;
+ table->space_id = ULINT_UNDEFINED;
);
- if (space_id == ULINT_UNDEFINED) {
+ if (table->space_id == ULINT_UNDEFINED) {
return DB_ERROR;
}
-
- /* Determine the tablespace flags. */
- bool has_data_dir = DICT_TF_HAS_DATA_DIR(table->flags);
- ulint fsp_flags = dict_tf_to_fsp_flags(table->flags);
- ut_ad(!has_data_dir || table->data_dir_path);
- char* filepath = has_data_dir
- ? fil_make_filepath(table->data_dir_path,
- table->name.m_name, IBD, true)
- : fil_make_filepath(NULL,
- table->name.m_name, IBD, false);
-
- /* We create a new single-table tablespace for the table.
- We initially let it be 4 pages:
- - page 0 is the fsp header and an extent descriptor page,
- - page 1 is an ibuf bitmap page,
- - page 2 is the first inode page,
- - page 3 will contain the root of the clustered index of
- the table we create here. */
-
- dberr_t err;
- table->space = fil_ibd_create(
- space_id, table->name.m_name, filepath, fsp_flags,
- FIL_IBD_FILE_INITIAL_SIZE,
- node->mode, node->key_id, &err);
-
- ut_free(filepath);
-
- if (!table->space) {
- ut_ad(err != DB_SUCCESS);
- return err;
- }
-
- table->space_id = space_id;
- mtr.start();
- mtr.set_named_space(table->space);
- fsp_header_init(table->space, FIL_IBD_FILE_INITIAL_SIZE, &mtr);
- mtr.commit();
} else {
ut_ad(dict_tf_get_rec_format(table->flags)
!= REC_FORMAT_COMPRESSED);
@@ -470,7 +404,7 @@ dict_build_v_col_def_step(
Based on an index object, this function builds the entry to be inserted
in the SYS_INDEXES system table.
@return the tuple which should be inserted */
-static
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
dtuple_t*
dict_create_sys_indexes_tuple(
/*==========================*/
@@ -483,9 +417,10 @@ dict_create_sys_indexes_tuple(
dfield_t* dfield;
byte* ptr;
- ut_ad(mutex_own(&dict_sys.mutex));
+ ut_ad(dict_sys.locked());
ut_ad(index);
- ut_ad(index->table->space || index->table->file_unreadable);
+ ut_ad(index->table->space || !UT_LIST_GET_LEN(index->table->indexes)
+ || index->table->file_unreadable);
ut_ad(!index->table->space
|| index->table->space->id == index->table->space_id);
ut_ad(heap);
@@ -712,32 +647,26 @@ dict_build_index_def_step(
dtuple_t* row;
trx_t* trx;
- ut_ad(mutex_own(&dict_sys.mutex));
+ ut_ad(dict_sys.locked());
trx = thr_get_trx(thr);
index = node->index;
- table = index->table = node->table = dict_table_open_on_name(
- node->table_name, TRUE, FALSE, DICT_ERR_IGNORE_NONE);
+ table = dict_table_open_on_name(
+ node->table_name, true, DICT_ERR_IGNORE_TABLESPACE);
- if (table == NULL) {
- return(DB_TABLE_NOT_FOUND);
+ if (!table) {
+ return DB_TABLE_NOT_FOUND;
}
- if (!trx->table_id) {
- /* Record only the first table id. */
- trx->table_id = table->id;
- }
+ index->table = table;
ut_ad((UT_LIST_GET_LEN(table->indexes) > 0)
|| dict_index_is_clust(index));
dict_hdr_get_new_id(NULL, &index->id, NULL);
- /* Inherit the space id from the table; we store all indexes of a
- table in the same tablespace */
-
node->page_no = FIL_NULL;
row = dict_create_sys_indexes_tuple(index, node->heap);
node->ind_row = row;
@@ -748,7 +677,7 @@ dict_build_index_def_step(
index->trx_id = trx->id;
ut_ad(table->def_trx_id <= trx->id);
table->def_trx_id = trx->id;
- dict_table_close(table, true, false);
+ table->release();
return(DB_SUCCESS);
}
@@ -763,12 +692,7 @@ dict_build_index_def(
dict_index_t* index, /*!< in/out: index */
trx_t* trx) /*!< in/out: InnoDB transaction handle */
{
- ut_ad(mutex_own(&dict_sys.mutex));
-
- if (trx->table_id == 0) {
- /* Record only the first table id. */
- trx->table_id = table->id;
- }
+ ut_ad(dict_sys.locked());
ut_ad((UT_LIST_GET_LEN(table->indexes) > 0)
|| dict_index_is_clust(index));
@@ -798,7 +722,7 @@ dict_build_field_def_step(
}
/***************************************************************//**
-Creates an index tree for the index if it is not a member of a cluster.
+Creates an index tree for the index.
@return DB_SUCCESS or DB_OUT_OF_FILE_SPACE */
static MY_ATTRIBUTE((nonnull, warn_unused_result))
dberr_t
@@ -811,7 +735,7 @@ dict_create_index_tree_step(
dict_index_t* index;
dtuple_t* search_tuple;
- ut_ad(mutex_own(&dict_sys.mutex));
+ ut_ad(dict_sys.locked());
index = node->index;
@@ -827,51 +751,62 @@ dict_create_index_tree_step(
mtr.start();
search_tuple = dict_create_search_tuple(node->ind_row, node->heap);
+ node->page_no = FIL_NULL;
+ pcur.btr_cur.page_cur.index =
+ UT_LIST_GET_FIRST(dict_sys.sys_indexes->indexes);
+
+ dberr_t err = btr_pcur_open(search_tuple, PAGE_CUR_L, BTR_MODIFY_LEAF,
+ &pcur, &mtr);
- btr_pcur_open(UT_LIST_GET_FIRST(dict_sys.sys_indexes->indexes),
- search_tuple, PAGE_CUR_L, BTR_MODIFY_LEAF,
- &pcur, &mtr);
+ if (err != DB_SUCCESS) {
+func_exit:
+ mtr.commit();
+ return err;
+ }
btr_pcur_move_to_next_user_rec(&pcur, &mtr);
+ if (UNIV_UNLIKELY(btr_pcur_is_after_last_on_page(&pcur))) {
+corrupted:
+ err = DB_CORRUPTION;
+ goto func_exit;
+ }
+
+ ulint len;
+ byte* data = rec_get_nth_field_old(btr_pcur_get_rec(&pcur),
+ DICT_FLD__SYS_INDEXES__ID,
+ &len);
+ if (UNIV_UNLIKELY(len != 8 || mach_read_from_8(data) != index->id)) {
+ goto corrupted;
+ }
- dberr_t err = DB_SUCCESS;
+ data = rec_get_nth_field_old(btr_pcur_get_rec(&pcur),
+ DICT_FLD__SYS_INDEXES__PAGE_NO, &len);
+ if (len != 4) {
+ goto corrupted;
+ }
- if (!index->is_readable()) {
- node->page_no = FIL_NULL;
- } else {
+ if (index->is_readable()) {
index->set_modified(mtr);
node->page_no = btr_create(
index->type, index->table->space,
- index->id, index, &mtr);
-
- if (node->page_no == FIL_NULL) {
- err = DB_OUT_OF_FILE_SPACE;
- }
+ index->id, index, &mtr, &err);
DBUG_EXECUTE_IF("ib_import_create_index_failure_1",
node->page_no = FIL_NULL;
err = DB_OUT_OF_FILE_SPACE; );
}
- ulint len;
- byte* data = rec_get_nth_field_old(btr_pcur_get_rec(&pcur),
- DICT_FLD__SYS_INDEXES__PAGE_NO,
- &len);
- ut_ad(len == 4);
mtr.write<4,mtr_t::MAYBE_NOP>(*btr_pcur_get_block(&pcur), data,
node->page_no);
-
- mtr.commit();
-
- return(err);
+ goto func_exit;
}
/***************************************************************//**
Creates an index tree for the index if it is not a member of a cluster.
Don't update SYSTEM TABLES.
-@return DB_SUCCESS or DB_OUT_OF_FILE_SPACE */
+@return error code */
dberr_t
dict_create_index_tree_in_mem(
/*==========================*/
@@ -880,7 +815,7 @@ dict_create_index_tree_in_mem(
{
mtr_t mtr;
- ut_ad(mutex_own(&dict_sys.mutex));
+ ut_ad(dict_sys.locked());
ut_ad(!(index->type & DICT_FTS));
mtr_start(&mtr);
@@ -891,73 +826,88 @@ dict_create_index_tree_in_mem(
ut_ad(index->is_readable());
ut_ad(!(index->table->flags2 & DICT_TF2_DISCARDED));
+ dberr_t err;
index->page = btr_create(index->type, index->table->space,
- index->id, index, &mtr);
+ index->id, index, &mtr, &err);
mtr_commit(&mtr);
index->trx_id = trx->id;
- return index->page == FIL_NULL ? DB_OUT_OF_FILE_SPACE : DB_SUCCESS;
+ return err;
}
/** Drop the index tree associated with a row in SYS_INDEXES table.
@param[in,out] pcur persistent cursor on rec
@param[in,out] trx dictionary transaction
-@param[in,out] mtr mini-transaction */
-void dict_drop_index_tree(btr_pcur_t* pcur, trx_t* trx, mtr_t* mtr)
+@param[in,out] mtr mini-transaction
+@return tablespace ID to drop (if this is the clustered index)
+@retval 0 if no tablespace is to be dropped */
+uint32_t dict_drop_index_tree(btr_pcur_t *pcur, trx_t *trx, mtr_t *mtr)
{
- rec_t* rec = btr_pcur_get_rec(pcur);
- byte* ptr;
- ulint len;
-
- ut_ad(mutex_own(&dict_sys.mutex));
- ut_a(!dict_table_is_comp(dict_sys.sys_indexes));
-
- ptr = rec_get_nth_field_old(rec, DICT_FLD__SYS_INDEXES__PAGE_NO, &len);
-
- ut_ad(len == 4);
-
- btr_pcur_store_position(pcur, mtr);
-
- const uint32_t root_page_no = mach_read_from_4(ptr);
-
- if (root_page_no == FIL_NULL) {
- /* The tree has already been freed */
- return;
- }
-
- compile_time_assert(FIL_NULL == 0xffffffff);
- mtr->memset(btr_pcur_get_block(pcur), page_offset(ptr), 4, 0xff);
-
- ptr = rec_get_nth_field_old(
- rec, DICT_FLD__SYS_INDEXES__SPACE, &len);
-
- ut_ad(len == 4);
-
- const uint32_t space_id = mach_read_from_4(ptr);
- ut_ad(space_id < SRV_TMP_SPACE_ID);
- if (space_id != TRX_SYS_SPACE
- && trx_get_dict_operation(trx) == TRX_DICT_OP_TABLE) {
- /* We are about to delete the entire .ibd file;
- do not bother to free pages inside it. */
- return;
- }
-
- ptr = rec_get_nth_field_old(
- rec, DICT_FLD__SYS_INDEXES__ID, &len);
-
- ut_ad(len == 8);
-
- if (fil_space_t* s = fil_space_t::get(space_id)) {
- /* Ensure that the tablespace file exists
- in order to avoid a crash in buf_page_get_gen(). */
- if (root_page_no < s->get_size()) {
- btr_free_if_exists(page_id_t(space_id, root_page_no),
- s->zip_size(),
- mach_read_from_8(ptr), mtr);
- }
- s->release();
- }
+ rec_t *rec= btr_pcur_get_rec(pcur);
+
+ ut_ad(!trx || dict_sys.locked());
+ ut_ad(!dict_table_is_comp(dict_sys.sys_indexes));
+ btr_pcur_store_position(pcur, mtr);
+
+ static_assert(DICT_FLD__SYS_INDEXES__TABLE_ID == 0, "compatibility");
+ static_assert(DICT_FLD__SYS_INDEXES__ID == 1, "compatibility");
+
+ ulint len= rec_get_n_fields_old(rec);
+ if (len < DICT_FLD__SYS_INDEXES__MERGE_THRESHOLD ||
+ len > DICT_NUM_FIELDS__SYS_INDEXES)
+ {
+rec_corrupted:
+ sql_print_error("InnoDB: Corrupted SYS_INDEXES record");
+ return 0;
+ }
+
+ if (rec_get_1byte_offs_flag(rec))
+ {
+ if (rec_1_get_field_end_info(rec, 0) != 8 ||
+ rec_1_get_field_end_info(rec, 1) != 8 + 8)
+ goto rec_corrupted;
+ }
+ else if (rec_2_get_field_end_info(rec, 0) != 8 ||
+ rec_2_get_field_end_info(rec, 1) != 8 + 8)
+ goto rec_corrupted;
+
+ const byte *p= rec_get_nth_field_old(rec, DICT_FLD__SYS_INDEXES__TYPE, &len);
+ if (len != 4)
+ goto rec_corrupted;
+ const uint32_t type= mach_read_from_4(p);
+ p= rec_get_nth_field_old(rec, DICT_FLD__SYS_INDEXES__PAGE_NO, &len);
+ if (len != 4)
+ goto rec_corrupted;
+ const uint32_t root_page_no= mach_read_from_4(p);
+ p= rec_get_nth_field_old(rec, DICT_FLD__SYS_INDEXES__SPACE, &len);
+ if (len != 4)
+ goto rec_corrupted;
+
+ const uint32_t space_id= mach_read_from_4(p);
+ ut_ad(root_page_no == FIL_NULL || space_id <= SRV_SPACE_ID_UPPER_BOUND);
+
+ if (space_id && (type & DICT_CLUSTERED))
+ return space_id;
+
+ if (root_page_no == FIL_NULL)
+ /* The tree has already been freed */;
+ else if (fil_space_t*s= fil_space_t::get(space_id))
+ {
+ /* Ensure that the tablespace file exists
+ in order to avoid a crash in buf_page_get_gen(). */
+ if (root_page_no < s->get_size())
+ {
+ static_assert(FIL_NULL == 0xffffffff, "compatibility");
+ static_assert(DICT_FLD__SYS_INDEXES__PAGE_NO ==
+ DICT_FLD__SYS_INDEXES__SPACE + 1, "compatibility");
+ mtr->memset(btr_pcur_get_block(pcur), page_offset(p + 4), 4, 0xff);
+ btr_free_if_exists(s, root_page_no, mach_read_from_8(rec + 8), mtr);
+ }
+ s->release();
+ }
+
+ return 0;
}
/*********************************************************************//**
@@ -968,9 +918,7 @@ tab_create_graph_create(
/*====================*/
dict_table_t* table, /*!< in: table to create, built as a memory data
structure */
- mem_heap_t* heap, /*!< in: heap where created */
- fil_encryption_t mode, /*!< in: encryption mode */
- uint32_t key_id) /*!< in: encryption key_id */
+ mem_heap_t* heap) /*!< in: heap where created */
{
tab_node_t* node;
@@ -983,8 +931,6 @@ tab_create_graph_create(
node->state = TABLE_BUILD_TABLE_DEF;
node->heap = mem_heap_create(256);
- node->mode = mode;
- node->key_id = key_id;
node->tab_def = ins_node_create(INS_DIRECT, dict_sys.sys_tables,
heap);
@@ -1005,6 +951,8 @@ tab_create_graph_create(
@param[in] index index to create, built as a memory data structure
@param[in] table table name
@param[in,out] heap heap where created
+@param[in] mode encryption mode (for creating a table)
+@param[in] key_id encryption key identifier (for creating a table)
@param[in] add_v new virtual columns added in the same clause with
add index
@return own: index create node */
@@ -1013,6 +961,8 @@ ind_create_graph_create(
dict_index_t* index,
const char* table,
mem_heap_t* heap,
+ fil_encryption_t mode,
+ uint32_t key_id,
const dict_add_v_col_t* add_v)
{
ind_node_t* node;
@@ -1026,6 +976,8 @@ ind_create_graph_create(
node->table_name = table;
+ node->key_id = key_id;
+ node->mode = mode;
node->add_v = add_v;
node->state = INDEX_BUILD_INDEX_DEF;
@@ -1056,7 +1008,7 @@ dict_create_table_step(
trx_t* trx;
ut_ad(thr);
- ut_ad(mutex_own(&dict_sys.mutex));
+ ut_ad(dict_sys.locked());
trx = thr_get_trx(thr);
@@ -1087,7 +1039,6 @@ dict_create_table_step(
}
if (node->state == TABLE_BUILD_COL_DEF) {
-
if (node->col_no + DATA_N_SYS_COLS
< (static_cast<ulint>(node->table->n_def)
+ static_cast<ulint>(node->table->n_v_def))) {
@@ -1168,15 +1119,7 @@ dict_create_table_step(
function_exit:
trx->error_state = err;
- if (err == DB_SUCCESS) {
- /* Ok: do nothing */
-
- } else if (err == DB_LOCK_WAIT) {
-
- return(NULL);
- } else {
- /* SQL error detected */
-
+ if (err != DB_SUCCESS) {
return(NULL);
}
@@ -1185,6 +1128,40 @@ function_exit:
return(thr);
}
+static dberr_t dict_create_index_space(const ind_node_t &node)
+{
+ dict_table_t *table= node.index->table;
+ if (table->space || (table->flags2 & DICT_TF2_DISCARDED))
+ return DB_SUCCESS;
+ ut_ad(table->space_id);
+ ut_ad(table->space_id < SRV_TMP_SPACE_ID);
+ /* Determine the tablespace flags. */
+ const bool has_data_dir= DICT_TF_HAS_DATA_DIR(table->flags);
+ ut_ad(!has_data_dir || table->data_dir_path);
+ char* filepath= fil_make_filepath(has_data_dir
+ ? table->data_dir_path : nullptr,
+ table->name, IBD, has_data_dir);
+ if (!filepath)
+ return DB_OUT_OF_MEMORY;
+
+ /* We create a new single-table tablespace for the table.
+ We initially let it be 4 pages:
+ - page 0 is the fsp header and an extent descriptor page,
+ - page 1 is an ibuf bitmap page,
+ - page 2 is the first inode page,
+ - page 3 will contain the root of the clustered index of
+ the table we create here. */
+ dberr_t err;
+ table->space= fil_ibd_create(table->space_id, table->name, filepath,
+ dict_tf_to_fsp_flags(table->flags),
+ FIL_IBD_FILE_INITIAL_SIZE,
+ node.mode, node.key_id, &err);
+ ut_ad((err != DB_SUCCESS) == !table->space);
+ ut_free(filepath);
+
+ return err;
+}
+
/***********************************************************//**
Creates an index. This is a high-level function used in SQL execution
graphs.
@@ -1199,7 +1176,7 @@ dict_create_index_step(
trx_t* trx;
ut_ad(thr);
- ut_ad(mutex_own(&dict_sys.mutex));
+ ut_ad(dict_sys.locked());
trx = thr_get_trx(thr);
@@ -1229,6 +1206,12 @@ dict_create_index_step(
}
if (node->state == INDEX_BUILD_FIELD_DEF) {
+ err = dict_create_index_space(*node);
+ if (err != DB_SUCCESS) {
+ dict_mem_index_free(node->index);
+ node->index = nullptr;
+ goto function_exit;
+ }
if (node->field_no < (node->index)->n_fields) {
@@ -1245,11 +1228,10 @@ dict_create_index_step(
}
if (node->state == INDEX_ADD_TO_CACHE) {
- ut_ad(node->index->table == node->table);
err = dict_index_add_to_cache(node->index, FIL_NULL,
node->add_v);
- ut_ad((node->index == NULL) == (err != DB_SUCCESS));
+ ut_ad(!node->index == (err != DB_SUCCESS));
if (!node->index) {
goto function_exit;
@@ -1258,7 +1240,7 @@ dict_create_index_step(
ut_ad(!node->index->is_instant());
ut_ad(node->index->n_core_null_bytes
== ((dict_index_is_clust(node->index)
- && node->table->supports_instant())
+ && node->index->table->supports_instant())
? dict_index_t::NO_CORE_NULL_BYTES
: UT_BITS_IN_BYTES(
unsigned(node->index->n_nullable))));
@@ -1275,18 +1257,18 @@ dict_create_index_step(
err = DB_OUT_OF_MEMORY;);
if (err != DB_SUCCESS) {
+ dict_table_t* table = node->index->table;
/* If this is a FTS index, we will need to remove
it from fts->cache->indexes list as well */
- if ((node->index->type & DICT_FTS)
- && node->table->fts) {
+ if (!(node->index->type & DICT_FTS)) {
+ } else if (auto fts = table->fts) {
fts_index_cache_t* index_cache;
- rw_lock_x_lock(
- &node->table->fts->cache->init_lock);
+ mysql_mutex_lock(&fts->cache->init_lock);
index_cache = (fts_index_cache_t*)
fts_find_index_cache(
- node->table->fts->cache,
+ fts->cache,
node->index);
if (index_cache->words) {
@@ -1295,17 +1277,16 @@ dict_create_index_step(
}
ib_vector_remove(
- node->table->fts->cache->indexes,
+ fts->cache->indexes,
*reinterpret_cast<void**>(index_cache));
- rw_lock_x_unlock(
- &node->table->fts->cache->init_lock);
+ mysql_mutex_unlock(&fts->cache->init_lock);
}
#ifdef BTR_CUR_HASH_ADAPT
ut_ad(!node->index->search_info->ref_count);
#endif /* BTR_CUR_HASH_ADAPT */
- dict_index_remove_from_cache(node->table, node->index);
+ dict_index_remove_from_cache(table, node->index);
node->index = NULL;
goto function_exit;
@@ -1322,16 +1303,8 @@ dict_create_index_step(
function_exit:
trx->error_state = err;
- if (err == DB_SUCCESS) {
- /* Ok: do nothing */
-
- } else if (err == DB_LOCK_WAIT) {
-
- return(NULL);
- } else {
- /* SQL error detected */
-
- return(NULL);
+ if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
+ return nullptr;
}
thr->run_node = que_node_get_parent(node);
@@ -1339,293 +1312,193 @@ function_exit:
return(thr);
}
-/****************************************************************//**
-Check whether a system table exists. Additionally, if it exists,
-move it to the non-LRU end of the table LRU list. This is oly used
-for system tables that can be upgraded or added to an older database,
-which include SYS_FOREIGN, SYS_FOREIGN_COLS, SYS_TABLESPACES and
-SYS_DATAFILES.
-@return DB_SUCCESS if the sys table exists, DB_CORRUPTION if it exists
-but is not current, DB_TABLE_NOT_FOUND if it does not exist*/
-static
-dberr_t
-dict_check_if_system_table_exists(
-/*==============================*/
- const char* tablename, /*!< in: name of table */
- ulint num_fields, /*!< in: number of fields */
- ulint num_indexes) /*!< in: number of indexes */
-{
- dict_table_t* sys_table;
- dberr_t error = DB_SUCCESS;
-
- ut_ad(!srv_any_background_activity());
-
- mutex_enter(&dict_sys.mutex);
-
- sys_table = dict_table_get_low(tablename);
-
- if (sys_table == NULL) {
- error = DB_TABLE_NOT_FOUND;
-
- } else if (UT_LIST_GET_LEN(sys_table->indexes) != num_indexes
- || sys_table->n_cols != num_fields) {
- error = DB_CORRUPTION;
-
- } else {
- /* This table has already been created, and it is OK.
- Ensure that it can't be evicted from the table LRU cache. */
-
- dict_table_prevent_eviction(sys_table);
- }
-
- mutex_exit(&dict_sys.mutex);
-
- return(error);
-}
-
-/****************************************************************//**
-Creates the foreign key constraints system tables inside InnoDB
-at server bootstrap or server start if they are not found or are
-not of the right form.
-@return DB_SUCCESS or error code */
-dberr_t
-dict_create_or_check_foreign_constraint_tables(void)
-/*================================================*/
+bool dict_sys_t::load_sys_tables()
{
- trx_t* trx;
- my_bool srv_file_per_table_backup;
- dberr_t err;
- dberr_t sys_foreign_err;
- dberr_t sys_foreign_cols_err;
-
- ut_ad(!srv_any_background_activity());
-
- /* Note: The master thread has not been started at this point. */
-
-
- sys_foreign_err = dict_check_if_system_table_exists(
- "SYS_FOREIGN", DICT_NUM_FIELDS__SYS_FOREIGN + 1, 3);
- sys_foreign_cols_err = dict_check_if_system_table_exists(
- "SYS_FOREIGN_COLS", DICT_NUM_FIELDS__SYS_FOREIGN_COLS + 1, 1);
-
- if (sys_foreign_err == DB_SUCCESS
- && sys_foreign_cols_err == DB_SUCCESS) {
- return(DB_SUCCESS);
- }
-
- if (srv_read_only_mode
- || srv_force_recovery >= SRV_FORCE_NO_TRX_UNDO) {
- return(DB_READ_ONLY);
- }
-
- trx = trx_create();
-
- trx_set_dict_operation(trx, TRX_DICT_OP_TABLE);
-
- trx->op_info = "creating foreign key sys tables";
-
- row_mysql_lock_data_dictionary(trx);
-
- DBUG_EXECUTE_IF(
- "create_and_drop_garbage",
- err = que_eval_sql(
- NULL,
- "PROCEDURE CREATE_GARBAGE_TABLE_PROC () IS\n"
- "BEGIN\n"
- "CREATE TABLE\n"
- "\"test/#sql-ib-garbage\"(ID CHAR);\n"
- "CREATE UNIQUE CLUSTERED INDEX PRIMARY"
- " ON \"test/#sql-ib-garbage\"(ID);\n"
- "END;\n", FALSE, trx);
- ut_ad(err == DB_SUCCESS);
- row_drop_table_for_mysql("test/#sql-ib-garbage", trx,
- SQLCOM_DROP_DB, true););
-
- /* Check which incomplete table definition to drop. */
-
- if (sys_foreign_err == DB_CORRUPTION) {
- row_drop_table_after_create_fail("SYS_FOREIGN", trx);
- }
-
- if (sys_foreign_cols_err == DB_CORRUPTION) {
- row_drop_table_after_create_fail("SYS_FOREIGN_COLS", trx);
- }
-
- ib::info() << "Creating foreign key constraint system tables.";
-
- /* NOTE: in dict_load_foreigns we use the fact that
- there are 2 secondary indexes on SYS_FOREIGN, and they
- are defined just like below */
-
- /* NOTE: when designing InnoDB's foreign key support in 2001, we made
- an error and made the table names and the foreign key id of type
- 'CHAR' (internally, really a VARCHAR). We should have made the type
- VARBINARY, like in other InnoDB system tables, to get a clean
- design. */
-
- srv_file_per_table_backup = srv_file_per_table;
-
- /* We always want SYSTEM tables to be created inside the system
- tablespace. */
-
- srv_file_per_table = 0;
-
- err = que_eval_sql(
- NULL,
- "PROCEDURE CREATE_FOREIGN_SYS_TABLES_PROC () IS\n"
- "BEGIN\n"
- "CREATE TABLE\n"
- "SYS_FOREIGN(ID CHAR, FOR_NAME CHAR,"
- " REF_NAME CHAR, N_COLS INT);\n"
- "CREATE UNIQUE CLUSTERED INDEX ID_IND"
- " ON SYS_FOREIGN (ID);\n"
- "CREATE INDEX FOR_IND"
- " ON SYS_FOREIGN (FOR_NAME);\n"
- "CREATE INDEX REF_IND"
- " ON SYS_FOREIGN (REF_NAME);\n"
- "CREATE TABLE\n"
- "SYS_FOREIGN_COLS(ID CHAR, POS INT,"
- " FOR_COL_NAME CHAR, REF_COL_NAME CHAR);\n"
- "CREATE UNIQUE CLUSTERED INDEX ID_IND"
- " ON SYS_FOREIGN_COLS (ID, POS);\n"
- "END;\n",
- FALSE, trx);
-
- if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
- ib::error() << "Creation of SYS_FOREIGN and SYS_FOREIGN_COLS"
- " failed: " << err << ". Tablespace is"
- " full. Dropping incompletely created tables.";
-
- ut_ad(err == DB_OUT_OF_FILE_SPACE
- || err == DB_TOO_MANY_CONCURRENT_TRXS);
-
- row_drop_table_after_create_fail("SYS_FOREIGN", trx);
- row_drop_table_after_create_fail("SYS_FOREIGN_COLS", trx);
-
- if (err == DB_OUT_OF_FILE_SPACE) {
- err = DB_MUST_GET_MORE_FILE_SPACE;
- }
- }
-
- trx_commit_for_mysql(trx);
-
- row_mysql_unlock_data_dictionary(trx);
-
- trx->free();
-
- srv_file_per_table = srv_file_per_table_backup;
-
- /* Note: The master thread has not been started at this point. */
- /* Confirm and move to the non-LRU part of the table LRU list. */
- sys_foreign_err = dict_check_if_system_table_exists(
- "SYS_FOREIGN", DICT_NUM_FIELDS__SYS_FOREIGN + 1, 3);
- ut_a(sys_foreign_err == DB_SUCCESS);
-
- sys_foreign_cols_err = dict_check_if_system_table_exists(
- "SYS_FOREIGN_COLS", DICT_NUM_FIELDS__SYS_FOREIGN_COLS + 1, 1);
- ut_a(sys_foreign_cols_err == DB_SUCCESS);
-
- return(err);
+ ut_ad(!srv_any_background_activity());
+ bool mismatch= false;
+ lock(SRW_LOCK_CALL);
+ if (!(sys_foreign= load_table(SYS_TABLE[SYS_FOREIGN],
+ DICT_ERR_IGNORE_FK_NOKEY)));
+ else if (UT_LIST_GET_LEN(sys_foreign->indexes) == 3 &&
+ sys_foreign->n_cols == DICT_NUM_COLS__SYS_FOREIGN + DATA_N_SYS_COLS)
+ prevent_eviction(sys_foreign);
+ else
+ {
+ sys_foreign= nullptr;
+ mismatch= true;
+ sql_print_error("InnoDB: Invalid definition of SYS_FOREIGN");
+ }
+ if (!(sys_foreign_cols= load_table(SYS_TABLE[SYS_FOREIGN_COLS],
+ DICT_ERR_IGNORE_FK_NOKEY)));
+ else if (UT_LIST_GET_LEN(sys_foreign_cols->indexes) == 1 &&
+ sys_foreign_cols->n_cols ==
+ DICT_NUM_COLS__SYS_FOREIGN_COLS + DATA_N_SYS_COLS)
+ prevent_eviction(sys_foreign_cols);
+ else
+ {
+ sys_foreign_cols= nullptr;
+ mismatch= true;
+ sql_print_error("InnoDB: Invalid definition of SYS_FOREIGN_COLS");
+ }
+ if (!(sys_virtual= load_table(SYS_TABLE[SYS_VIRTUAL],
+ DICT_ERR_IGNORE_FK_NOKEY)));
+ else if (UT_LIST_GET_LEN(sys_virtual->indexes) == 1 &&
+ sys_virtual->n_cols == DICT_NUM_COLS__SYS_VIRTUAL + DATA_N_SYS_COLS)
+ prevent_eviction(sys_virtual);
+ else
+ {
+ sys_virtual= nullptr;
+ mismatch= true;
+ sql_print_error("InnoDB: Invalid definition of SYS_VIRTUAL");
+ }
+ unlock();
+ return mismatch;
}
-/** Creates the virtual column system table (SYS_VIRTUAL) inside InnoDB
-at server bootstrap or server start if the table is not found or is
-not of the right form.
-@return DB_SUCCESS or error code */
-dberr_t
-dict_create_or_check_sys_virtual()
+dberr_t dict_sys_t::create_or_check_sys_tables()
{
- trx_t* trx;
- my_bool srv_file_per_table_backup;
- dberr_t err;
-
- ut_ad(!srv_any_background_activity());
-
- /* Note: The master thread has not been started at this point. */
- err = dict_check_if_system_table_exists(
- "SYS_VIRTUAL", DICT_NUM_FIELDS__SYS_VIRTUAL + 1, 1);
-
- if (err == DB_SUCCESS) {
- mutex_enter(&dict_sys.mutex);
- dict_sys.sys_virtual = dict_table_get_low("SYS_VIRTUAL");
- mutex_exit(&dict_sys.mutex);
- return(DB_SUCCESS);
- }
-
- if (srv_read_only_mode
- || srv_force_recovery >= SRV_FORCE_NO_TRX_UNDO) {
- return(DB_READ_ONLY);
- }
-
- trx = trx_create();
-
- trx_set_dict_operation(trx, TRX_DICT_OP_TABLE);
-
- trx->op_info = "creating sys_virtual tables";
-
- row_mysql_lock_data_dictionary(trx);
-
- /* Check which incomplete table definition to drop. */
-
- if (err == DB_CORRUPTION) {
- row_drop_table_after_create_fail("SYS_VIRTUAL", trx);
- }
-
- ib::info() << "Creating sys_virtual system tables.";
-
- srv_file_per_table_backup = srv_file_per_table;
-
- /* We always want SYSTEM tables to be created inside the system
- tablespace. */
-
- srv_file_per_table = 0;
-
- err = que_eval_sql(
- NULL,
- "PROCEDURE CREATE_SYS_VIRTUAL_TABLES_PROC () IS\n"
- "BEGIN\n"
- "CREATE TABLE\n"
- "SYS_VIRTUAL(TABLE_ID BIGINT, POS INT,"
- " BASE_POS INT);\n"
- "CREATE UNIQUE CLUSTERED INDEX BASE_IDX"
- " ON SYS_VIRTUAL(TABLE_ID, POS, BASE_POS);\n"
- "END;\n",
- FALSE, trx);
-
- if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
- ib::error() << "Creation of SYS_VIRTUAL"
- " failed: " << err << ". Tablespace is"
- " full or too many transactions."
- " Dropping incompletely created tables.";
-
- ut_ad(err == DB_OUT_OF_FILE_SPACE
- || err == DB_TOO_MANY_CONCURRENT_TRXS);
-
- row_drop_table_after_create_fail("SYS_VIRTUAL", trx);
-
- if (err == DB_OUT_OF_FILE_SPACE) {
- err = DB_MUST_GET_MORE_FILE_SPACE;
- }
- }
-
- trx_commit_for_mysql(trx);
-
- row_mysql_unlock_data_dictionary(trx);
-
- trx->free();
-
- srv_file_per_table = srv_file_per_table_backup;
-
- /* Note: The master thread has not been started at this point. */
- /* Confirm and move to the non-LRU part of the table LRU list. */
- dberr_t sys_virtual_err = dict_check_if_system_table_exists(
- "SYS_VIRTUAL", DICT_NUM_FIELDS__SYS_VIRTUAL + 1, 1);
- ut_a(sys_virtual_err == DB_SUCCESS);
- mutex_enter(&dict_sys.mutex);
- dict_sys.sys_virtual = dict_table_get_low("SYS_VIRTUAL");
- mutex_exit(&dict_sys.mutex);
-
- return(err);
+ if (sys_tables_exist())
+ return DB_SUCCESS;
+
+ if (srv_read_only_mode || srv_force_recovery >= SRV_FORCE_NO_TRX_UNDO)
+ return DB_READ_ONLY;
+
+ if (load_sys_tables())
+ {
+ sql_print_information("InnoDB: Set innodb_read_only=1 "
+ "or innodb_force_recovery=3 to start up");
+ return DB_CORRUPTION;
+ }
+
+ if (sys_tables_exist())
+ return DB_SUCCESS;
+
+ trx_t *trx= trx_create();
+ trx_start_for_ddl(trx);
+
+ {
+ /* Do not bother with transactional memory; this is only
+ executed at startup, with no conflicts present. */
+ LockMutexGuard g{SRW_LOCK_CALL};
+ trx->mutex_lock();
+ lock_table_create(dict_sys.sys_tables, LOCK_X, trx);
+ lock_table_create(dict_sys.sys_columns, LOCK_X, trx);
+ lock_table_create(dict_sys.sys_indexes, LOCK_X, trx);
+ lock_table_create(dict_sys.sys_fields, LOCK_X, trx);
+ trx->mutex_unlock();
+ }
+
+ row_mysql_lock_data_dictionary(trx);
+
+ /* NOTE: when designing InnoDB's foreign key support in 2001, Heikki Tuuri
+ made a mistake and defined table names and the foreign key id to be of type
+ CHAR (internally, really VARCHAR). The type should have been VARBINARY. */
+
+ /* System tables are always created inside the system tablespace. */
+ const auto srv_file_per_table_backup= srv_file_per_table;
+ srv_file_per_table= 0;
+ dberr_t error;
+ span<const char> tablename;
+
+ if (!sys_foreign)
+ {
+ error= que_eval_sql(nullptr, "PROCEDURE CREATE_FOREIGN() IS\n"
+ "BEGIN\n"
+ "CREATE TABLE\n"
+ "SYS_FOREIGN(ID CHAR, FOR_NAME CHAR,"
+ " REF_NAME CHAR, N_COLS INT);\n"
+ "CREATE UNIQUE CLUSTERED INDEX ID_IND"
+ " ON SYS_FOREIGN (ID);\n"
+ "CREATE INDEX FOR_IND"
+ " ON SYS_FOREIGN (FOR_NAME);\n"
+ "CREATE INDEX REF_IND"
+ " ON SYS_FOREIGN (REF_NAME);\n"
+ "END;\n", trx);
+ if (UNIV_UNLIKELY(error != DB_SUCCESS))
+ {
+ tablename= SYS_TABLE[SYS_FOREIGN];
+err_exit:
+ sql_print_error("InnoDB: Creation of %.*s failed: %s",
+ int(tablename.size()), tablename.data(),
+ ut_strerr(error));
+ trx->rollback();
+ row_mysql_unlock_data_dictionary(trx);
+ trx->free();
+ srv_file_per_table= srv_file_per_table_backup;
+ return error;
+ }
+ }
+ if (!sys_foreign_cols)
+ {
+ error= que_eval_sql(nullptr, "PROCEDURE CREATE_FOREIGN_COLS() IS\n"
+ "BEGIN\n"
+ "CREATE TABLE\n"
+ "SYS_FOREIGN_COLS(ID CHAR, POS INT,"
+ " FOR_COL_NAME CHAR, REF_COL_NAME CHAR);\n"
+ "CREATE UNIQUE CLUSTERED INDEX ID_IND"
+ " ON SYS_FOREIGN_COLS (ID, POS);\n"
+ "END;\n", trx);
+ if (UNIV_UNLIKELY(error != DB_SUCCESS))
+ {
+ tablename= SYS_TABLE[SYS_FOREIGN_COLS];
+ goto err_exit;
+ }
+ }
+ if (!sys_virtual)
+ {
+ error= que_eval_sql(nullptr, "PROCEDURE CREATE_VIRTUAL() IS\n"
+ "BEGIN\n"
+ "CREATE TABLE\n"
+ "SYS_VIRTUAL(TABLE_ID BIGINT,POS INT,BASE_POS INT);\n"
+ "CREATE UNIQUE CLUSTERED INDEX BASE_IDX"
+ " ON SYS_VIRTUAL(TABLE_ID, POS, BASE_POS);\n"
+ "END;\n", trx);
+ if (UNIV_UNLIKELY(error != DB_SUCCESS))
+ {
+ tablename= SYS_TABLE[SYS_VIRTUAL];
+ goto err_exit;
+ }
+ }
+
+ trx->commit();
+ row_mysql_unlock_data_dictionary(trx);
+ trx->free();
+ srv_file_per_table= srv_file_per_table_backup;
+
+ lock(SRW_LOCK_CALL);
+ if (sys_foreign);
+ else if (!(sys_foreign= load_table(SYS_TABLE[SYS_FOREIGN])))
+ {
+ tablename= SYS_TABLE[SYS_FOREIGN];
+load_fail:
+ unlock();
+ sql_print_error("InnoDB: Failed to CREATE TABLE %.*s",
+ int(tablename.size()), tablename.data());
+ return DB_TABLE_NOT_FOUND;
+ }
+ else
+ prevent_eviction(sys_foreign);
+
+ if (sys_foreign_cols);
+ else if (!(sys_foreign_cols= load_table(SYS_TABLE[SYS_FOREIGN_COLS])))
+ {
+ tablename= SYS_TABLE[SYS_FOREIGN_COLS];
+ goto load_fail;
+ }
+ else
+ prevent_eviction(sys_foreign_cols);
+
+ if (sys_virtual);
+ else if (!(sys_virtual= load_table(SYS_TABLE[SYS_VIRTUAL])))
+ {
+ tablename= SYS_TABLE[SYS_VIRTUAL];
+ goto load_fail;
+ }
+ else
+ prevent_eviction(sys_virtual);
+
+ unlock();
+ return DB_SUCCESS;
}
/****************************************************************//**
@@ -1641,13 +1514,15 @@ dict_foreign_eval_sql(
const char* id, /*!< in: foreign key id */
trx_t* trx) /*!< in/out: transaction */
{
- dberr_t error;
FILE* ef = dict_foreign_err_file;
- error = que_eval_sql(info, sql, FALSE, trx);
+ dberr_t error = que_eval_sql(info, sql, trx);
- if (error == DB_DUPLICATE_KEY) {
- mutex_enter(&dict_foreign_err_mutex);
+ switch (error) {
+ case DB_SUCCESS:
+ break;
+ case DB_DUPLICATE_KEY:
+ mysql_mutex_lock(&dict_foreign_err_mutex);
rewind(ef);
ut_print_timestamp(ef);
fputs(" Error in foreign key constraint creation for table ",
@@ -1660,36 +1535,31 @@ dict_foreign_eval_sql(
"in front of the user-defined constraint name.)\n"
"Note that InnoDB's FOREIGN KEY system tables store\n"
"constraint names as case-insensitive, with the\n"
- "MySQL standard latin1_swedish_ci collation. If you\n"
+ "MariaDB standard latin1_swedish_ci collation. If you\n"
"create tables or databases whose names differ only in\n"
"the character case, then collisions in constraint\n"
"names can occur. Workaround: name your constraints\n"
"explicitly with unique names.\n",
ef);
+ goto release;
+ default:
+ sql_print_error("InnoDB: "
+ "Foreign key constraint creation failed: %s",
+ ut_strerr(error));
- mutex_exit(&dict_foreign_err_mutex);
-
- return(error);
- }
-
- if (UNIV_UNLIKELY(error != DB_SUCCESS)) {
- ib::error() << "Foreign key constraint creation failed: "
- << error;
-
- mutex_enter(&dict_foreign_err_mutex);
+ mysql_mutex_lock(&dict_foreign_err_mutex);
ut_print_timestamp(ef);
fputs(" Internal error in foreign key constraint creation"
" for table ", ef);
ut_print_name(ef, trx, name);
fputs(".\n"
- "See the MySQL .err log in the datadir"
+ "See the MariaDB .err log in the datadir"
" for more information.\n", ef);
- mutex_exit(&dict_foreign_err_mutex);
-
- return(error);
+release:
+ mysql_mutex_unlock(&dict_foreign_err_mutex);
}
- return(DB_SUCCESS);
+ return error;
}
/********************************************************************//**
@@ -1732,7 +1602,7 @@ dict_create_add_foreign_field_to_dictionary(
/********************************************************************//**
Construct foreign key constraint defintion from data dictionary information.
*/
-UNIV_INTERN
+static
char*
dict_foreign_def_get(
/*=================*/
@@ -2017,221 +1887,19 @@ dict_create_add_foreigns_to_dictionary(
const dict_table_t* table,
trx_t* trx)
{
- dict_foreign_t* foreign;
- dberr_t error;
-
- ut_ad(mutex_own(&dict_sys.mutex));
-
- if (NULL == dict_table_get_low("SYS_FOREIGN")) {
+ ut_ad(dict_sys.locked());
- ib::error() << "Table SYS_FOREIGN not found"
- " in internal data dictionary";
-
- return(DB_ERROR);
- }
-
- error = DB_SUCCESS;
-
- for (dict_foreign_set::const_iterator it = local_fk_set.begin();
- it != local_fk_set.end();
- ++it) {
-
- foreign = *it;
- ut_ad(foreign->id != NULL);
-
- error = dict_create_add_foreign_to_dictionary(
- table->name.m_name, foreign, trx);
-
- if (error != DB_SUCCESS) {
- break;
- }
- }
-
- return error;
-}
-
-/****************************************************************//**
-Creates the tablespaces and datafiles system tables inside InnoDB
-at server bootstrap or server start if they are not found or are
-not of the right form.
-@return DB_SUCCESS or error code */
-dberr_t
-dict_create_or_check_sys_tablespace(void)
-/*=====================================*/
-{
- trx_t* trx;
- my_bool srv_file_per_table_backup;
- dberr_t err;
- dberr_t sys_tablespaces_err;
- dberr_t sys_datafiles_err;
-
- ut_ad(!srv_any_background_activity());
-
- /* Note: The master thread has not been started at this point. */
-
- sys_tablespaces_err = dict_check_if_system_table_exists(
- "SYS_TABLESPACES", DICT_NUM_FIELDS__SYS_TABLESPACES + 1, 1);
- sys_datafiles_err = dict_check_if_system_table_exists(
- "SYS_DATAFILES", DICT_NUM_FIELDS__SYS_DATAFILES + 1, 1);
-
- if (sys_tablespaces_err == DB_SUCCESS
- && sys_datafiles_err == DB_SUCCESS) {
- srv_sys_tablespaces_open = true;
- return(DB_SUCCESS);
- }
-
- if (srv_read_only_mode
- || srv_force_recovery >= SRV_FORCE_NO_TRX_UNDO) {
- return(DB_READ_ONLY);
- }
-
- trx = trx_create();
-
- trx_set_dict_operation(trx, TRX_DICT_OP_TABLE);
-
- trx->op_info = "creating tablepace and datafile sys tables";
-
- row_mysql_lock_data_dictionary(trx);
-
- /* Check which incomplete table definition to drop. */
-
- if (sys_tablespaces_err == DB_CORRUPTION) {
- row_drop_table_after_create_fail("SYS_TABLESPACES", trx);
- }
-
- if (sys_datafiles_err == DB_CORRUPTION) {
- row_drop_table_after_create_fail("SYS_DATAFILES", trx);
- }
-
- ib::info() << "Creating tablespace and datafile system tables.";
-
- /* We always want SYSTEM tables to be created inside the system
- tablespace. */
- srv_file_per_table_backup = srv_file_per_table;
- srv_file_per_table = 0;
-
- err = que_eval_sql(
- NULL,
- "PROCEDURE CREATE_SYS_TABLESPACE_PROC () IS\n"
- "BEGIN\n"
- "CREATE TABLE SYS_TABLESPACES(\n"
- " SPACE INT, NAME CHAR, FLAGS INT);\n"
- "CREATE UNIQUE CLUSTERED INDEX SYS_TABLESPACES_SPACE"
- " ON SYS_TABLESPACES (SPACE);\n"
- "CREATE TABLE SYS_DATAFILES(\n"
- " SPACE INT, PATH CHAR);\n"
- "CREATE UNIQUE CLUSTERED INDEX SYS_DATAFILES_SPACE"
- " ON SYS_DATAFILES (SPACE);\n"
- "END;\n",
- FALSE, trx);
-
- if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
- ib::error() << "Creation of SYS_TABLESPACES and SYS_DATAFILES"
- " has failed with error " << err
- << ". Dropping incompletely created tables.";
-
- ut_a(err == DB_OUT_OF_FILE_SPACE
- || err == DB_DUPLICATE_KEY
- || err == DB_TOO_MANY_CONCURRENT_TRXS);
-
- row_drop_table_after_create_fail("SYS_TABLESPACES", trx);
- row_drop_table_after_create_fail("SYS_DATAFILES", trx);
-
- if (err == DB_OUT_OF_FILE_SPACE) {
- err = DB_MUST_GET_MORE_FILE_SPACE;
- }
- }
-
- trx_commit_for_mysql(trx);
-
- row_mysql_unlock_data_dictionary(trx);
-
- trx->free();
-
- srv_file_per_table = srv_file_per_table_backup;
-
- if (err == DB_SUCCESS) {
- srv_sys_tablespaces_open = true;
- }
-
- /* Note: The master thread has not been started at this point. */
- /* Confirm and move to the non-LRU part of the table LRU list. */
-
- sys_tablespaces_err = dict_check_if_system_table_exists(
- "SYS_TABLESPACES", DICT_NUM_FIELDS__SYS_TABLESPACES + 1, 1);
- ut_a(sys_tablespaces_err == DB_SUCCESS || err != DB_SUCCESS);
-
- sys_datafiles_err = dict_check_if_system_table_exists(
- "SYS_DATAFILES", DICT_NUM_FIELDS__SYS_DATAFILES + 1, 1);
- ut_a(sys_datafiles_err == DB_SUCCESS || err != DB_SUCCESS);
-
- return(err);
-}
-
-/** Put a tablespace definition into the data dictionary,
-replacing what was there previously.
-@param[in] space Tablespace id
-@param[in] name Tablespace name
-@param[in] flags Tablespace flags
-@param[in] path Tablespace path
-@param[in] trx Transaction
-@return error code or DB_SUCCESS */
-dberr_t
-dict_replace_tablespace_in_dictionary(
- ulint space_id,
- const char* name,
- ulint flags,
- const char* path,
- trx_t* trx)
-{
- if (!srv_sys_tablespaces_open) {
- /* Startup procedure is not yet ready for updates. */
- return(DB_SUCCESS);
- }
-
- dberr_t error;
-
- pars_info_t* info = pars_info_create();
-
- pars_info_add_int4_literal(info, "space", space_id);
-
- pars_info_add_str_literal(info, "name", name);
-
- pars_info_add_int4_literal(info, "flags", flags);
-
- pars_info_add_str_literal(info, "path", path);
-
- error = que_eval_sql(info,
- "PROCEDURE P () IS\n"
- "p CHAR;\n"
-
- "DECLARE CURSOR c IS\n"
- " SELECT PATH FROM SYS_DATAFILES\n"
- " WHERE SPACE=:space FOR UPDATE;\n"
-
- "BEGIN\n"
- "OPEN c;\n"
- "FETCH c INTO p;\n"
-
- "IF (SQL % NOTFOUND) THEN"
- " DELETE FROM SYS_TABLESPACES "
- "WHERE SPACE=:space;\n"
- " INSERT INTO SYS_TABLESPACES VALUES"
- "(:space, :name, :flags);\n"
- " INSERT INTO SYS_DATAFILES VALUES"
- "(:space, :path);\n"
- "ELSIF p <> :path THEN\n"
- " UPDATE SYS_DATAFILES SET PATH=:path"
- " WHERE CURRENT OF c;\n"
- "END IF;\n"
- "END;\n",
- FALSE, trx);
-
- if (error != DB_SUCCESS) {
- return(error);
- }
+ if (!dict_sys.sys_foreign)
+ {
+ sql_print_error("InnoDB: Table SYS_FOREIGN not found"
+ " in internal data dictionary");
+ return DB_ERROR;
+ }
- trx->op_info = "";
+ for (auto fk : local_fk_set)
+ if (dberr_t error=
+ dict_create_add_foreign_to_dictionary(table->name.m_name, fk, trx))
+ return error;
- return(error);
+ return DB_SUCCESS;
}
diff --git a/storage/innobase/dict/dict0defrag_bg.cc b/storage/innobase/dict/dict0defrag_bg.cc
index b4542f79703..bec6da8e6af 100644
--- a/storage/innobase/dict/dict0defrag_bg.cc
+++ b/storage/innobase/dict/dict0defrag_bg.cc
@@ -29,12 +29,11 @@ Created 25/08/2016 Jan Lindström
#include "dict0defrag_bg.h"
#include "btr0btr.h"
#include "srv0start.h"
+#include "trx0trx.h"
+#include "lock0lock.h"
+#include "row0mysql.h"
-static ib_mutex_t defrag_pool_mutex;
-
-#ifdef MYSQL_PFS
-static mysql_pfs_key_t defrag_pool_mutex_key;
-#endif
+static mysql_mutex_t defrag_pool_mutex;
/** Iterator type for iterating over the elements of objects of type
defrag_pool_t. */
@@ -52,9 +51,7 @@ dict_defrag_pool_init(void)
/*=======================*/
{
ut_ad(!srv_read_only_mode);
-
- /* We choose SYNC_STATS_DEFRAG to be below SYNC_FSP_PAGE. */
- mutex_create(LATCH_ID_DEFRAGMENT_MUTEX, &defrag_pool_mutex);
+ mysql_mutex_init(0, &defrag_pool_mutex, nullptr);
}
/*****************************************************************//**
@@ -66,7 +63,7 @@ dict_defrag_pool_deinit(void)
{
ut_ad(!srv_read_only_mode);
- mutex_free(&defrag_pool_mutex);
+ mysql_mutex_destroy(&defrag_pool_mutex);
}
/*****************************************************************//**
@@ -84,10 +81,10 @@ dict_stats_defrag_pool_get(
{
ut_ad(!srv_read_only_mode);
- mutex_enter(&defrag_pool_mutex);
+ mysql_mutex_lock(&defrag_pool_mutex);
if (defrag_pool.empty()) {
- mutex_exit(&defrag_pool_mutex);
+ mysql_mutex_unlock(&defrag_pool_mutex);
return(false);
}
@@ -97,7 +94,7 @@ dict_stats_defrag_pool_get(
defrag_pool.pop_back();
- mutex_exit(&defrag_pool_mutex);
+ mysql_mutex_unlock(&defrag_pool_mutex);
return(true);
}
@@ -117,7 +114,7 @@ dict_stats_defrag_pool_add(
ut_ad(!srv_read_only_mode);
- mutex_enter(&defrag_pool_mutex);
+ mysql_mutex_lock(&defrag_pool_mutex);
/* quit if already in the list */
for (defrag_pool_iterator_t iter = defrag_pool.begin();
@@ -125,7 +122,7 @@ dict_stats_defrag_pool_add(
++iter) {
if ((*iter).table_id == index->table->id
&& (*iter).index_id == index->id) {
- mutex_exit(&defrag_pool_mutex);
+ mysql_mutex_unlock(&defrag_pool_mutex);
return;
}
}
@@ -137,7 +134,7 @@ dict_stats_defrag_pool_add(
/* Kick off dict stats optimizer work */
dict_stats_schedule_now();
}
- mutex_exit(&defrag_pool_mutex);
+ mysql_mutex_unlock(&defrag_pool_mutex);
}
/*****************************************************************//**
@@ -151,9 +148,9 @@ dict_stats_defrag_pool_del(
{
ut_a((table && !index) || (!table && index));
ut_ad(!srv_read_only_mode);
- ut_ad(mutex_own(&dict_sys.mutex));
+ ut_ad(dict_sys.frozen());
- mutex_enter(&defrag_pool_mutex);
+ mysql_mutex_lock(&defrag_pool_mutex);
defrag_pool_iterator_t iter = defrag_pool.begin();
while (iter != defrag_pool.end()) {
@@ -170,91 +167,165 @@ dict_stats_defrag_pool_del(
}
}
- mutex_exit(&defrag_pool_mutex);
+ mysql_mutex_unlock(&defrag_pool_mutex);
}
/*****************************************************************//**
Get the first index that has been added for updating persistent defrag
stats and eventually save its stats. */
-static
-void
-dict_stats_process_entry_from_defrag_pool()
+static void dict_stats_process_entry_from_defrag_pool(THD *thd)
{
- table_id_t table_id;
- index_id_t index_id;
-
- ut_ad(!srv_read_only_mode);
-
- /* pop the first index from the auto defrag pool */
- if (!dict_stats_defrag_pool_get(&table_id, &index_id)) {
- /* no index in defrag pool */
- return;
- }
-
- dict_table_t* table;
-
- mutex_enter(&dict_sys.mutex);
-
- /* If the table is no longer cached, we've already lost the in
- memory stats so there's nothing really to write to disk. */
- table = dict_table_open_on_id(table_id, TRUE,
- DICT_TABLE_OP_OPEN_ONLY_IF_CACHED);
-
- dict_index_t* index = table && !table->corrupted
- ? dict_table_find_index_on_id(table, index_id)
- : NULL;
-
- if (!index || index->is_corrupted()) {
- if (table) {
- dict_table_close(table, TRUE, FALSE);
- }
- mutex_exit(&dict_sys.mutex);
- return;
- }
-
- mutex_exit(&dict_sys.mutex);
- dict_stats_save_defrag_stats(index);
- dict_table_close(table, FALSE, FALSE);
+ table_id_t table_id;
+ index_id_t index_id;
+
+ ut_ad(!srv_read_only_mode);
+
+ /* pop the first index from the auto defrag pool */
+ if (!dict_stats_defrag_pool_get(&table_id, &index_id))
+ /* no index in defrag pool */
+ return;
+
+ /* If the table is no longer cached, we've already lost the in
+ memory stats so there's nothing really to write to disk. */
+ MDL_ticket *mdl= nullptr;
+ if (dict_table_t *table=
+ dict_table_open_on_id(table_id, false, DICT_TABLE_OP_OPEN_ONLY_IF_CACHED,
+ thd, &mdl))
+ {
+ if (dict_index_t *index= !table->corrupted
+ ? dict_table_find_index_on_id(table, index_id) : nullptr)
+ if (index->is_btree())
+ dict_stats_save_defrag_stats(index);
+ dict_table_close(table, false, thd, mdl);
+ }
}
-/*****************************************************************//**
+/**
Get the first index that has been added for updating persistent defrag
stats and eventually save its stats. */
-void
-dict_defrag_process_entries_from_defrag_pool()
-/*==========================================*/
+void dict_defrag_process_entries_from_defrag_pool(THD *thd)
{
- while (defrag_pool.size()) {
- dict_stats_process_entry_from_defrag_pool();
- }
+ while (!defrag_pool.empty())
+ dict_stats_process_entry_from_defrag_pool(thd);
}
/*********************************************************************//**
Save defragmentation result.
@return DB_SUCCESS or error code */
-dberr_t
-dict_stats_save_defrag_summary(
-/*============================*/
- dict_index_t* index) /*!< in: index */
+dberr_t dict_stats_save_defrag_summary(dict_index_t *index, THD *thd)
+{
+ if (index->is_ibuf())
+ return DB_SUCCESS;
+
+ MDL_ticket *mdl_table= nullptr, *mdl_index= nullptr;
+ dict_table_t *table_stats= dict_table_open_on_name(TABLE_STATS_NAME, false,
+ DICT_ERR_IGNORE_NONE);
+ if (table_stats)
+ {
+ dict_sys.freeze(SRW_LOCK_CALL);
+ table_stats= dict_acquire_mdl_shared<false>(table_stats, thd, &mdl_table);
+ dict_sys.unfreeze();
+ }
+ if (!table_stats || strcmp(table_stats->name.m_name, TABLE_STATS_NAME))
+ {
+release_and_exit:
+ if (table_stats)
+ dict_table_close(table_stats, false, thd, mdl_table);
+ return DB_STATS_DO_NOT_EXIST;
+ }
+
+ dict_table_t *index_stats= dict_table_open_on_name(INDEX_STATS_NAME, false,
+ DICT_ERR_IGNORE_NONE);
+ if (index_stats)
+ {
+ dict_sys.freeze(SRW_LOCK_CALL);
+ index_stats= dict_acquire_mdl_shared<false>(index_stats, thd, &mdl_index);
+ dict_sys.unfreeze();
+ }
+ if (!index_stats)
+ goto release_and_exit;
+ if (strcmp(index_stats->name.m_name, INDEX_STATS_NAME))
+ {
+ dict_table_close(index_stats, false, thd, mdl_index);
+ goto release_and_exit;
+ }
+
+ trx_t *trx= trx_create();
+ trx->mysql_thd= thd;
+ trx_start_internal(trx);
+ dberr_t ret= trx->read_only
+ ? DB_READ_ONLY
+ : lock_table_for_trx(table_stats, trx, LOCK_X);
+ if (ret == DB_SUCCESS)
+ ret= lock_table_for_trx(index_stats, trx, LOCK_X);
+ row_mysql_lock_data_dictionary(trx);
+ if (ret == DB_SUCCESS)
+ ret= dict_stats_save_index_stat(index, time(nullptr), "n_pages_freed",
+ index->stat_defrag_n_pages_freed,
+ nullptr,
+ "Number of pages freed during"
+ " last defragmentation run.",
+ trx);
+ if (ret == DB_SUCCESS)
+ trx->commit();
+ else
+ trx->rollback();
+
+ if (table_stats)
+ dict_table_close(table_stats, true, thd, mdl_table);
+ if (index_stats)
+ dict_table_close(index_stats, true, thd, mdl_index);
+
+ row_mysql_unlock_data_dictionary(trx);
+ trx->free();
+
+ return ret;
+}
+
+/**************************************************************//**
+Gets the number of reserved and used pages in a B-tree.
+@return number of pages reserved, or ULINT_UNDEFINED if the index
+is unavailable */
+static
+ulint
+btr_get_size_and_reserved(
+ dict_index_t* index, /*!< in: index */
+ ulint flag, /*!< in: BTR_N_LEAF_PAGES or BTR_TOTAL_SIZE */
+ ulint* used, /*!< out: number of pages used (<= reserved) */
+ mtr_t* mtr) /*!< in/out: mini-transaction where index
+ is s-latched */
{
- dberr_t ret=DB_SUCCESS;
+ ulint dummy;
+
+ ut_ad(mtr->memo_contains(index->lock, MTR_MEMO_SX_LOCK));
+ ut_a(flag == BTR_N_LEAF_PAGES || flag == BTR_TOTAL_SIZE);
- if (dict_index_is_ibuf(index)) {
- return DB_SUCCESS;
+ if (index->page == FIL_NULL
+ || dict_index_is_online_ddl(index)
+ || !index->is_committed()
+ || !index->table->space) {
+ return(ULINT_UNDEFINED);
}
- dict_sys_lock();
+ dberr_t err;
+ buf_block_t* root = btr_root_block_get(index, RW_SX_LATCH, mtr, &err);
+ *used = 0;
+ if (!root) {
+ return ULINT_UNDEFINED;
+ }
- ret = dict_stats_save_index_stat(index, time(NULL), "n_pages_freed",
- index->stat_defrag_n_pages_freed,
- NULL,
- "Number of pages freed during"
- " last defragmentation run.",
- NULL);
+ mtr->x_lock_space(index->table->space);
- dict_sys_unlock();
+ ulint n = fseg_n_reserved_pages(*root, PAGE_HEADER + PAGE_BTR_SEG_LEAF
+ + root->page.frame, used, mtr);
+ if (flag == BTR_TOTAL_SIZE) {
+ n += fseg_n_reserved_pages(*root,
+ PAGE_HEADER + PAGE_BTR_SEG_TOP
+ + root->page.frame, &dummy, mtr);
+ *used += dummy;
+ }
- return (ret);
+ return(n);
}
/*********************************************************************//**
@@ -265,63 +336,99 @@ dict_stats_save_defrag_stats(
/*============================*/
dict_index_t* index) /*!< in: index */
{
- dberr_t ret;
-
- if (dict_index_is_ibuf(index)) {
- return DB_SUCCESS;
- }
-
- if (!index->is_readable()) {
- return dict_stats_report_error(index->table, true);
- }
-
- const time_t now = time(NULL);
- mtr_t mtr;
- ulint n_leaf_pages;
- ulint n_leaf_reserved;
- mtr.start();
- mtr_sx_lock_index(index, &mtr);
- n_leaf_reserved = btr_get_size_and_reserved(index, BTR_N_LEAF_PAGES,
- &n_leaf_pages, &mtr);
- mtr.commit();
-
- if (n_leaf_reserved == ULINT_UNDEFINED) {
- // The index name is different during fast index creation,
- // so the stats won't be associated with the right index
- // for later use. We just return without saving.
- return DB_SUCCESS;
- }
-
- dict_sys_lock();
- ret = dict_stats_save_index_stat(index, now, "n_page_split",
- index->stat_defrag_n_page_split,
- NULL,
- "Number of new page splits on leaves"
- " since last defragmentation.",
- NULL);
- if (ret != DB_SUCCESS) {
- goto end;
- }
-
- ret = dict_stats_save_index_stat(
- index, now, "n_leaf_pages_defrag",
- n_leaf_pages,
- NULL,
- "Number of leaf pages when this stat is saved to disk",
- NULL);
- if (ret != DB_SUCCESS) {
- goto end;
- }
-
- ret = dict_stats_save_index_stat(
- index, now, "n_leaf_pages_reserved",
- n_leaf_reserved,
- NULL,
- "Number of pages reserved for this index leaves when this stat "
- "is saved to disk",
- NULL);
-
-end:
- dict_sys_unlock();
- return ret;
+ if (index->is_ibuf())
+ return DB_SUCCESS;
+ if (!index->is_readable())
+ return dict_stats_report_error(index->table, true);
+
+ const time_t now= time(nullptr);
+ mtr_t mtr;
+ ulint n_leaf_pages;
+ mtr.start();
+ mtr_sx_lock_index(index, &mtr);
+ ulint n_leaf_reserved= btr_get_size_and_reserved(index, BTR_N_LEAF_PAGES,
+ &n_leaf_pages, &mtr);
+ mtr.commit();
+
+ if (n_leaf_reserved == ULINT_UNDEFINED)
+ return DB_SUCCESS;
+
+ THD *thd= current_thd;
+ MDL_ticket *mdl_table= nullptr, *mdl_index= nullptr;
+ dict_table_t* table_stats= dict_table_open_on_name(TABLE_STATS_NAME, false,
+ DICT_ERR_IGNORE_NONE);
+ if (table_stats)
+ {
+ dict_sys.freeze(SRW_LOCK_CALL);
+ table_stats= dict_acquire_mdl_shared<false>(table_stats, thd, &mdl_table);
+ dict_sys.unfreeze();
+ }
+ if (!table_stats || strcmp(table_stats->name.m_name, TABLE_STATS_NAME))
+ {
+release_and_exit:
+ if (table_stats)
+ dict_table_close(table_stats, false, thd, mdl_table);
+ return DB_STATS_DO_NOT_EXIST;
+ }
+
+ dict_table_t *index_stats= dict_table_open_on_name(INDEX_STATS_NAME, false,
+ DICT_ERR_IGNORE_NONE);
+ if (index_stats)
+ {
+ dict_sys.freeze(SRW_LOCK_CALL);
+ index_stats= dict_acquire_mdl_shared<false>(index_stats, thd, &mdl_index);
+ dict_sys.unfreeze();
+ }
+ if (!index_stats)
+ goto release_and_exit;
+
+ if (strcmp(index_stats->name.m_name, INDEX_STATS_NAME))
+ {
+ dict_table_close(index_stats, false, thd, mdl_index);
+ goto release_and_exit;
+ }
+
+ trx_t *trx= trx_create();
+ trx->mysql_thd= thd;
+ trx_start_internal(trx);
+ dberr_t ret= trx->read_only
+ ? DB_READ_ONLY
+ : lock_table_for_trx(table_stats, trx, LOCK_X);
+ if (ret == DB_SUCCESS)
+ ret= lock_table_for_trx(index_stats, trx, LOCK_X);
+
+ row_mysql_lock_data_dictionary(trx);
+
+ if (ret == DB_SUCCESS)
+ ret= dict_stats_save_index_stat(index, now, "n_page_split",
+ index->stat_defrag_n_page_split, nullptr,
+ "Number of new page splits on leaves"
+ " since last defragmentation.", trx);
+
+ if (ret == DB_SUCCESS)
+ ret= dict_stats_save_index_stat(index, now, "n_leaf_pages_defrag",
+ n_leaf_pages, nullptr,
+ "Number of leaf pages when"
+ " this stat is saved to disk", trx);
+
+ if (ret == DB_SUCCESS)
+ ret= dict_stats_save_index_stat(index, now, "n_leaf_pages_reserved",
+ n_leaf_reserved, nullptr,
+ "Number of pages reserved for"
+ " this index leaves"
+ " when this stat is saved to disk", trx);
+
+ if (ret == DB_SUCCESS)
+ trx->commit();
+ else
+ trx->rollback();
+
+ if (table_stats)
+ dict_table_close(table_stats, true, thd, mdl_table);
+ if (index_stats)
+ dict_table_close(index_stats, true, thd, mdl_index);
+ row_mysql_unlock_data_dictionary(trx);
+ trx->free();
+
+ return ret;
}
diff --git a/storage/innobase/dict/dict0dict.cc b/storage/innobase/dict/dict0dict.cc
index cfc39dd8e32..d1c719d2090 100644
--- a/storage/innobase/dict/dict0dict.cc
+++ b/storage/innobase/dict/dict0dict.cc
@@ -2,7 +2,7 @@
Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved.
Copyright (c) 2012, Facebook Inc.
-Copyright (c) 2013, 2022, MariaDB Corporation.
+Copyright (c) 2013, 2023, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -46,9 +46,9 @@ Created 1/8/1996 Heikki Tuuri
#include "buf0buf.h"
#include "data0type.h"
#include "dict0boot.h"
+#include "dict0load.h"
#include "dict0crea.h"
#include "dict0mem.h"
-#include "dict0priv.h"
#include "dict0stats.h"
#include "fts0fts.h"
#include "fts0types.h"
@@ -67,8 +67,8 @@ Created 1/8/1996 Heikki Tuuri
#include "row0upd.h"
#include "srv0mon.h"
#include "srv0start.h"
-#include "sync0sync.h"
#include "trx0undo.h"
+#include "trx0purge.h"
#include <vector>
#include <algorithm>
@@ -76,6 +76,21 @@ Created 1/8/1996 Heikki Tuuri
/** the dictionary system */
dict_sys_t dict_sys;
+/** System table names; @see dict_system_id_t */
+const span<const char> dict_sys_t::SYS_TABLE[]=
+{
+ {C_STRING_WITH_LEN("SYS_TABLES")},{C_STRING_WITH_LEN("SYS_INDEXES")},
+ {C_STRING_WITH_LEN("SYS_COLUMNS")},{C_STRING_WITH_LEN("SYS_FIELDS")},
+ {C_STRING_WITH_LEN("SYS_FOREIGN")},{C_STRING_WITH_LEN("SYS_FOREIGN_COLS")},
+ {C_STRING_WITH_LEN("SYS_VIRTUAL")}
+};
+
+/** Diagnostic message for exceeding the mutex_lock_wait() timeout */
+const char dict_sys_t::fatal_msg[]=
+ "innodb_fatal_semaphore_wait_threshold was exceeded for dict_sys.latch. "
+ "Please refer to "
+ "https://mariadb.com/kb/en/how-to-produce-a-full-stack-trace-for-mysqld/";
+
/** Percentage of compression failures that are allowed in a single
round */
ulong zip_failure_threshold_pct = 5;
@@ -94,11 +109,6 @@ ulong zip_pad_max = 50;
/** Identifies generated InnoDB foreign key names */
static char dict_ibfk[] = "_ibfk_";
-bool innodb_table_stats_not_found = false;
-bool innodb_index_stats_not_found = false;
-static bool innodb_table_stats_not_found_reported = false;
-static bool innodb_index_stats_not_found_reported = false;
-
/*******************************************************************//**
Tries to find column names for the index and sets the col field of the
index.
@@ -163,7 +173,7 @@ dict_lru_validate(void);
and unique key errors. Only created if !srv_read_only_mode */
FILE* dict_foreign_err_file = NULL;
/* mutex protecting the foreign and unique error buffers */
-ib_mutex_t dict_foreign_err_mutex;
+mysql_mutex_t dict_foreign_err_mutex;
/********************************************************************//**
Checks if the database name in two table names is the same.
@@ -200,102 +210,35 @@ dict_remove_db_name(
return(s + 1);
}
-/** Open a persistent table.
-@param[in] table_id persistent table identifier
-@param[in] ignore_err errors to ignore
-@param[in] cached_only whether to skip loading
-@return persistent table
-@retval NULL if not found */
-static dict_table_t* dict_table_open_on_id_low(
- table_id_t table_id,
- dict_err_ignore_t ignore_err,
- bool cached_only)
-{
- dict_table_t* table = dict_sys.get_table(table_id);
-
- if (!table && !cached_only) {
- table = dict_load_table_on_id(table_id, ignore_err);
- }
-
- return table;
-}
-
-/**********************************************************************//**
-Try to drop any indexes after an aborted index creation.
-This can also be after a server kill during DROP INDEX. */
-static
-void
-dict_table_try_drop_aborted(
-/*========================*/
- dict_table_t* table, /*!< in: table, or NULL if it
- needs to be looked up again */
- table_id_t table_id, /*!< in: table identifier */
- uint32_t ref_count) /*!< in: expected table->n_ref_count */
-{
- trx_t* trx;
-
- trx = trx_create();
- trx->op_info = "try to drop any indexes after an aborted index creation";
- row_mysql_lock_data_dictionary(trx);
- trx_set_dict_operation(trx, TRX_DICT_OP_INDEX);
-
- if (table == NULL) {
- table = dict_table_open_on_id_low(
- table_id, DICT_ERR_IGNORE_FK_NOKEY, FALSE);
- } else {
- ut_ad(table->id == table_id);
- }
-
- if (table && table->get_ref_count() == ref_count && table->drop_aborted
- && !UT_LIST_GET_FIRST(table->locks)) {
- /* Silence a debug assertion in row_merge_drop_indexes(). */
- ut_d(table->acquire());
- row_merge_drop_indexes(trx, table, true);
- ut_d(table->release());
- ut_ad(table->get_ref_count() == ref_count);
- trx_commit_for_mysql(trx);
- }
-
- row_mysql_unlock_data_dictionary(trx);
- trx->free();
-}
-
-/**********************************************************************//**
-When opening a table,
-try to drop any indexes after an aborted index creation.
-Release the dict_sys.mutex. */
-static
-void
-dict_table_try_drop_aborted_and_mutex_exit(
-/*=======================================*/
- dict_table_t* table, /*!< in: table (may be NULL) */
- ibool try_drop) /*!< in: FALSE if should try to
- drop indexes whose online creation
- was aborted */
+/** Decrement the count of open handles */
+void dict_table_close(dict_table_t *table)
{
- if (try_drop
- && table != NULL
- && table->drop_aborted
- && table->get_ref_count() == 1
- && dict_table_get_first_index(table)) {
-
- /* Attempt to drop the indexes whose online creation
- was aborted. */
- table_id_t table_id = table->id;
-
- mutex_exit(&dict_sys.mutex);
-
- dict_table_try_drop_aborted(table, table_id, 1);
- } else {
- mutex_exit(&dict_sys.mutex);
- }
+ if (table->get_ref_count() == 1 &&
+ dict_stats_is_persistent_enabled(table) &&
+ strchr(table->name.m_name, '/'))
+ {
+ /* It looks like we are closing the last handle. The user could
+ have executed FLUSH TABLES in order to have the statistics reloaded
+ from the InnoDB persistent statistics tables. We must acquire
+ exclusive dict_sys.latch to prevent a race condition with another
+ thread concurrently acquiring a handle on the table. */
+ dict_sys.lock(SRW_LOCK_CALL);
+ if (table->release())
+ {
+ table->stats_mutex_lock();
+ if (table->get_ref_count() == 0)
+ dict_stats_deinit(table);
+ table->stats_mutex_unlock();
+ }
+ dict_sys.unlock();
+ }
+ else
+ table->release();
}
/** Decrements the count of open handles of a table.
@param[in,out] table table
-@param[in] dict_locked data dictionary locked
-@param[in] try_drop try to drop any orphan indexes after
- an aborted online index creation
+@param[in] dict_locked whether dict_sys.latch is being held
@param[in] thd thread to release MDL
@param[in] mdl metadata lock or NULL if the thread
is a foreground one. */
@@ -303,92 +246,33 @@ void
dict_table_close(
dict_table_t* table,
bool dict_locked,
- bool try_drop,
THD* thd,
MDL_ticket* mdl)
{
- if (!dict_locked) {
- mutex_enter(&dict_sys.mutex);
- }
-
- ut_ad(mutex_own(&dict_sys.mutex));
- ut_a(table->get_ref_count() > 0);
-
- const bool last_handle = table->release();
-
- /* Force persistent stats re-read upon next open of the table
- so that FLUSH TABLE can be used to forcibly fetch stats from disk
- if they have been manually modified. We reset table->stat_initialized
- only if table reference count is 0 because we do not want too frequent
- stats re-reads (e.g. in other cases than FLUSH TABLE). */
- if (last_handle && strchr(table->name.m_name, '/') != NULL
- && dict_stats_is_persistent_enabled(table)) {
-
- dict_stats_deinit(table);
- }
-
- MONITOR_DEC(MONITOR_TABLE_REFERENCE);
-
- ut_ad(dict_lru_validate());
- ut_ad(dict_sys.find(table));
-
- if (!dict_locked) {
- table_id_t table_id = table->id;
- const bool drop_aborted = last_handle && try_drop
- && table->drop_aborted
- && dict_table_get_first_index(table);
-
- mutex_exit(&dict_sys.mutex);
-
- /* dict_table_try_drop_aborted() can generate undo logs.
- So it should be avoided after shutdown of background
- threads */
- if (drop_aborted && !srv_undo_sources) {
- dict_table_try_drop_aborted(NULL, table_id, 0);
- }
- }
-
- if (!thd || !mdl) {
- } else if (MDL_context *mdl_context= static_cast<MDL_context*>(
- thd_mdl_context(thd))) {
- mdl_context->release_lock(mdl);
- }
-}
-
-/********************************************************************//**
-Closes the only open handle to a table and drops a table while assuring
-that dict_sys.mutex is held the whole time. This assures that the table
-is not evicted after the close when the count of open handles goes to zero.
-Because dict_sys.mutex is held, we do not need to call
-dict_table_prevent_eviction(). */
-void
-dict_table_close_and_drop(
-/*======================*/
- trx_t* trx, /*!< in: data dictionary transaction */
- dict_table_t* table) /*!< in/out: table */
-{
- dberr_t err = DB_SUCCESS;
-
- ut_d(dict_sys.assert_locked());
- ut_ad(trx->dict_operation != TRX_DICT_OP_NONE);
- ut_ad(trx_state_eq(trx, TRX_STATE_ACTIVE));
-
- dict_table_close(table, true, false);
-
-#if defined UNIV_DEBUG || defined UNIV_DDL_DEBUG
- /* Nobody should have initialized the stats of the newly created
- table when this is called. So we know that it has not been added
- for background stats gathering. */
- ut_a(!table->stat_initialized);
-#endif /* UNIV_DEBUG || UNIV_DDL_DEBUG */
+ if (!dict_locked)
+ dict_table_close(table);
+ else
+ {
+ if (table->release() && dict_stats_is_persistent_enabled(table) &&
+ strchr(table->name.m_name, '/'))
+ {
+ /* Force persistent stats re-read upon next open of the table so
+ that FLUSH TABLE can be used to forcibly fetch stats from disk if
+ they have been manually modified. */
+ table->stats_mutex_lock();
+ if (table->get_ref_count() == 0)
+ dict_stats_deinit(table);
+ table->stats_mutex_unlock();
+ }
- err = row_merge_drop_table(trx, table);
+ ut_ad(dict_lru_validate());
+ ut_ad(dict_sys.find(table));
+ }
- if (err != DB_SUCCESS) {
- ib::error() << "At " << __FILE__ << ":" << __LINE__
- << " row_merge_drop_table returned error: " << err
- << " table: " << table->name;
- }
+ if (!thd || !mdl);
+ else if (MDL_context *mdl_context= static_cast<MDL_context*>
+ (thd_mdl_context(thd)))
+ mdl_context->release_lock(mdl);
}
/** Check if the table has a given (non_virtual) column.
@@ -718,13 +602,13 @@ dict_index_get_nth_field_pos(
}
/** Parse the table file name into table name and database name.
-@tparam dict_locked whether dict_sys.mutex is being held
-@param[in,out] db_name database name buffer
-@param[in,out] tbl_name table name buffer
-@param[out] db_name_len database name length
-@param[out] tbl_name_len table name length
+@tparam dict_frozen whether the caller holds dict_sys.latch
+@param[in,out] db_name database name buffer
+@param[in,out] tbl_name table name buffer
+@param[out] db_name_len database name length
+@param[out] tbl_name_len table name length
@return whether the table name is visible to SQL */
-template<bool dict_locked>
+template<bool dict_frozen>
bool dict_table_t::parse_name(char (&db_name)[NAME_LEN + 1],
char (&tbl_name)[NAME_LEN + 1],
size_t *db_name_len, size_t *tbl_name_len) const
@@ -732,31 +616,28 @@ bool dict_table_t::parse_name(char (&db_name)[NAME_LEN + 1],
char db_buf[MAX_DATABASE_NAME_LEN + 1];
char tbl_buf[MAX_TABLE_NAME_LEN + 1];
- if (!dict_locked)
- mutex_enter(&dict_sys.mutex); /* protect against renaming */
- else
- ut_ad(mutex_own(&dict_sys.mutex));
+ if (!dict_frozen)
+ dict_sys.freeze(SRW_LOCK_CALL); /* protect against renaming */
+ ut_ad(dict_sys.frozen());
const size_t db_len= name.dblen();
ut_ad(db_len <= MAX_DATABASE_NAME_LEN);
- memcpy(db_buf, name.m_name, db_len);
+ memcpy(db_buf, mdl_name.m_name, db_len);
db_buf[db_len]= 0;
- size_t tbl_len= strlen(name.m_name + db_len + 1);
-
- const bool is_temp= tbl_len > TEMP_FILE_PREFIX_LENGTH &&
- !strncmp(name.m_name, TEMP_FILE_PREFIX, TEMP_FILE_PREFIX_LENGTH);
+ size_t tbl_len= strlen(mdl_name.m_name + db_len + 1);
+ const bool is_temp= mdl_name.is_temporary();
if (is_temp);
else if (const char *is_part= static_cast<const char*>
- (memchr(name.m_name + db_len + 1, '#', tbl_len)))
- tbl_len= static_cast<size_t>(is_part - &name.m_name[db_len + 1]);
+ (memchr(mdl_name.m_name + db_len + 1, '#', tbl_len)))
+ tbl_len= static_cast<size_t>(is_part - &mdl_name.m_name[db_len + 1]);
- memcpy(tbl_buf, name.m_name + db_len + 1, tbl_len);
+ memcpy(tbl_buf, mdl_name.m_name + db_len + 1, tbl_len);
tbl_buf[tbl_len]= 0;
- if (!dict_locked)
- mutex_exit(&dict_sys.mutex);
+ if (!dict_frozen)
+ dict_sys.unfreeze();
*db_name_len= filename_to_tablename(db_buf, db_name,
MAX_DATABASE_NAME_LEN + 1, true);
@@ -781,7 +662,7 @@ dict_table_t::parse_name<>(char(&)[NAME_LEN + 1], char(&)[NAME_LEN + 1],
@param[in] table_op operation to perform when opening
@return table object after locking MDL shared
@retval nullptr if the table is not readable, or if trylock && MDL blocked */
-template<bool trylock>
+template<bool trylock, bool purge_thd>
dict_table_t*
dict_acquire_mdl_shared(dict_table_t *table,
THD *thd,
@@ -793,16 +674,18 @@ dict_acquire_mdl_shared(dict_table_t *table,
MDL_context *mdl_context= static_cast<MDL_context*>(thd_mdl_context(thd));
size_t db_len;
+ dict_table_t *not_found= nullptr;
if (trylock)
{
- mutex_enter(&dict_sys.mutex);
+ static_assert(!trylock || !purge_thd, "usage");
+ dict_sys.freeze(SRW_LOCK_CALL);
db_len= dict_get_db_name_len(table->name.m_name);
- mutex_exit(&dict_sys.mutex);
+ dict_sys.unfreeze();
}
else
{
- ut_ad(mutex_own(&dict_sys.mutex));
+ ut_ad(dict_sys.frozen_not_locked());
db_len= dict_get_db_name_len(table->name.m_name);
}
@@ -825,7 +708,6 @@ dict_acquire_mdl_shared(dict_table_t *table,
retry:
if (!unaccessible && (!table->is_readable() || table->corrupted))
{
-is_unaccessible:
if (*mdl)
{
mdl_context->release_lock(*mdl);
@@ -841,10 +723,12 @@ is_unaccessible:
return nullptr;
if (!trylock)
- mutex_exit(&dict_sys.mutex);
+ dict_sys.unfreeze();
+
{
MDL_request request;
- MDL_REQUEST_INIT(&request,MDL_key::TABLE, db_buf, tbl_buf, MDL_SHARED, MDL_EXPLICIT);
+ MDL_REQUEST_INIT(&request,MDL_key::TABLE, db_buf, tbl_buf, MDL_SHARED,
+ MDL_EXPLICIT);
if (trylock
? mdl_context->try_acquire_lock(&request)
: mdl_context->acquire_lock(&request,
@@ -858,42 +742,63 @@ is_unaccessible:
return nullptr;
}
else
+ {
*mdl= request.ticket;
+ if (trylock && !*mdl)
+ return nullptr;
+ }
}
- if (!trylock)
- mutex_enter(&dict_sys.mutex);
- else if (!*mdl)
- return nullptr;
-
- table= dict_table_open_on_id(table_id, !trylock, table_op);
+retry_table_open:
+ dict_sys.freeze(SRW_LOCK_CALL);
+ if (purge_thd && purge_sys.must_wait_FTS())
+ {
+ not_found= reinterpret_cast<dict_table_t*>(-1);
+ goto return_without_mdl;
+ }
+ table= dict_sys.find_table(table_id);
+ if (table)
+ table->acquire();
+ if (!table && table_op != DICT_TABLE_OP_OPEN_ONLY_IF_CACHED)
+ {
+ dict_sys.unfreeze();
+ dict_sys.lock(SRW_LOCK_CALL);
+ if (purge_thd && purge_sys.must_wait_FTS())
+ {
+ dict_sys.unlock();
+ goto retry_table_open;
+ }
+ table= dict_load_table_on_id(table_id,
+ table_op == DICT_TABLE_OP_LOAD_TABLESPACE
+ ? DICT_ERR_IGNORE_RECOVER_LOCK
+ : DICT_ERR_IGNORE_FK_NOKEY);
+ if (table)
+ table->acquire();
+ dict_sys.unlock();
+ dict_sys.freeze(SRW_LOCK_CALL);
+ }
- if (!table)
+ if (!table || !table->is_accessible())
{
- /* The table was dropped. */
+ table= nullptr;
+return_without_mdl:
+ if (trylock)
+ dict_sys.unfreeze();
if (*mdl)
{
mdl_context->release_lock(*mdl);
*mdl= nullptr;
}
- return nullptr;
+ return not_found;
}
- if (!table->is_accessible())
- goto is_unaccessible;
-
size_t db1_len, tbl1_len;
- if (!table->parse_name<!trylock>(db_buf1, tbl_buf1, &db1_len, &tbl1_len))
+ if (!table->parse_name<true>(db_buf1, tbl_buf1, &db1_len, &tbl1_len))
{
/* The table was renamed to #sql prefix.
Release MDL (if any) for the old name and return. */
- if (*mdl)
- {
- mdl_context->release_lock(*mdl);
- *mdl= nullptr;
- }
- return table;
+ goto return_without_mdl;
}
if (*mdl)
@@ -901,7 +806,11 @@ is_unaccessible:
if (db_len == db1_len && tbl_len == tbl1_len &&
!memcmp(db_buf, db_buf1, db_len) &&
!memcmp(tbl_buf, tbl_buf1, tbl_len))
+ {
+ if (trylock)
+ dict_sys.unfreeze();
return table;
+ }
/* The table was renamed. Release MDL for the old name and
try to acquire MDL for the new name. */
@@ -917,8 +826,10 @@ is_unaccessible:
goto retry;
}
-template dict_table_t*
-dict_acquire_mdl_shared<true>(dict_table_t*,THD*,MDL_ticket**,dict_table_op_t);
+template dict_table_t* dict_acquire_mdl_shared<false, false>
+(dict_table_t*,THD*,MDL_ticket**,dict_table_op_t);
+template dict_table_t* dict_acquire_mdl_shared<true, false>
+(dict_table_t*,THD*,MDL_ticket**,dict_table_op_t);
/** Look up a table by numeric identifier.
@tparam purge_thd Whether the function is called by purge thread
@@ -928,53 +839,79 @@ dict_acquire_mdl_shared<true>(dict_table_t*,THD*,MDL_ticket**,dict_table_op_t);
@param[in,out] thd background thread, or NULL to not acquire MDL
@param[out] mdl mdl ticket, or NULL
@return table, NULL if does not exist */
-template<bool purge_thd>
+template <bool purge_thd>
dict_table_t*
dict_table_open_on_id(table_id_t table_id, bool dict_locked,
dict_table_op_t table_op, THD *thd,
MDL_ticket **mdl)
{
- ut_ad(!dict_locked || !thd);
-
- if (!dict_locked) {
- mutex_enter(&dict_sys.mutex);
- }
-
- ut_ad(mutex_own(&dict_sys.mutex));
+ if (!dict_locked)
+ dict_sys.freeze(SRW_LOCK_CALL);
- dict_table_t* table = dict_table_open_on_id_low(
- table_id,
- table_op == DICT_TABLE_OP_LOAD_TABLESPACE
- ? DICT_ERR_IGNORE_RECOVER_LOCK
- : DICT_ERR_IGNORE_FK_NOKEY,
- table_op == DICT_TABLE_OP_OPEN_ONLY_IF_CACHED);
+ dict_table_t *table= dict_sys.find_table(table_id);
- if (table != NULL) {
- if (purge_thd && table->name.is_temporary()) {
- mutex_exit(&dict_sys.mutex);
- return nullptr;
- }
- dict_sys.acquire(table);
- MONITOR_INC(MONITOR_TABLE_REFERENCE);
- }
+ if (table)
+ {
+ if (purge_thd && purge_sys.must_wait_FTS())
+ {
+ table= reinterpret_cast<dict_table_t*>(-1);
+ goto func_exit;
+ }
- if (!dict_locked) {
- if (thd) {
- table = dict_acquire_mdl_shared<false>(
- table, thd, mdl, table_op);
- }
+ table->acquire();
+ if (thd && !dict_locked)
+ table= dict_acquire_mdl_shared<false, purge_thd>(
+ table, thd, mdl, table_op);
+ }
+ else if (table_op != DICT_TABLE_OP_OPEN_ONLY_IF_CACHED)
+ {
+ if (!dict_locked)
+ {
+ dict_sys.unfreeze();
+ dict_sys.lock(SRW_LOCK_CALL);
+ }
+ table= dict_load_table_on_id(table_id,
+ table_op == DICT_TABLE_OP_LOAD_TABLESPACE
+ ? DICT_ERR_IGNORE_RECOVER_LOCK
+ : DICT_ERR_IGNORE_FK_NOKEY);
+ if (table)
+ {
+ if (purge_thd && purge_sys.must_wait_FTS())
+ {
+ dict_sys.unlock();
+ return reinterpret_cast<dict_table_t*>(-1);
+ }
+ table->acquire();
+ }
+ if (!dict_locked)
+ {
+ dict_sys.unlock();
+ if (table && thd)
+ {
+ dict_sys.freeze(SRW_LOCK_CALL);
+ table= dict_acquire_mdl_shared<false, purge_thd>(
+ table, thd, mdl, table_op);
+ dict_sys.unfreeze();
+ }
+ return table;
+ }
+ }
- dict_table_try_drop_aborted_and_mutex_exit(
- table, table_op == DICT_TABLE_OP_DROP_ORPHAN);
- }
+func_exit:
+ if (!dict_locked)
+ dict_sys.unfreeze();
- return table;
+ return table;
}
template dict_table_t* dict_table_open_on_id<false>
-(table_id_t, bool, dict_table_op_t, THD *, MDL_ticket **);
+(table_id_t table_id, bool dict_locked,
+ dict_table_op_t table_op, THD *thd,
+ MDL_ticket **mdl);
template dict_table_t* dict_table_open_on_id<true>
-(table_id_t, bool, dict_table_op_t, THD *, MDL_ticket **);
+(table_id_t table_id, bool dict_locked,
+ dict_table_op_t table_op, THD *thd,
+ MDL_ticket **mdl);
/********************************************************************//**
Looks for column n position in the clustered index.
@@ -1035,8 +972,6 @@ void dict_sys_t::create()
UT_LIST_INIT(table_LRU, &dict_table_t::table_LRU);
UT_LIST_INIT(table_non_LRU, &dict_table_t::table_LRU);
- mutex_create(LATCH_ID_DICT_SYS, &mutex);
-
const ulint hash_size = buf_pool_get_curr_size()
/ (DICT_POOL_PER_TABLE_HASH * UNIV_WORD_SIZE);
@@ -1044,7 +979,7 @@ void dict_sys_t::create()
table_id_hash.create(hash_size);
temp_id_hash.create(hash_size);
- rw_lock_create(dict_operation_lock_key, &latch, SYNC_DICT_OPERATION);
+ latch.SRW_LOCK_INIT(dict_operation_lock_key);
if (!srv_read_only_mode)
{
@@ -1052,99 +987,134 @@ void dict_sys_t::create()
ut_a(dict_foreign_err_file);
}
- mutex_create(LATCH_ID_DICT_FOREIGN_ERR, &dict_foreign_err_mutex);
+ mysql_mutex_init(dict_foreign_err_mutex_key, &dict_foreign_err_mutex,
+ nullptr);
}
-/** Acquire a reference to a cached table. */
-inline void dict_sys_t::acquire(dict_table_t* table)
+
+void dict_sys_t::lock_wait(SRW_LOCK_ARGS(const char *file, unsigned line))
{
- ut_ad(dict_sys.find(table));
- if (table->can_be_evicted)
+ ulonglong now= my_hrtime_coarse().val, old= 0;
+ if (latch_ex_wait_start.compare_exchange_strong
+ (old, now, std::memory_order_relaxed, std::memory_order_relaxed))
{
- UT_LIST_REMOVE(dict_sys.table_LRU, table);
- UT_LIST_ADD_FIRST(dict_sys.table_LRU, table);
+ latch.wr_lock(SRW_LOCK_ARGS(file, line));
+ latch_ex_wait_start.store(0, std::memory_order_relaxed);
+ ut_ad(!latch_readers);
+ ut_ad(!latch_ex);
+ ut_d(latch_ex= pthread_self());
+ return;
}
- table->acquire();
+ ut_ad(old);
+ /* We could have old > now due to our use of my_hrtime_coarse(). */
+ ulong waited= old <= now ? static_cast<ulong>((now - old) / 1000000) : 0;
+ const ulong threshold= srv_fatal_semaphore_wait_threshold;
+
+ if (waited >= threshold)
+ ib::fatal() << fatal_msg;
+
+ if (waited > threshold / 4)
+ ib::warn() << "A long wait (" << waited
+ << " seconds) was observed for dict_sys.latch";
+ latch.wr_lock(SRW_LOCK_ARGS(file, line));
+ ut_ad(!latch_readers);
+ ut_ad(!latch_ex);
+ ut_d(latch_ex= pthread_self());
+}
+
+#ifdef UNIV_PFS_RWLOCK
+ATTRIBUTE_NOINLINE void dict_sys_t::unlock()
+{
+ ut_ad(latch_ex == pthread_self());
+ ut_ad(!latch_readers);
+ ut_d(latch_ex= 0);
+ latch.wr_unlock();
}
+ATTRIBUTE_NOINLINE void dict_sys_t::freeze(const char *file, unsigned line)
+{
+ latch.rd_lock(file, line);
+ ut_ad(!latch_ex);
+ ut_d(latch_readers++);
+}
+
+ATTRIBUTE_NOINLINE void dict_sys_t::unfreeze()
+{
+ ut_ad(!latch_ex);
+ ut_ad(latch_readers--);
+ latch.rd_unlock();
+}
+#endif /* UNIV_PFS_RWLOCK */
+
/**********************************************************************//**
-Returns a table object and increment its open handle count.
+Returns a table object and increments its open handle count.
NOTE! This is a high-level function to be used mainly from outside the
-'dict' module. Inside this directory dict_table_get_low
+'dict' directory. Inside this directory dict_table_get_low
is usually the appropriate function.
-@return table, NULL if does not exist */
+@param[in] table_name Table name
+@param[in] dict_locked whether dict_sys.latch is being held exclusively
+@param[in] ignore_err error to be ignored when loading the table
+@return table
+@retval nullptr if the table does not exist */
dict_table_t*
dict_table_open_on_name(
-/*====================*/
- const char* table_name, /*!< in: table name */
- ibool dict_locked, /*!< in: TRUE=data dictionary locked */
- ibool try_drop, /*!< in: TRUE=try to drop any orphan
- indexes after an aborted online
- index creation */
- dict_err_ignore_t
- ignore_err) /*!< in: error to be ignored when
- loading a table definition */
+ const char* table_name,
+ bool dict_locked,
+ dict_err_ignore_t ignore_err)
{
- dict_table_t* table;
- DBUG_ENTER("dict_table_open_on_name");
- DBUG_PRINT("dict_table_open_on_name", ("table: '%s'", table_name));
-
- if (!dict_locked) {
- mutex_enter(&dict_sys.mutex);
- }
-
- ut_ad(table_name);
- ut_ad(mutex_own(&dict_sys.mutex));
-
- table = dict_table_check_if_in_cache_low(table_name);
-
- if (table == NULL) {
- table = dict_load_table(table_name, ignore_err);
- }
+ dict_table_t *table;
+ DBUG_ENTER("dict_table_open_on_name");
+ DBUG_PRINT("dict_table_open_on_name", ("table: '%s'", table_name));
- ut_ad(!table || table->cached);
+ const span<const char> name{table_name, strlen(table_name)};
- if (table != NULL) {
-
- /* If table is encrypted or corrupted */
- if (!(ignore_err & ~DICT_ERR_IGNORE_FK_NOKEY)
- && !table->is_readable()) {
- /* Make life easy for drop table. */
- dict_sys.prevent_eviction(table);
-
- if (table->corrupted) {
-
- ib::error() << "Table " << table->name
- << " is corrupted. Please "
- "drop the table and recreate.";
- if (!dict_locked) {
- mutex_exit(&dict_sys.mutex);
- }
-
- DBUG_RETURN(NULL);
- }
-
- dict_sys.acquire(table);
-
- if (!dict_locked) {
- mutex_exit(&dict_sys.mutex);
- }
+ if (!dict_locked)
+ {
+ dict_sys.freeze(SRW_LOCK_CALL);
+ table= dict_sys.find_table(name);
+ if (table)
+ {
+ ut_ad(table->cached);
+ if (!(ignore_err & ~DICT_ERR_IGNORE_FK_NOKEY) &&
+ !table->is_readable() && table->corrupted)
+ {
+ ib::error() << "Table " << table->name
+ << " is corrupted. Please drop the table and recreate.";
+ dict_sys.unfreeze();
+ DBUG_RETURN(nullptr);
+ }
+ table->acquire();
+ dict_sys.unfreeze();
+ DBUG_RETURN(table);
+ }
+ dict_sys.unfreeze();
+ dict_sys.lock(SRW_LOCK_CALL);
+ }
- DBUG_RETURN(table);
- }
+ table= dict_sys.load_table(name, ignore_err);
- dict_sys.acquire(table);
- MONITOR_INC(MONITOR_TABLE_REFERENCE);
- }
+ if (table)
+ {
+ ut_ad(table->cached);
+ if (!(ignore_err & ~DICT_ERR_IGNORE_FK_NOKEY) &&
+ !table->is_readable() && table->corrupted)
+ {
+ ib::error() << "Table " << table->name
+ << " is corrupted. Please drop the table and recreate.";
+ if (!dict_locked)
+ dict_sys.unlock();
+ DBUG_RETURN(nullptr);
+ }
- ut_ad(dict_lru_validate());
+ table->acquire();
+ }
- if (!dict_locked) {
- dict_table_try_drop_aborted_and_mutex_exit(table, try_drop);
- }
+ ut_ad(dict_lru_validate());
+ if (!dict_locked)
+ dict_sys.unlock();
- DBUG_RETURN(table);
+ DBUG_RETURN(table);
}
/**********************************************************************//**
@@ -1197,9 +1167,11 @@ inline void dict_sys_t::add(dict_table_t* table)
{
ut_ad(!find(table));
- ulint fold = ut_fold_string(table->name.m_name);
+ ulint fold = my_crc32c(0, table->name.m_name,
+ strlen(table->name.m_name));
- new (&table->autoinc_mutex) std::mutex();
+ table->autoinc_mutex.init();
+ table->lock_mutex_init();
/* Look for a table with the same name: error if such exists */
{
@@ -1246,16 +1218,13 @@ inline void dict_sys_t::add(dict_table_t* table)
ut_ad(dict_lru_validate());
}
-/**********************************************************************//**
-Test whether a table can be evicted from the LRU cache.
-@return TRUE if table can be evicted. */
-static
-ibool
-dict_table_can_be_evicted(
-/*======================*/
- dict_table_t* table) /*!< in: table to test */
+/** Test whether a table can be evicted from dict_sys.table_LRU.
+@param table table to be considered for eviction
+@return whether the table can be evicted */
+TRANSACTIONAL_TARGET
+static bool dict_table_can_be_evicted(dict_table_t *table)
{
- ut_d(dict_sys.assert_locked());
+ ut_ad(dict_sys.locked());
ut_a(table->can_be_evicted);
ut_a(table->foreign_set.empty());
ut_a(table->referenced_set.empty());
@@ -1267,25 +1236,26 @@ dict_table_can_be_evicted(
the table instance is in "use". */
if (lock_table_has_locks(table)) {
- return(FALSE);
+ return false;
}
#ifdef BTR_CUR_HASH_ADAPT
/* We cannot really evict the table if adaptive hash
index entries are pointing to any of its indexes. */
- for (dict_index_t* index = dict_table_get_first_index(table);
- index != NULL;
- index = dict_table_get_next_index(index)) {
+ for (const dict_index_t* index
+ = dict_table_get_first_index(table);
+ index; index = dict_table_get_next_index(index)) {
if (index->n_ahi_pages()) {
- return(FALSE);
+ return false;
}
}
#endif /* BTR_CUR_HASH_ADAPT */
- return(TRUE);
+ ut_ad(!table->fts);
+ return true;
}
- return(FALSE);
+ return false;
}
#ifdef BTR_CUR_HASH_ADAPT
@@ -1311,10 +1281,10 @@ dict_index_t *dict_index_t::clone() const
sizeof *stat_n_non_null_key_vals);
mem_heap_t* heap= mem_heap_create(size);
- dict_index_t *index= static_cast<dict_index_t*>(mem_heap_dup(heap, this,
- sizeof *this));
+ dict_index_t *index= static_cast<dict_index_t*>
+ (mem_heap_alloc(heap, sizeof *this));
*index= *this;
- rw_lock_create(index_tree_rw_lock_key, &index->lock, SYNC_INDEX_TREE);
+ index->lock.SRW_LOCK_INIT(index_tree_rw_lock_key);
index->heap= heap;
index->name= mem_heap_strdup(heap, name);
index->fields= static_cast<dict_field_t*>
@@ -1340,7 +1310,7 @@ dict_index_t *dict_index_t::clone_if_needed()
return this;
dict_index_t *prev= UT_LIST_GET_PREV(indexes, this);
- table->autoinc_mutex.lock();
+ table->autoinc_mutex.wr_lock();
UT_LIST_REMOVE(table->indexes, this);
UT_LIST_ADD_LAST(table->freed_indexes, this);
dict_index_t *index= clone();
@@ -1349,69 +1319,54 @@ dict_index_t *dict_index_t::clone_if_needed()
UT_LIST_INSERT_AFTER(table->indexes, prev, index);
else
UT_LIST_ADD_FIRST(table->indexes, index);
- table->autoinc_mutex.unlock();
+ table->autoinc_mutex.wr_unlock();
return index;
}
#endif /* BTR_CUR_HASH_ADAPT */
-/**********************************************************************//**
-Make room in the table cache by evicting an unused table. The unused table
-should not be part of FK relationship and currently not used in any user
-transaction. There is no guarantee that it will remove a table.
-@return number of tables evicted. If the number of tables in the dict_LRU
-is less than max_tables it will not do anything. */
-ulint
-dict_make_room_in_cache(
-/*====================*/
- ulint max_tables, /*!< in: max tables allowed in cache */
- ulint pct_check) /*!< in: max percent to check */
+/** Evict unused, unlocked tables from table_LRU.
+@param half whether to consider half the tables only (instead of all)
+@return number of tables evicted */
+ulint dict_sys_t::evict_table_LRU(bool half)
{
- ulint i;
- ulint len;
- dict_table_t* table;
- ulint check_up_to;
- ulint n_evicted = 0;
+#ifdef MYSQL_DYNAMIC_PLUGIN
+ constexpr ulint max_tables = 400;
+#else
+ extern ulong tdc_size;
+ const ulint max_tables = tdc_size;
+#endif
+ ulint n_evicted = 0;
- ut_a(pct_check > 0);
- ut_a(pct_check <= 100);
- ut_d(dict_sys.assert_locked());
+ lock(SRW_LOCK_CALL);
ut_ad(dict_lru_validate());
- i = len = UT_LIST_GET_LEN(dict_sys.table_LRU);
+ const ulint len = UT_LIST_GET_LEN(table_LRU);
if (len < max_tables) {
- return(0);
+func_exit:
+ unlock();
+ return(n_evicted);
}
- check_up_to = len - ((len * pct_check) / 100);
-
- /* Check for overflow */
- ut_a(i == 0 || check_up_to <= i);
+ const ulint check_up_to = half ? len / 2 : 0;
+ ulint i = len;
/* Find a suitable candidate to evict from the cache. Don't scan the
entire LRU list. Only scan pct_check list entries. */
- for (table = UT_LIST_GET_LAST(dict_sys.table_LRU);
- table != NULL
- && i > check_up_to
- && (len - n_evicted) > max_tables;
- --i) {
-
- dict_table_t* prev_table;
-
- prev_table = UT_LIST_GET_PREV(table_LRU, table);
+ for (dict_table_t *table = UT_LIST_GET_LAST(table_LRU);
+ table && i > check_up_to && (len - n_evicted) > max_tables; --i) {
+ dict_table_t* prev_table = UT_LIST_GET_PREV(table_LRU, table);
if (dict_table_can_be_evicted(table)) {
- ut_ad(!table->fts);
- dict_sys.remove(table, true);
-
+ remove(table, true);
++n_evicted;
}
table = prev_table;
}
- return(n_evicted);
+ goto func_exit;
}
/** Looks for an index with the given id given a table instance.
@@ -1455,6 +1410,98 @@ struct dict_foreign_remove_partial
}
};
+/** This function returns a new path name after replacing the basename
+in an old path with a new basename. The old_path is a full path
+name including the extension. The tablename is in the normal
+form "databasename/tablename". The new base name is found after
+the forward slash. Both input strings are null terminated.
+
+This function allocates memory to be returned. It is the caller's
+responsibility to free the return value after it is no longer needed.
+
+@param[in] old_path Pathname
+@param[in] tablename Contains new base name
+@return own: new full pathname */
+static char *dir_pathname(const char *old_path, span<const char> tablename)
+{
+ /* Split the tablename into its database and table name components.
+ They are separated by a '/'. */
+ const char *base_name= tablename.data();
+ for (const char *last= tablename.end(); last > tablename.data(); last--)
+ {
+ if (last[-1] == '/')
+ {
+ base_name= last;
+ break;
+ }
+ }
+ const size_t base_name_len= tablename.end() - base_name;
+
+ /* Find the offset of the last slash. We will strip off the
+ old basename.ibd which starts after that slash. */
+ const char *last_slash= strrchr(old_path, '/');
+#ifdef _WIN32
+ if (const char *last= strrchr(old_path, '\\'))
+ if (last > last_slash)
+ last_slash= last;
+#endif
+
+ size_t dir_len= last_slash
+ ? size_t(last_slash - old_path)
+ : strlen(old_path);
+
+ /* allocate a new path and move the old directory path to it. */
+ size_t new_path_len= dir_len + base_name_len + sizeof "/.ibd";
+ char *new_path= static_cast<char*>(ut_malloc_nokey(new_path_len));
+ memcpy(new_path, old_path, dir_len);
+ snprintf(new_path + dir_len, new_path_len - dir_len, "/%.*s.ibd",
+ int(base_name_len), base_name);
+ return new_path;
+}
+
+/** Rename the data file.
+@param new_name name of the table
+@param replace whether to replace the file with the new name
+ (as part of rolling back TRUNCATE) */
+dberr_t
+dict_table_t::rename_tablespace(span<const char> new_name, bool replace) const
+{
+ ut_ad(dict_table_is_file_per_table(this));
+ ut_ad(!is_temporary());
+
+ if (!space)
+ return DB_SUCCESS;
+
+ const char *old_path= UT_LIST_GET_FIRST(space->chain)->name;
+ const bool data_dir= DICT_TF_HAS_DATA_DIR(flags);
+ char *path= data_dir
+ ? dir_pathname(old_path, new_name)
+ : fil_make_filepath(nullptr, new_name, IBD, false);
+ dberr_t err;
+ if (!path)
+ err= DB_OUT_OF_MEMORY;
+ else if (!strcmp(path, old_path))
+ err= DB_SUCCESS;
+ else if (data_dir &&
+ DB_SUCCESS != RemoteDatafile::create_link_file(new_name, path))
+ err= DB_TABLESPACE_EXISTS;
+ else
+ {
+ space->x_lock();
+ err= space->rename(path, true, replace);
+ if (data_dir)
+ {
+ if (err == DB_SUCCESS)
+ new_name= {name.m_name, strlen(name.m_name)};
+ RemoteDatafile::delete_link_file(new_name);
+ }
+ space->x_unlock();
+ }
+
+ ut_free(path);
+ return err;
+}
+
/**********************************************************************//**
Renames a table object.
@return TRUE if success */
@@ -1462,145 +1509,72 @@ dberr_t
dict_table_rename_in_cache(
/*=======================*/
dict_table_t* table, /*!< in/out: table */
- const char* new_name, /*!< in: new name */
- bool rename_also_foreigns,
- /*!< in: in ALTER TABLE we want
- to preserve the original table name
- in constraints which reference it */
+ span<const char> new_name, /*!< in: new name */
bool replace_new_file)
/*!< in: whether to replace the
file with the new name
(as part of rolling back TRUNCATE) */
{
- dberr_t err;
dict_foreign_t* foreign;
- ulint fold;
char old_name[MAX_FULL_NAME_LEN + 1];
- os_file_type_t ftype;
- ut_ad(mutex_own(&dict_sys.mutex));
+ ut_ad(dict_sys.locked());
/* store the old/current name to an automatic variable */
- ut_a(strlen(table->name.m_name) < sizeof old_name);
+ const size_t old_name_len = strlen(table->name.m_name);
+ ut_a(old_name_len < sizeof old_name);
strcpy(old_name, table->name.m_name);
- fold = ut_fold_string(new_name);
-
- /* Look for a table with the same name: error if such exists */
- dict_table_t* table2;
- HASH_SEARCH(name_hash, &dict_sys.table_hash, fold,
- dict_table_t*, table2, ut_ad(table2->cached),
- (strcmp(table2->name.m_name, new_name) == 0));
- DBUG_EXECUTE_IF("dict_table_rename_in_cache_failure",
- if (table2 == NULL) {
- table2 = (dict_table_t*) -1;
- } );
- if (table2) {
- ib::error() << "Cannot rename table '" << old_name
- << "' to '" << new_name << "' since the"
- " dictionary cache already contains '" << new_name << "'.";
- return(DB_ERROR);
- }
-
- /* If the table is stored in a single-table tablespace, rename the
- .ibd file and rebuild the .isl file if needed. */
-
- if (!table->space) {
- bool exists;
- char* filepath;
-
- ut_ad(dict_table_is_file_per_table(table));
- ut_ad(!table->is_temporary());
-
- /* Make sure the data_dir_path is set. */
- dict_get_and_save_data_dir_path(table, true);
-
- if (DICT_TF_HAS_DATA_DIR(table->flags)) {
- ut_a(table->data_dir_path);
-
- filepath = fil_make_filepath(
- table->data_dir_path, table->name.m_name,
- IBD, true);
- } else {
- filepath = fil_make_filepath(
- NULL, table->name.m_name, IBD, false);
- }
-
- if (filepath == NULL) {
- return(DB_OUT_OF_MEMORY);
- }
-
- fil_delete_tablespace(table->space_id, !table->space);
-
- /* Delete any temp file hanging around. */
- if (os_file_status(filepath, &exists, &ftype)
- && exists
- && !os_file_delete_if_exists(innodb_temp_file_key,
- filepath, NULL)) {
-
- ib::info() << "Delete of " << filepath << " failed.";
- }
- ut_free(filepath);
-
- } else if (dict_table_is_file_per_table(table)) {
- char* new_path;
- const char* old_path = UT_LIST_GET_FIRST(table->space->chain)
- ->name;
+ const uint32_t fold= my_crc32c(0, new_name.data(), new_name.size());
+ ut_a(!dict_sys.find_table(new_name));
- ut_ad(!table->is_temporary());
-
- if (DICT_TF_HAS_DATA_DIR(table->flags)) {
- new_path = os_file_make_new_pathname(
- old_path, new_name);
- err = RemoteDatafile::create_link_file(
- new_name, new_path);
-
- if (err != DB_SUCCESS) {
- ut_free(new_path);
- return(DB_TABLESPACE_EXISTS);
- }
- } else {
- new_path = fil_make_filepath(
- NULL, new_name, IBD, false);
- }
+ if (!dict_table_is_file_per_table(table)) {
+ } else if (dberr_t err = table->rename_tablespace(new_name,
+ replace_new_file)) {
+ return err;
+ }
- /* New filepath must not exist. */
- err = table->space->rename(new_name, new_path, true,
- replace_new_file);
- ut_free(new_path);
+ /* Remove table from the hash tables of tables */
+ HASH_DELETE(dict_table_t, name_hash, &dict_sys.table_hash,
+ my_crc32c(0, table->name.m_name, old_name_len), table);
- /* If the tablespace is remote, a new .isl file was created
- If success, delete the old one. If not, delete the new one. */
- if (DICT_TF_HAS_DATA_DIR(table->flags)) {
- RemoteDatafile::delete_link_file(
- err == DB_SUCCESS ? old_name : new_name);
- }
+ bool keep_mdl_name = !table->name.is_temporary();
- if (err != DB_SUCCESS) {
- return err;
- }
+ if (!keep_mdl_name) {
+ } else if (const char* s = static_cast<const char*>
+ (memchr(new_name.data(), '/', new_name.size()))) {
+ keep_mdl_name = new_name.end() - s >= 5
+ && !memcmp(s, "/#sql", 5);
}
- /* Remove table from the hash tables of tables */
- HASH_DELETE(dict_table_t, name_hash, &dict_sys.table_hash,
- ut_fold_string(old_name), table);
+ if (keep_mdl_name) {
+ /* Preserve the original table name for
+ dict_table_t::parse_name() and dict_acquire_mdl_shared(). */
+ table->mdl_name.m_name = mem_heap_strdup(table->heap,
+ table->name.m_name);
+ }
- if (strlen(new_name) > strlen(table->name.m_name)) {
+ if (new_name.size() > strlen(table->name.m_name)) {
/* We allocate MAX_FULL_NAME_LEN + 1 bytes here to avoid
memory fragmentation, we assume a repeated calls of
ut_realloc() with the same size do not cause fragmentation */
- ut_a(strlen(new_name) <= MAX_FULL_NAME_LEN);
+ ut_a(new_name.size() <= MAX_FULL_NAME_LEN);
table->name.m_name = static_cast<char*>(
ut_realloc(table->name.m_name, MAX_FULL_NAME_LEN + 1));
}
- strcpy(table->name.m_name, new_name);
+ memcpy(table->name.m_name, new_name.data(), new_name.size());
+ table->name.m_name[new_name.size()] = '\0';
+
+ if (!keep_mdl_name) {
+ table->mdl_name.m_name = table->name.m_name;
+ }
/* Add table to hash table of tables */
HASH_INSERT(dict_table_t, name_hash, &dict_sys.table_hash, fold,
table);
- if (!rename_also_foreigns) {
+ if (table->name.is_temporary()) {
/* In ALTER TABLE we think of the rename table operation
in the direction table -> temporary table (#sql...)
as dropping the table with the old name and creating
@@ -1683,15 +1657,13 @@ dict_table_rename_in_cache(
to store foreign key constraint name in charset
my_charset_filename for comparison further below. */
char fkid[MAX_TABLE_NAME_LEN * 2 + 20];
- ibool on_tmp = FALSE;
/* The old table name in my_charset_filename is stored
in old_name_cs_filename */
strcpy(old_name_cs_filename, old_name);
old_name_cs_filename[MAX_FULL_NAME_LEN] = '\0';
- if (strstr(old_name, TEMP_TABLE_PATH_PREFIX) == NULL) {
-
+ if (!dict_table_t::is_temporary_name(old_name)) {
innobase_convert_to_system_charset(
strchr(old_name_cs_filename, '/') + 1,
strchr(old_name, '/') + 1,
@@ -1719,13 +1691,14 @@ dict_table_rename_in_cache(
strncpy(fkid, foreign->id, (sizeof fkid) - 1);
fkid[(sizeof fkid) - 1] = '\0';
- if (strstr(fkid, TEMP_TABLE_PATH_PREFIX) == NULL) {
+ const bool on_tmp = dict_table_t::is_temporary_name(
+ fkid);
+
+ if (!on_tmp) {
innobase_convert_to_filename_charset(
strchr(fkid, '/') + 1,
strchr(foreign->id, '/') + 1,
MAX_TABLE_NAME_LEN+20);
- } else {
- on_tmp = TRUE;
}
old_id = mem_strdup(foreign->id);
@@ -1860,7 +1833,7 @@ dict_table_change_id_in_cache(
dict_table_t* table, /*!< in/out: table object already in cache */
table_id_t new_id) /*!< in: new id to set */
{
- ut_ad(mutex_own(&dict_sys.mutex));
+ ut_ad(dict_sys.locked());
ut_ad(table->magic_n == DICT_TABLE_MAGIC_N);
ut_ad(!table->is_temporary());
@@ -1917,7 +1890,9 @@ void dict_sys_t::remove(dict_table_t* table, bool lru, bool keep)
/* Remove table from the hash tables of tables */
HASH_DELETE(dict_table_t, name_hash, &table_hash,
- ut_fold_string(table->name.m_name), table);
+ my_crc32c(0, table->name.m_name,
+ strlen(table->name.m_name)),
+ table);
hash_table_t* id_hash = table->is_temporary()
? &temp_id_hash : &table_id_hash;
@@ -1931,31 +1906,16 @@ void dict_sys_t::remove(dict_table_t* table, bool lru, bool keep)
UT_LIST_REMOVE(table_non_LRU, table);
}
- if (lru && table->drop_aborted) {
- /* When evicting the table definition,
- drop the orphan indexes from the data dictionary
- and free the index pages. */
- trx_t* trx = trx_create();
-
- ut_d(dict_sys.assert_locked());
- /* Mimic row_mysql_lock_data_dictionary(). */
- trx->dict_operation_lock_mode = RW_X_LATCH;
-
- trx_set_dict_operation(trx, TRX_DICT_OP_INDEX);
- row_merge_drop_indexes_dict(trx, table->id);
- trx_commit_for_mysql(trx);
- trx->dict_operation_lock_mode = 0;
- trx->free();
- }
-
/* Free virtual column template if any */
if (table->vc_templ != NULL) {
dict_free_vc_templ(table->vc_templ);
UT_DELETE(table->vc_templ);
}
+ table->lock_mutex_destroy();
+
if (keep) {
- table->autoinc_mutex.~mutex();
+ table->autoinc_mutex.destroy();
return;
}
@@ -1966,20 +1926,20 @@ void dict_sys_t::remove(dict_table_t* table, bool lru, bool keep)
table->fts = nullptr;
}
- table->autoinc_mutex.lock();
+ table->autoinc_mutex.wr_lock();
ulint freed = UT_LIST_GET_LEN(table->freed_indexes);
table->vc_templ = NULL;
table->id = 0;
- table->autoinc_mutex.unlock();
+ table->autoinc_mutex.wr_unlock();
if (UNIV_UNLIKELY(freed != 0)) {
return;
}
#endif /* BTR_CUR_HASH_ADAPT */
- table->autoinc_mutex.~mutex();
+ table->autoinc_mutex.destroy();
dict_mem_table_free(table);
}
@@ -2025,7 +1985,7 @@ dict_index_add_to_cache(
ulint n_ord;
ulint i;
- ut_ad(mutex_own(&dict_sys.mutex));
+ ut_ad(dict_sys.locked());
ut_ad(index->n_def == index->n_fields);
ut_ad(index->magic_n == DICT_INDEX_MAGIC_N);
ut_ad(!dict_index_is_online_ddl(index));
@@ -2063,9 +2023,6 @@ dict_index_add_to_cache(
new_index->trx_id = index->trx_id;
new_index->set_committed(index->is_committed());
new_index->nulls_equal = index->nulls_equal;
-#ifdef MYSQL_INDEX_DISABLE_AHI
- new_index->disable_ahi = index->disable_ahi;
-#endif
n_ord = new_index->n_uniq;
/* Flag the ordering columns and also set column max_prefix */
@@ -2134,8 +2091,7 @@ dict_index_add_to_cache(
#endif /* BTR_CUR_ADAPT */
new_index->page = unsigned(page_no);
- rw_lock_create(index_tree_rw_lock_key, &new_index->lock,
- SYNC_INDEX_TREE);
+ new_index->lock.SRW_LOCK_INIT(index_tree_rw_lock_key);
new_index->n_core_fields = new_index->n_fields;
@@ -2146,6 +2102,7 @@ dict_index_add_to_cache(
/**********************************************************************//**
Removes an index from the dictionary cache. */
+TRANSACTIONAL_TARGET
static
void
dict_index_remove_from_cache_low(
@@ -2158,7 +2115,7 @@ dict_index_remove_from_cache_low(
ut_ad(table && index);
ut_ad(table->magic_n == DICT_TABLE_MAGIC_N);
ut_ad(index->magic_n == DICT_INDEX_MAGIC_N);
- ut_ad(mutex_own(&dict_sys.mutex));
+ ut_ad(dict_sys.locked());
ut_ad(table->id);
#ifdef BTR_CUR_HASH_ADAPT
ut_ad(!index->freed());
@@ -2168,7 +2125,6 @@ dict_index_remove_from_cache_low(
there can't be any active operations on this index (or table). */
if (index->online_log) {
- ut_ad(index->online_status == ONLINE_INDEX_CREATION);
row_log_free(index->online_log);
index->online_log = NULL;
}
@@ -2178,9 +2134,9 @@ dict_index_remove_from_cache_low(
/* The index is being dropped, remove any compression stats for it. */
if (!lru_evict && DICT_TF_GET_ZIP_SSIZE(index->table->flags)) {
- mutex_enter(&page_zip_stat_per_index_mutex);
+ mysql_mutex_lock(&page_zip_stat_per_index_mutex);
page_zip_stat_per_index.erase(index->id);
- mutex_exit(&page_zip_stat_per_index_mutex);
+ mysql_mutex_unlock(&page_zip_stat_per_index_mutex);
}
/* Remove the index from affected virtual column index list */
@@ -2199,15 +2155,15 @@ dict_index_remove_from_cache_low(
zero. See also: dict_table_can_be_evicted() */
if (index->n_ahi_pages()) {
- table->autoinc_mutex.lock();
+ table->autoinc_mutex.wr_lock();
index->set_freed();
UT_LIST_ADD_LAST(table->freed_indexes, index);
- table->autoinc_mutex.unlock();
+ table->autoinc_mutex.wr_unlock();
return;
}
#endif /* BTR_CUR_HASH_ADAPT */
- rw_lock_free(&index->lock);
+ index->lock.free();
dict_mem_index_free(index);
}
@@ -2240,7 +2196,7 @@ dict_index_find_cols(
const dict_table_t* table = index->table;
ut_ad(table->magic_n == DICT_TABLE_MAGIC_N);
- ut_ad(mutex_own(&dict_sys.mutex));
+ ut_ad(dict_sys.locked());
for (ulint i = 0; i < index->n_fields; i++) {
ulint j;
@@ -2509,7 +2465,7 @@ dict_index_build_internal_clust(
ut_ad(index->is_primary());
ut_ad(!index->has_virtual());
- ut_ad(mutex_own(&dict_sys.mutex));
+ ut_ad(dict_sys.locked());
/* Create a new index object with certainly enough fields */
new_index = dict_mem_index_create(index->table, index->name,
@@ -2664,7 +2620,7 @@ dict_index_build_internal_non_clust(
ut_ad(table && index);
ut_ad(!dict_index_is_clust(index));
ut_ad(!dict_index_is_ibuf(index));
- ut_ad(mutex_own(&dict_sys.mutex));
+ ut_ad(dict_sys.locked());
/* The clustered index should be the first in the list of indexes */
clust_index = UT_LIST_GET_FIRST(table->indexes);
@@ -2758,7 +2714,7 @@ dict_index_build_internal_fts(
dict_index_t* new_index;
ut_ad(index->type & DICT_FTS);
- ut_ad(mutex_own(&dict_sys.mutex));
+ ut_ad(dict_sys.locked());
/* Create a new index */
new_index = dict_mem_index_create(index->table, index->name,
@@ -2783,26 +2739,15 @@ dict_index_build_internal_fts(
table->fts->cache = fts_cache_create(table);
}
- rw_lock_x_lock(&table->fts->cache->init_lock);
+ mysql_mutex_lock(&table->fts->cache->init_lock);
/* Notify the FTS cache about this index. */
fts_cache_index_cache_create(table, new_index);
- rw_lock_x_unlock(&table->fts->cache->init_lock);
+ mysql_mutex_unlock(&table->fts->cache->init_lock);
return(new_index);
}
/*====================== FOREIGN KEY PROCESSING ========================*/
-/*********************************************************************//**
-Checks if a table is referenced by foreign keys.
-@return TRUE if table is referenced by a foreign key */
-ibool
-dict_table_is_referenced_by_foreign_key(
-/*====================================*/
- const dict_table_t* table) /*!< in: InnoDB table */
-{
- return(!table->referenced_set.empty());
-}
-
/**********************************************************************//**
Removes a foreign constraint struct from the dictionary cache. */
void
@@ -2810,7 +2755,7 @@ dict_foreign_remove_from_cache(
/*===========================*/
dict_foreign_t* foreign) /*!< in, own: foreign constraint */
{
- ut_ad(mutex_own(&dict_sys.mutex));
+ ut_ad(dict_sys.locked());
ut_a(foreign);
if (foreign->referenced_table != NULL) {
@@ -2835,7 +2780,7 @@ dict_foreign_find(
dict_table_t* table, /*!< in: table object */
dict_foreign_t* foreign) /*!< in: foreign constraint */
{
- ut_ad(mutex_own(&dict_sys.mutex));
+ ut_ad(dict_sys.frozen());
ut_ad(dict_foreign_set_validate(table->foreign_set));
ut_ad(dict_foreign_set_validate(table->referenced_set));
@@ -2889,7 +2834,7 @@ dict_foreign_find_index(
/*!< out: index where error
happened */
{
- ut_ad(mutex_own(&dict_sys.mutex));
+ ut_ad(dict_sys.frozen());
if (error) {
*error = FK_INDEX_NOT_FOUND;
@@ -2942,7 +2887,7 @@ dict_foreign_error_report(
const char* msg) /*!< in: the error message */
{
std::string fk_str;
- mutex_enter(&dict_foreign_err_mutex);
+ mysql_mutex_lock(&dict_foreign_err_mutex);
dict_foreign_error_report_low(file, fk->foreign_table_name);
fputs(msg, file);
fputs(" Constraint:\n", file);
@@ -2954,7 +2899,7 @@ dict_foreign_error_report(
" %s\n%s\n", fk->foreign_index->name(),
FOREIGN_KEY_CONSTRAINTS_MSG);
}
- mutex_exit(&dict_foreign_err_mutex);
+ mysql_mutex_unlock(&dict_foreign_err_mutex);
}
/**********************************************************************//**
@@ -2987,13 +2932,15 @@ dict_foreign_add_to_cache(
DBUG_ENTER("dict_foreign_add_to_cache");
DBUG_PRINT("dict_foreign_add_to_cache", ("id: %s", foreign->id));
- ut_ad(mutex_own(&dict_sys.mutex));
+ ut_ad(dict_sys.locked());
- for_table = dict_table_check_if_in_cache_low(
- foreign->foreign_table_name_lookup);
+ for_table = dict_sys.find_table(
+ {foreign->foreign_table_name_lookup,
+ strlen(foreign->foreign_table_name_lookup)});
- ref_table = dict_table_check_if_in_cache_low(
- foreign->referenced_table_name_lookup);
+ ref_table = dict_sys.find_table(
+ {foreign->referenced_table_name_lookup,
+ strlen(foreign->referenced_table_name_lookup)});
ut_a(for_table || ref_table);
if (for_table) {
@@ -3389,8 +3336,8 @@ dict_get_referenced_table(
}
/* Copy database_name, '/', table_name, '\0' */
- ref = static_cast<char*>(mem_heap_alloc(
- heap, database_name_len + table_name_len + 2));
+ const size_t len = database_name_len + table_name_len + 1;
+ ref = static_cast<char*>(mem_heap_alloc(heap, len + 1));
memcpy(ref, database_name, database_name_len);
ref[database_name_len] = '/';
memcpy(ref + database_name_len + 1, table_name, table_name_len + 1);
@@ -3398,22 +3345,22 @@ dict_get_referenced_table(
/* Values; 0 = Store and compare as given; case sensitive
1 = Store and compare in lower; case insensitive
2 = Store as given, compare in lower; case semi-sensitive */
- if (innobase_get_lower_case_table_names() == 2) {
+ if (lower_case_table_names == 2) {
innobase_casedn_str(ref);
- *table = dict_table_get_low(ref);
+ *table = dict_sys.load_table({ref, len});
memcpy(ref, database_name, database_name_len);
ref[database_name_len] = '/';
memcpy(ref + database_name_len + 1, table_name, table_name_len + 1);
} else {
#ifndef _WIN32
- if (innobase_get_lower_case_table_names() == 1) {
+ if (lower_case_table_names == 1) {
innobase_casedn_str(ref);
}
#else
innobase_casedn_str(ref);
#endif /* !_WIN32 */
- *table = dict_table_get_low(ref);
+ *table = dict_sys.load_table({ref, len});
}
return(ref);
@@ -3633,7 +3580,7 @@ dict_foreign_parse_drop_constraints(
ptr = str;
- ut_ad(mutex_own(&dict_sys.mutex));
+ ut_ad(dict_sys.locked());
loop:
ptr = dict_scan_to(ptr, "DROP");
@@ -3692,7 +3639,7 @@ loop:
if (!srv_read_only_mode) {
FILE* ef = dict_foreign_err_file;
- mutex_enter(&dict_foreign_err_mutex);
+ mysql_mutex_lock(&dict_foreign_err_mutex);
rewind(ef);
ut_print_timestamp(ef);
fputs(" Error in dropping of a foreign key"
@@ -3701,7 +3648,7 @@ loop:
fprintf(ef, ",\nin SQL command\n%s"
"\nCannot find a constraint with the"
" given id %s.\n", str, id);
- mutex_exit(&dict_foreign_err_mutex);
+ mysql_mutex_unlock(&dict_foreign_err_mutex);
}
ut_free(str);
@@ -3715,7 +3662,7 @@ syntax_error:
if (!srv_read_only_mode) {
FILE* ef = dict_foreign_err_file;
- mutex_enter(&dict_foreign_err_mutex);
+ mysql_mutex_lock(&dict_foreign_err_mutex);
rewind(ef);
ut_print_timestamp(ef);
fputs(" Syntax error in dropping of a"
@@ -3723,7 +3670,7 @@ syntax_error:
ut_print_name(ef, NULL, table->name.m_name);
fprintf(ef, ",\n"
"close to:\n%s\n in SQL command\n%s\n", ptr, str);
- mutex_exit(&dict_foreign_err_mutex);
+ mysql_mutex_unlock(&dict_foreign_err_mutex);
}
ut_free(str);
@@ -3735,14 +3682,14 @@ syntax_error:
/**********************************************************************//**
Returns an index object if it is found in the dictionary cache.
-Assumes that dict_sys.mutex is already being held.
+Assumes that dict_sys.latch is already being held.
@return index, NULL if not found */
dict_index_t*
dict_index_get_if_in_cache_low(
/*===========================*/
index_id_t index_id) /*!< in: index id */
{
- ut_ad(mutex_own(&dict_sys.mutex));
+ ut_ad(dict_sys.frozen());
for (dict_table_t *table= UT_LIST_GET_FIRST(dict_sys.table_LRU);
table; table= UT_LIST_GET_NEXT(table_LRU, table))
@@ -3772,11 +3719,11 @@ dict_index_get_if_in_cache(
return(NULL);
}
- mutex_enter(&dict_sys.mutex);
+ dict_sys.freeze(SRW_LOCK_CALL);
index = dict_index_get_if_in_cache_low(index_id);
- mutex_exit(&dict_sys.mutex);
+ dict_sys.unfreeze();
return(index);
}
@@ -4062,7 +4009,7 @@ dict_print_info_on_foreign_keys(
dict_foreign_t* foreign;
std::string str;
- mutex_enter(&dict_sys.mutex);
+ dict_sys.freeze(SRW_LOCK_CALL);
for (dict_foreign_set::iterator it = table->foreign_set.begin();
it != table->foreign_set.end();
@@ -4129,89 +4076,14 @@ dict_print_info_on_foreign_keys(
}
}
- mutex_exit(&dict_sys.mutex);
+ dict_sys.unfreeze();
return str;
}
-/** Given a space_id of a file-per-table tablespace, search the
-dict_sys.table_LRU list and return the dict_table_t* pointer for it.
-@param space tablespace
-@return table if found, NULL if not */
-static
-dict_table_t*
-dict_find_single_table_by_space(const fil_space_t* space)
-{
- dict_table_t* table;
- ulint num_item;
- ulint count = 0;
-
- ut_ad(space->id > 0);
-
- if (!dict_sys.is_initialised()) {
- /* This could happen when it's in redo processing. */
- return(NULL);
- }
-
- table = UT_LIST_GET_FIRST(dict_sys.table_LRU);
- num_item = UT_LIST_GET_LEN(dict_sys.table_LRU);
-
- /* This function intentionally does not acquire mutex as it is used
- by error handling code in deep call stack as last means to avoid
- killing the server, so it worth to risk some consequences for
- the action. */
- while (table && count < num_item) {
- if (table->space == space) {
- if (dict_table_is_file_per_table(table)) {
- return(table);
- }
- return(NULL);
- }
-
- table = UT_LIST_GET_NEXT(table_LRU, table);
- count++;
- }
-
- return(NULL);
-}
-
-/**********************************************************************//**
-Flags a table with specified space_id corrupted in the data dictionary
-cache
-@return true if successful */
-bool dict_set_corrupted_by_space(const fil_space_t* space)
-{
- dict_table_t* table;
-
- table = dict_find_single_table_by_space(space);
-
- if (!table) {
- return false;
- }
-
- /* mark the table->corrupted bit only, since the caller
- could be too deep in the stack for SYS_INDEXES update */
- table->corrupted = true;
- table->file_unreadable = true;
- return true;
-}
-
-/** Flag a table encrypted in the data dictionary cache. */
-void dict_set_encrypted_by_space(const fil_space_t* space)
-{
- if (dict_table_t* table = dict_find_single_table_by_space(space)) {
- table->file_unreadable = true;
- }
-}
-
/**********************************************************************//**
Flags an index corrupted both in the data dictionary cache
and in the SYS_INDEXES */
-void
-dict_set_corrupted(
-/*===============*/
- dict_index_t* index, /*!< in/out: index */
- trx_t* trx, /*!< in/out: transaction */
- const char* ctx) /*!< in: context */
+void dict_set_corrupted(dict_index_t *index, const char *ctx)
{
mem_heap_t* heap;
mtr_t mtr;
@@ -4221,21 +4093,17 @@ dict_set_corrupted(
byte* buf;
const char* status;
btr_cur_t cursor;
- bool locked = RW_X_LATCH == trx->dict_operation_lock_mode;
- if (!locked) {
- row_mysql_lock_data_dictionary(trx);
- }
+ dict_sys.lock(SRW_LOCK_CALL);
- ut_ad(mutex_own(&dict_sys.mutex));
ut_ad(!dict_table_is_comp(dict_sys.sys_tables));
ut_ad(!dict_table_is_comp(dict_sys.sys_indexes));
- ut_ad(!sync_check_iterate(dict_sync_check()));
/* Mark the table as corrupted only if the clustered index
is corrupted */
if (dict_index_is_clust(index)) {
index->table->corrupted = TRUE;
+ goto func_exit;
}
if (index->type & DICT_CORRUPT) {
@@ -4273,10 +4141,12 @@ dict_set_corrupted(
dfield_set_data(dfield, buf, 8);
dict_index_copy_types(tuple, sys_index, 2);
+ cursor.page_cur.index = sys_index;
- btr_cur_search_to_nth_level(sys_index, 0, tuple, PAGE_CUR_LE,
- BTR_MODIFY_LEAF,
- &cursor, __FILE__, __LINE__, &mtr);
+ if (cursor.search_leaf(tuple, PAGE_CUR_LE, BTR_MODIFY_LEAF, &mtr)
+ != DB_SUCCESS) {
+ goto fail;
+ }
if (cursor.low_match == dtuple_get_n_fields(tuple)) {
/* UPDATE SYS_INDEXES SET TYPE=index->type
@@ -4296,39 +4166,12 @@ fail:
}
mtr_commit(&mtr);
- mem_heap_empty(heap);
+ mem_heap_free(heap);
ib::error() << status << " corruption of " << index->name
<< " in table " << index->table->name << " in " << ctx;
- mem_heap_free(heap);
func_exit:
- if (!locked) {
- row_mysql_unlock_data_dictionary(trx);
- }
-}
-
-/** Flags an index corrupted in the data dictionary cache only. This
-is used mostly to mark a corrupted index when index's own dictionary
-is corrupted, and we force to load such index for repair purpose
-@param[in,out] index index which is corrupted */
-void
-dict_set_corrupted_index_cache_only(
- dict_index_t* index)
-{
- ut_ad(index != NULL);
- ut_ad(index->table != NULL);
- ut_ad(mutex_own(&dict_sys.mutex));
- ut_ad(!dict_table_is_comp(dict_sys.sys_tables));
- ut_ad(!dict_table_is_comp(dict_sys.sys_indexes));
-
- /* Mark the table as corrupted only if the clustered index
- is corrupted */
- if (dict_index_is_clust(index)) {
- index->table->corrupted = TRUE;
- index->table->file_unreadable = true;
- }
-
- index->type |= DICT_CORRUPT;
+ dict_sys.unlock();
}
/** Sets merge_threshold in the SYS_INDEXES
@@ -4351,13 +4194,11 @@ dict_index_set_merge_threshold(
ut_ad(!dict_table_is_comp(dict_sys.sys_tables));
ut_ad(!dict_table_is_comp(dict_sys.sys_indexes));
- dict_sys_lock();
-
heap = mem_heap_create(sizeof(dtuple_t) + 2 * (sizeof(dfield_t)
+ sizeof(que_fork_t) + sizeof(upd_node_t)
+ sizeof(upd_t) + 12));
- mtr_start(&mtr);
+ mtr.start();
sys_index = UT_LIST_GET_FIRST(dict_sys.sys_indexes->indexes);
@@ -4375,10 +4216,12 @@ dict_index_set_merge_threshold(
dfield_set_data(dfield, buf, 8);
dict_index_copy_types(tuple, sys_index, 2);
+ cursor.page_cur.index = sys_index;
- btr_cur_search_to_nth_level(sys_index, 0, tuple, PAGE_CUR_GE,
- BTR_MODIFY_LEAF,
- &cursor, __FILE__, __LINE__, &mtr);
+ if (cursor.search_leaf(tuple, PAGE_CUR_GE, BTR_MODIFY_LEAF, &mtr)
+ != DB_SUCCESS) {
+ goto func_exit;
+ }
if (cursor.up_match == dtuple_get_n_fields(tuple)
&& rec_get_n_fields_old(btr_cur_get_rec(&cursor))
@@ -4393,10 +4236,9 @@ dict_index_set_merge_threshold(
field, merge_threshold);
}
+func_exit:
mtr_commit(&mtr);
mem_heap_free(heap);
-
- dict_sys_unlock();
}
#ifdef UNIV_DEBUG
@@ -4414,10 +4256,10 @@ dict_set_merge_threshold_list_debug(
for (dict_index_t* index = UT_LIST_GET_FIRST(table->indexes);
index != NULL;
index = UT_LIST_GET_NEXT(indexes, index)) {
- rw_lock_x_lock(dict_index_get_lock(index));
+ index->lock.x_lock(SRW_LOCK_CALL);
index->merge_threshold = merge_threshold_all
& ((1U << 6) - 1);
- rw_lock_x_unlock(dict_index_get_lock(index));
+ index->lock.x_unlock();
}
}
}
@@ -4428,14 +4270,14 @@ void
dict_set_merge_threshold_all_debug(
uint merge_threshold_all)
{
- mutex_enter(&dict_sys.mutex);
+ dict_sys.freeze(SRW_LOCK_CALL);
dict_set_merge_threshold_list_debug(
&dict_sys.table_LRU, merge_threshold_all);
dict_set_merge_threshold_list_debug(
&dict_sys.table_non_LRU, merge_threshold_all);
- mutex_exit(&dict_sys.mutex);
+ dict_sys.unfreeze();
}
#endif /* UNIV_DEBUG */
@@ -4553,7 +4395,7 @@ dict_table_check_for_dup_indexes(
const dict_index_t* index1;
const dict_index_t* index2;
- ut_ad(mutex_own(&dict_sys.mutex));
+ ut_ad(dict_sys.frozen());
/* The primary index _must_ exist */
ut_a(UT_LIST_GET_LEN(table->indexes) > 0);
@@ -4596,220 +4438,6 @@ dict_table_check_for_dup_indexes(
}
#endif /* UNIV_DEBUG */
-/** Auxiliary macro used inside dict_table_schema_check(). */
-#define CREATE_TYPES_NAMES() \
- dtype_sql_name((unsigned) req_schema->columns[i].mtype, \
- (unsigned) req_schema->columns[i].prtype_mask, \
- (unsigned) req_schema->columns[i].len, \
- req_type, sizeof(req_type)); \
- dtype_sql_name(table->cols[j].mtype, \
- table->cols[j].prtype, \
- table->cols[j].len, \
- actual_type, sizeof(actual_type))
-
-/*********************************************************************//**
-Checks whether a table exists and whether it has the given structure.
-The table must have the same number of columns with the same names and
-types. The order of the columns does not matter.
-The caller must own the dictionary mutex.
-dict_table_schema_check() @{
-@return DB_SUCCESS if the table exists and contains the necessary columns */
-dberr_t
-dict_table_schema_check(
-/*====================*/
- dict_table_schema_t* req_schema, /*!< in/out: required table
- schema */
- char* errstr, /*!< out: human readable error
- message if != DB_SUCCESS is
- returned */
- size_t errstr_sz) /*!< in: errstr size */
-{
- char buf[MAX_FULL_NAME_LEN];
- char req_type[64];
- char actual_type[64];
- dict_table_t* table;
- ulint i;
-
- ut_ad(mutex_own(&dict_sys.mutex));
-
- table = dict_table_get_low(req_schema->table_name);
-
- if (table == NULL) {
- bool should_print=true;
- /* no such table */
-
- if (innobase_strcasecmp(req_schema->table_name, "mysql/innodb_table_stats") == 0) {
- if (innodb_table_stats_not_found_reported == false && !opt_bootstrap) {
- innodb_table_stats_not_found = true;
- innodb_table_stats_not_found_reported = true;
- } else {
- should_print = false;
- }
- } else if (innobase_strcasecmp(req_schema->table_name, "mysql/innodb_index_stats") == 0 ) {
- if (innodb_index_stats_not_found_reported == false && !opt_bootstrap) {
- innodb_index_stats_not_found = true;
- innodb_index_stats_not_found_reported = true;
- } else {
- should_print = false;
- }
- }
-
- if (should_print) {
- snprintf(errstr, errstr_sz,
- "Table %s not found.",
- ut_format_name(req_schema->table_name,
- buf, sizeof(buf)));
- return(DB_TABLE_NOT_FOUND);
- } else {
- return(DB_STATS_DO_NOT_EXIST);
- }
- }
-
- if (!table->is_readable() && !table->space) {
- /* missing tablespace */
-
- snprintf(errstr, errstr_sz,
- "Tablespace for table %s is missing.",
- ut_format_name(req_schema->table_name,
- buf, sizeof(buf)));
-
- return(DB_TABLE_NOT_FOUND);
- }
-
- if (ulint(table->n_def - DATA_N_SYS_COLS) != req_schema->n_cols) {
- /* the table has a different number of columns than required */
- snprintf(errstr, errstr_sz,
- "%s has %d columns but should have " ULINTPF ".",
- ut_format_name(req_schema->table_name, buf,
- sizeof buf),
- table->n_def - DATA_N_SYS_COLS,
- req_schema->n_cols);
-
- return(DB_ERROR);
- }
-
- /* For each column from req_schema->columns[] search
- whether it is present in table->cols[].
- The following algorithm is O(n_cols^2), but is optimized to
- be O(n_cols) if the columns are in the same order in both arrays. */
-
- for (i = 0; i < req_schema->n_cols; i++) {
- ulint j = dict_table_has_column(
- table, req_schema->columns[i].name, i);
-
- if (j == table->n_def) {
-
- snprintf(errstr, errstr_sz,
- "required column %s"
- " not found in table %s.",
- req_schema->columns[i].name,
- ut_format_name(
- req_schema->table_name,
- buf, sizeof(buf)));
-
- return(DB_ERROR);
- }
-
- /* we found a column with the same name on j'th position,
- compare column types and flags */
-
- /* check length for exact match */
- if (req_schema->columns[i].len == table->cols[j].len) {
- } else if (!strcmp(req_schema->table_name, TABLE_STATS_NAME)
- || !strcmp(req_schema->table_name,
- INDEX_STATS_NAME)) {
- ut_ad(table->cols[j].len < req_schema->columns[i].len);
- ib::warn() << "Table " << req_schema->table_name
- << " has length mismatch in the"
- << " column name "
- << req_schema->columns[i].name
- << ". Please run mysql_upgrade";
- } else {
- CREATE_TYPES_NAMES();
-
- snprintf(errstr, errstr_sz,
- "Column %s in table %s is %s"
- " but should be %s (length mismatch).",
- req_schema->columns[i].name,
- ut_format_name(req_schema->table_name,
- buf, sizeof(buf)),
- actual_type, req_type);
-
- return(DB_ERROR);
- }
-
- /*
- check mtype for exact match.
- This check is relaxed to allow use to use TIMESTAMP
- (ie INT) for last_update instead of DATA_BINARY.
- We have to test for both values as the innodb_table_stats
- table may come from MySQL and have the old type.
- */
- if (req_schema->columns[i].mtype != table->cols[j].mtype &&
- !(req_schema->columns[i].mtype == DATA_INT &&
- table->cols[j].mtype == DATA_FIXBINARY))
- {
- CREATE_TYPES_NAMES();
-
- snprintf(errstr, errstr_sz,
- "Column %s in table %s is %s"
- " but should be %s (type mismatch).",
- req_schema->columns[i].name,
- ut_format_name(req_schema->table_name,
- buf, sizeof(buf)),
- actual_type, req_type);
-
- return(DB_ERROR);
- }
-
- /* check whether required prtype mask is set */
- if (req_schema->columns[i].prtype_mask != 0
- && (table->cols[j].prtype
- & req_schema->columns[i].prtype_mask)
- != req_schema->columns[i].prtype_mask) {
-
- CREATE_TYPES_NAMES();
-
- snprintf(errstr, errstr_sz,
- "Column %s in table %s is %s"
- " but should be %s (flags mismatch).",
- req_schema->columns[i].name,
- ut_format_name(req_schema->table_name,
- buf, sizeof(buf)),
- actual_type, req_type);
-
- return(DB_ERROR);
- }
- }
-
- if (req_schema->n_foreign != table->foreign_set.size()) {
- snprintf(
- errstr, errstr_sz,
- "Table %s has " ULINTPF " foreign key(s) pointing"
- " to other tables, but it must have " ULINTPF ".",
- ut_format_name(req_schema->table_name,
- buf, sizeof(buf)),
- static_cast<ulint>(table->foreign_set.size()),
- req_schema->n_foreign);
- return(DB_ERROR);
- }
-
- if (req_schema->n_referenced != table->referenced_set.size()) {
- snprintf(
- errstr, errstr_sz,
- "There are " ULINTPF " foreign key(s) pointing to %s, "
- "but there must be " ULINTPF ".",
- static_cast<ulint>(table->referenced_set.size()),
- ut_format_name(req_schema->table_name,
- buf, sizeof(buf)),
- req_schema->n_referenced);
- return(DB_ERROR);
- }
-
- return(DB_SUCCESS);
-}
-/* @} */
-
/*********************************************************************//**
Converts a database and table name from filesystem encoding
(e.g. d@i1b/a@q1b@1Kc, same format as used in dict_table_t::name) in two
@@ -4879,7 +4507,7 @@ void dict_sys_t::resize()
{
ut_ad(this == &dict_sys);
ut_ad(is_initialised());
- mutex_enter(&mutex);
+ lock(SRW_LOCK_CALL);
/* all table entries are in table_LRU and table_non_LRU lists */
table_hash.free();
@@ -4896,7 +4524,7 @@ void dict_sys_t::resize()
table= UT_LIST_GET_NEXT(table_LRU, table))
{
ut_ad(!table->is_temporary());
- ulint fold= ut_fold_string(table->name.m_name);
+ ulint fold= my_crc32c(0, table->name.m_name, strlen(table->name.m_name));
ulint id_fold= ut_fold_ull(table->id);
HASH_INSERT(dict_table_t, name_hash, &table_hash, fold, table);
@@ -4906,7 +4534,7 @@ void dict_sys_t::resize()
for (dict_table_t *table = UT_LIST_GET_FIRST(table_non_LRU); table;
table= UT_LIST_GET_NEXT(table_LRU, table))
{
- ulint fold= ut_fold_string(table->name.m_name);
+ ulint fold= my_crc32c(0, table->name.m_name, strlen(table->name.m_name));
ulint id_fold= ut_fold_ull(table->id);
HASH_INSERT(dict_table_t, name_hash, &table_hash, fold, table);
@@ -4917,7 +4545,7 @@ void dict_sys_t::resize()
HASH_INSERT(dict_table_t, id_hash, id_hash, id_fold, table);
}
- mutex_exit(&mutex);
+ unlock();
}
/** Close the data dictionary cache on shutdown. */
@@ -4926,10 +4554,10 @@ void dict_sys_t::close()
ut_ad(this == &dict_sys);
if (!is_initialised()) return;
- mutex_enter(&mutex);
+ lock(SRW_LOCK_CALL);
- /* Free the hash elements. We don't remove them from the table
- because we are going to destroy the table anyway. */
+ /* Free the hash elements. We don't remove them from table_hash
+ because we are invoking table_hash.free() below. */
for (ulint i= table_hash.n_cells; i--; )
while (dict_table_t *table= static_cast<dict_table_t*>
(HASH_GET_FIRST(&table_hash, i)))
@@ -4944,11 +4572,10 @@ void dict_sys_t::close()
/* No temporary tables should exist at this point. */
temp_id_hash.free();
- mutex_exit(&mutex);
- mutex_free(&mutex);
- rw_lock_free(&latch);
+ unlock();
+ latch.destroy();
- mutex_free(&dict_foreign_err_mutex);
+ mysql_mutex_destroy(&dict_foreign_err_mutex);
if (dict_foreign_err_file)
{
@@ -4970,7 +4597,7 @@ dict_lru_validate(void)
{
dict_table_t* table;
- ut_ad(mutex_own(&dict_sys.mutex));
+ ut_ad(dict_sys.frozen());
for (table = UT_LIST_GET_FIRST(dict_sys.table_LRU);
table != NULL;
@@ -5264,9 +4891,3 @@ dict_tf_to_row_format_string(
ut_error;
return(0);
}
-
-bool dict_table_t::is_stats_table() const
-{
- return !strcmp(name.m_name, TABLE_STATS_NAME) ||
- !strcmp(name.m_name, INDEX_STATS_NAME);
-}
diff --git a/storage/innobase/dict/dict0load.cc b/storage/innobase/dict/dict0load.cc
index 5db679dbfc9..129a2539341 100644
--- a/storage/innobase/dict/dict0load.cc
+++ b/storage/innobase/dict/dict0load.cc
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2016, 2022, MariaDB Corporation.
+Copyright (c) 2016, 2023, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -27,14 +27,13 @@ Created 4/24/1996 Heikki Tuuri
#include "dict0load.h"
-#include "mysql_version.h"
+#include "log.h"
#include "btr0pcur.h"
#include "btr0btr.h"
#include "dict0boot.h"
#include "dict0crea.h"
#include "dict0dict.h"
#include "dict0mem.h"
-#include "dict0priv.h"
#include "dict0stats.h"
#include "fsp0file.h"
#include "fts0priv.h"
@@ -44,20 +43,7 @@ Created 4/24/1996 Heikki Tuuri
#include "srv0start.h"
#include "srv0srv.h"
#include "fts0opt.h"
-
-/** Following are the InnoDB system tables. The positions in
-this array are referenced by enum dict_system_table_id. */
-static const char* SYSTEM_TABLE_NAME[] = {
- "SYS_TABLES",
- "SYS_INDEXES",
- "SYS_COLUMNS",
- "SYS_FIELDS",
- "SYS_FOREIGN",
- "SYS_FOREIGN_COLS",
- "SYS_TABLESPACES",
- "SYS_DATAFILES",
- "SYS_VIRTUAL"
-};
+#include "row0vers.h"
/** Loads a table definition and also all its index definitions.
@@ -73,70 +59,56 @@ key constraints are loaded into memory.
@param[out] fk_tables Related table names that must also be
loaded to ensure that all foreign key
constraints are loaded.
-@return table, NULL if does not exist; if the table is stored in an
-.ibd file, but the file does not exist, then we set the
-file_unreadable flag in the table object we return */
-static
-dict_table_t*
-dict_load_table_one(
- const table_name_t& name,
- dict_err_ignore_t ignore_err,
- dict_names_t& fk_tables);
-
-/** Load a table definition from a SYS_TABLES record to dict_table_t.
-Do not load any columns or indexes.
-@param[in] name Table name
-@param[in] rec SYS_TABLES record
-@param[out,own] table table, or NULL
-@return error message
-@retval NULL on success */
-static const char* dict_load_table_low(const table_name_t& name,
- const rec_t* rec, dict_table_t** table)
- MY_ATTRIBUTE((nonnull, warn_unused_result));
+@return table, possibly with file_unreadable flag set
+@retval nullptr if the table does not exist */
+static dict_table_t *dict_load_table_one(const span<const char> &name,
+ dict_err_ignore_t ignore_err,
+ dict_names_t &fk_tables);
/** Load an index definition from a SYS_INDEXES record to dict_index_t.
-If allocate=TRUE, we will create a dict_index_t structure and fill it
-accordingly. If allocated=FALSE, the dict_index_t will be supplied by
-the caller and filled with information read from the record.
@return error message
@retval NULL on success */
static
const char*
dict_load_index_low(
byte* table_id, /*!< in/out: table id (8 bytes),
- an "in" value if allocate=TRUE
- and "out" when allocate=FALSE */
+ an "in" value if mtr
+ and "out" when !mtr */
+ bool uncommitted, /*!< in: false=READ COMMITTED,
+ true=READ UNCOMMITTED */
mem_heap_t* heap, /*!< in/out: temporary memory heap */
const rec_t* rec, /*!< in: SYS_INDEXES record */
- ibool allocate, /*!< in: TRUE=allocate *index,
- FALSE=fill in a pre-allocated
- *index */
+ mtr_t* mtr, /*!< in/out: mini-transaction,
+ or nullptr if a pre-allocated
+ *index is to be filled in */
+ dict_table_t* table, /*!< in/out: table, or NULL */
dict_index_t** index); /*!< out,own: index, or NULL */
/** Load a table column definition from a SYS_COLUMNS record to dict_table_t.
-@return error message
-@retval NULL on success */
-static
-const char*
-dict_load_column_low(
- dict_table_t* table, /*!< in/out: table, could be NULL
- if we just populate a dict_column_t
- struct with information from
- a SYS_COLUMNS record */
- mem_heap_t* heap, /*!< in/out: memory heap
- for temporary storage */
- dict_col_t* column, /*!< out: dict_column_t to fill,
- or NULL if table != NULL */
- table_id_t* table_id, /*!< out: table id */
- const char** col_name, /*!< out: column name */
- const rec_t* rec, /*!< in: SYS_COLUMNS record */
- ulint* nth_v_col); /*!< out: if not NULL, this
- records the "n" of "nth" virtual
- column */
+@param table table, or nullptr if the output will be in column
+@param use_uncommitted 0=READ COMMITTED, 1=detect, 2=READ UNCOMMITTED
+@param heap memory heap for temporary storage
+@param column pointer to output buffer, or nullptr if table!=nullptr
+@param table_id table identifier
+@param col_name column name
+@param rec SYS_COLUMNS record
+@param mtr mini-transaction
+@param nth_v_col nullptr, or pointer to a counter of virtual columns
+@return error message
+@retval nullptr on success */
+static const char *dict_load_column_low(dict_table_t *table,
+ unsigned use_uncommitted,
+ mem_heap_t *heap, dict_col_t *column,
+ table_id_t *table_id,
+ const char **col_name,
+ const rec_t *rec,
+ mtr_t *mtr,
+ ulint *nth_v_col);
/** Load a virtual column "mapping" (to base columns) information
from a SYS_VIRTUAL record
@param[in,out] table table
+@param[in] uncommitted false=READ COMMITTED, true=READ UNCOMMITTED
@param[in,out] column mapped base column's dict_column_t
@param[in,out] table_id table id
@param[in,out] pos virtual column position
@@ -148,6 +120,7 @@ static
const char*
dict_load_virtual_low(
dict_table_t* table,
+ bool uncommitted,
dict_col_t** column,
table_id_t* table_id,
ulint* pos,
@@ -163,6 +136,8 @@ dict_load_field_low(
byte* index_id, /*!< in/out: index id (8 bytes)
an "in" value if index != NULL
and "out" if index == NULL */
+ bool uncommitted, /*!< in: false=READ COMMITTED,
+ true=READ UNCOMMITTED */
dict_index_t* index, /*!< in/out: index, could be NULL
if we just populate a dict_field_t
struct with information from
@@ -173,12 +148,9 @@ dict_load_field_low(
byte* last_index_id, /*!< in: last index id */
mem_heap_t* heap, /*!< in/out: memory heap
for temporary storage */
+ mtr_t* mtr, /*!< in/out: mini-transaction */
const rec_t* rec); /*!< in: SYS_FIELDS record */
-/* If this flag is TRUE, then we will load the cluster index's (and tables')
-metadata even if it is marked as "corrupted". */
-my_bool srv_load_corrupted;
-
#ifdef UNIV_DEBUG
/****************************************************************//**
Compare the name of an index column.
@@ -201,89 +173,6 @@ name_of_col_is(
#endif /* UNIV_DEBUG */
/********************************************************************//**
-Finds the first table name in the given database.
-@return own: table name, NULL if does not exist; the caller must free
-the memory in the string! */
-char*
-dict_get_first_table_name_in_db(
-/*============================*/
- const char* name) /*!< in: database name which ends in '/' */
-{
- dict_table_t* sys_tables;
- btr_pcur_t pcur;
- dict_index_t* sys_index;
- dtuple_t* tuple;
- mem_heap_t* heap;
- dfield_t* dfield;
- const rec_t* rec;
- const byte* field;
- ulint len;
- mtr_t mtr;
-
- ut_ad(mutex_own(&dict_sys.mutex));
-
- heap = mem_heap_create(1000);
-
- mtr_start(&mtr);
-
- sys_tables = dict_table_get_low("SYS_TABLES");
- sys_index = UT_LIST_GET_FIRST(sys_tables->indexes);
- ut_ad(!dict_table_is_comp(sys_tables));
-
- tuple = dtuple_create(heap, 1);
- dfield = dtuple_get_nth_field(tuple, 0);
-
- dfield_set_data(dfield, name, strlen(name));
- dict_index_copy_types(tuple, sys_index, 1);
-
- btr_pcur_open_on_user_rec(sys_index, tuple, PAGE_CUR_GE,
- BTR_SEARCH_LEAF, &pcur, &mtr);
-loop:
- rec = btr_pcur_get_rec(&pcur);
-
- if (!btr_pcur_is_on_user_rec(&pcur)) {
- /* Not found */
-
- btr_pcur_close(&pcur);
- mtr_commit(&mtr);
- mem_heap_free(heap);
-
- return(NULL);
- }
-
- field = rec_get_nth_field_old(
- rec, DICT_FLD__SYS_TABLES__NAME, &len);
-
- if (len < strlen(name)
- || memcmp(name, field, strlen(name))) {
- /* Not found */
-
- btr_pcur_close(&pcur);
- mtr_commit(&mtr);
- mem_heap_free(heap);
-
- return(NULL);
- }
-
- if (!rec_get_deleted_flag(rec, 0)) {
-
- /* We found one */
-
- char* table_name = mem_strdupl((char*) field, len);
-
- btr_pcur_close(&pcur);
- mtr_commit(&mtr);
- mem_heap_free(heap);
-
- return(table_name);
- }
-
- btr_pcur_move_to_next_user_rec(&pcur, &mtr);
-
- goto loop;
-}
-
-/********************************************************************//**
This function gets the next system table record as it scans the table.
@return the next record if found, NULL if end of scan */
static
@@ -296,7 +185,7 @@ dict_getnext_system_low(
{
rec_t* rec = NULL;
- while (!rec || rec_get_deleted_flag(rec, 0)) {
+ while (!rec) {
btr_pcur_move_to_next_user_rec(pcur, mtr);
rec = btr_pcur_get_rec(pcur);
@@ -324,24 +213,17 @@ dict_startscan_system(
btr_pcur_t* pcur, /*!< out: persistent cursor to
the record */
mtr_t* mtr, /*!< in: the mini-transaction */
- dict_system_id_t system_id) /*!< in: which system table to open */
+ dict_table_t* table) /*!< in: system table */
{
- dict_table_t* system_table;
- dict_index_t* clust_index;
- const rec_t* rec;
-
- ut_a(system_id < SYS_NUM_SYSTEM_TABLES);
-
- system_table = dict_table_get_low(SYSTEM_TABLE_NAME[system_id]);
-
- clust_index = UT_LIST_GET_FIRST(system_table->indexes);
-
- btr_pcur_open_at_index_side(true, clust_index, BTR_SEARCH_LEAF, pcur,
- true, 0, mtr);
-
- rec = dict_getnext_system_low(pcur, mtr);
-
- return(rec);
+ btr_pcur_init(pcur);
+ if (pcur->open_leaf(true, table->indexes.start, BTR_SEARCH_LEAF, mtr) !=
+ DB_SUCCESS)
+ return nullptr;
+ const rec_t *rec;
+ do
+ rec= dict_getnext_system_low(pcur, mtr);
+ while (rec && rec_get_deleted_flag(rec, 0));
+ return rec;
}
/********************************************************************//**
@@ -354,55 +236,12 @@ dict_getnext_system(
to the record */
mtr_t* mtr) /*!< in: the mini-transaction */
{
- const rec_t* rec;
-
- /* Restore the position */
- btr_pcur_restore_position(BTR_SEARCH_LEAF, pcur, mtr);
-
- /* Get the next record */
- rec = dict_getnext_system_low(pcur, mtr);
-
- return(rec);
-}
-
-/********************************************************************//**
-This function processes one SYS_TABLES record and populate the dict_table_t
-struct for the table.
-@return error message, or NULL on success */
-const char*
-dict_process_sys_tables_rec_and_mtr_commit(
-/*=======================================*/
- mem_heap_t* heap, /*!< in/out: temporary memory heap */
- const rec_t* rec, /*!< in: SYS_TABLES record */
- dict_table_t** table, /*!< out: dict_table_t to fill */
- bool cached, /*!< in: whether to load from cache */
- mtr_t* mtr) /*!< in/out: mini-transaction,
- will be committed */
-{
- ulint len;
- const char* field;
-
- field = (const char*) rec_get_nth_field_old(
- rec, DICT_FLD__SYS_TABLES__NAME, &len);
-
- ut_a(!rec_get_deleted_flag(rec, 0));
-
- ut_ad(mtr->memo_contains_page_flagged(rec, MTR_MEMO_PAGE_S_FIX));
-
- /* Get the table name */
- table_name_t table_name(mem_heap_strdupl(heap, field, len));
-
- if (cached) {
- /* Commit before load the table again */
- mtr_commit(mtr);
-
- *table = dict_table_get_low(table_name.m_name);
- return *table ? NULL : "Table not found in cache";
- } else {
- const char* err = dict_load_table_low(table_name, rec, table);
- mtr_commit(mtr);
- return err;
- }
+ const rec_t *rec=nullptr;
+ if (pcur->restore_position(BTR_SEARCH_LEAF, mtr) != btr_pcur_t::CORRUPTED)
+ do
+ rec= dict_getnext_system_low(pcur, mtr);
+ while (rec && rec_get_deleted_flag(rec, 0));
+ return rec;
}
/********************************************************************//**
@@ -418,19 +257,16 @@ dict_process_sys_indexes_rec(
dict_index_t* index, /*!< out: index to be filled */
table_id_t* table_id) /*!< out: index table id */
{
- const char* err_msg;
- byte* buf;
+ byte buf[8];
- ut_d(index->is_dummy = true);
- ut_d(index->in_instant_init = false);
- buf = static_cast<byte*>(mem_heap_alloc(heap, 8));
+ ut_d(index->is_dummy = true);
+ ut_d(index->in_instant_init = false);
- /* Parse the record, and get "dict_index_t" struct filled */
- err_msg = dict_load_index_low(buf, heap, rec, FALSE, &index);
-
- *table_id = mach_read_from_8(buf);
-
- return(err_msg);
+ /* Parse the record, and get "dict_index_t" struct filled */
+ const char *err_msg= dict_load_index_low(buf, false, heap, rec,
+ nullptr, nullptr, &index);
+ *table_id= mach_read_from_8(buf);
+ return err_msg;
}
/********************************************************************//**
@@ -451,8 +287,9 @@ dict_process_sys_columns_rec(
const char* err_msg;
/* Parse the record, and get "dict_col_t" struct filled */
- err_msg = dict_load_column_low(NULL, heap, column,
- table_id, col_name, rec, nth_v_col);
+ err_msg = dict_load_column_low(NULL, 0, heap, column,
+ table_id, col_name, rec, nullptr,
+ nth_v_col);
return(err_msg);
}
@@ -471,13 +308,8 @@ dict_process_sys_virtual_rec(
ulint* pos,
ulint* base_pos)
{
- const char* err_msg;
-
- /* Parse the record, and get "dict_col_t" struct filled */
- err_msg = dict_load_virtual_low(NULL, NULL, table_id,
- pos, base_pos, rec);
-
- return(err_msg);
+ return dict_load_virtual_low(nullptr, false, nullptr, table_id,
+ pos, base_pos, rec);
}
/********************************************************************//**
@@ -495,17 +327,14 @@ dict_process_sys_fields_rec(
index_id_t* index_id, /*!< out: current index id */
index_id_t last_id) /*!< in: previous index id */
{
- byte* buf;
- byte* last_index_id;
+ byte buf[8];
+ byte last_index_id[8];
const char* err_msg;
- buf = static_cast<byte*>(mem_heap_alloc(heap, 8));
-
- last_index_id = static_cast<byte*>(mem_heap_alloc(heap, 8));
mach_write_to_8(last_index_id, last_id);
- err_msg = dict_load_field_low(buf, NULL, sys_field,
- pos, last_index_id, heap, rec);
+ err_msg = dict_load_field_low(buf, false, nullptr, sys_field,
+ pos, last_index_id, heap, nullptr, rec);
*index_id = mach_read_from_8(buf);
@@ -662,307 +491,6 @@ err_len:
return(NULL);
}
-/********************************************************************//**
-This function parses a SYS_TABLESPACES record, extracts necessary
-information from the record and returns to caller.
-@return error message, or NULL on success */
-const char*
-dict_process_sys_tablespaces(
-/*=========================*/
- mem_heap_t* heap, /*!< in/out: heap memory */
- const rec_t* rec, /*!< in: current SYS_TABLESPACES rec */
- uint32_t* space, /*!< out: tablespace identifier */
- const char** name, /*!< out: tablespace name */
- ulint* flags) /*!< out: tablespace flags */
-{
- ulint len;
- const byte* field;
-
- if (rec_get_deleted_flag(rec, 0)) {
- return("delete-marked record in SYS_TABLESPACES");
- }
-
- if (rec_get_n_fields_old(rec) != DICT_NUM_FIELDS__SYS_TABLESPACES) {
- return("wrong number of columns in SYS_TABLESPACES record");
- }
-
- field = rec_get_nth_field_old(
- rec, DICT_FLD__SYS_TABLESPACES__SPACE, &len);
- if (len != DICT_FLD_LEN_SPACE) {
-err_len:
- return("incorrect column length in SYS_TABLESPACES");
- }
- *space = mach_read_from_4(field);
-
- rec_get_nth_field_offs_old(
- rec, DICT_FLD__SYS_TABLESPACES__DB_TRX_ID, &len);
- if (len != DATA_TRX_ID_LEN && len != UNIV_SQL_NULL) {
- goto err_len;
- }
-
- rec_get_nth_field_offs_old(
- rec, DICT_FLD__SYS_TABLESPACES__DB_ROLL_PTR, &len);
- if (len != DATA_ROLL_PTR_LEN && len != UNIV_SQL_NULL) {
- goto err_len;
- }
-
- field = rec_get_nth_field_old(
- rec, DICT_FLD__SYS_TABLESPACES__NAME, &len);
- if (len == 0 || len == UNIV_SQL_NULL) {
- goto err_len;
- }
- *name = mem_heap_strdupl(heap, (char*) field, len);
-
- field = rec_get_nth_field_old(
- rec, DICT_FLD__SYS_TABLESPACES__FLAGS, &len);
- if (len != DICT_FLD_LEN_FLAGS) {
- goto err_len;
- }
- *flags = mach_read_from_4(field);
-
- return(NULL);
-}
-
-/********************************************************************//**
-This function parses a SYS_DATAFILES record, extracts necessary
-information from the record and returns it to the caller.
-@return error message, or NULL on success */
-const char*
-dict_process_sys_datafiles(
-/*=======================*/
- mem_heap_t* heap, /*!< in/out: heap memory */
- const rec_t* rec, /*!< in: current SYS_DATAFILES rec */
- uint32_t* space, /*!< out: space id */
- const char** path) /*!< out: datafile paths */
-{
- ulint len;
- const byte* field;
-
- if (rec_get_deleted_flag(rec, 0)) {
- return("delete-marked record in SYS_DATAFILES");
- }
-
- if (rec_get_n_fields_old(rec) != DICT_NUM_FIELDS__SYS_DATAFILES) {
- return("wrong number of columns in SYS_DATAFILES record");
- }
-
- field = rec_get_nth_field_old(
- rec, DICT_FLD__SYS_DATAFILES__SPACE, &len);
- if (len != DICT_FLD_LEN_SPACE) {
-err_len:
- return("incorrect column length in SYS_DATAFILES");
- }
- *space = mach_read_from_4(field);
-
- rec_get_nth_field_offs_old(
- rec, DICT_FLD__SYS_DATAFILES__DB_TRX_ID, &len);
- if (len != DATA_TRX_ID_LEN && len != UNIV_SQL_NULL) {
- goto err_len;
- }
-
- rec_get_nth_field_offs_old(
- rec, DICT_FLD__SYS_DATAFILES__DB_ROLL_PTR, &len);
- if (len != DATA_ROLL_PTR_LEN && len != UNIV_SQL_NULL) {
- goto err_len;
- }
-
- field = rec_get_nth_field_old(
- rec, DICT_FLD__SYS_DATAFILES__PATH, &len);
- if (len == 0 || len == UNIV_SQL_NULL) {
- goto err_len;
- }
- *path = mem_heap_strdupl(heap, (char*) field, len);
-
- return(NULL);
-}
-
-/** Get the first filepath from SYS_DATAFILES for a given space_id.
-@param[in] space_id Tablespace ID
-@return First filepath (caller must invoke ut_free() on it)
-@retval NULL if no SYS_DATAFILES entry was found. */
-static char*
-dict_get_first_path(
- ulint space_id)
-{
- mtr_t mtr;
- dict_table_t* sys_datafiles;
- dict_index_t* sys_index;
- dtuple_t* tuple;
- dfield_t* dfield;
- byte* buf;
- btr_pcur_t pcur;
- const rec_t* rec;
- const byte* field;
- ulint len;
- char* filepath = NULL;
- mem_heap_t* heap = mem_heap_create(1024);
-
- ut_ad(mutex_own(&dict_sys.mutex));
-
- mtr_start(&mtr);
-
- sys_datafiles = dict_table_get_low("SYS_DATAFILES");
- sys_index = UT_LIST_GET_FIRST(sys_datafiles->indexes);
-
- ut_ad(!dict_table_is_comp(sys_datafiles));
- ut_ad(name_of_col_is(sys_datafiles, sys_index,
- DICT_FLD__SYS_DATAFILES__SPACE, "SPACE"));
- ut_ad(name_of_col_is(sys_datafiles, sys_index,
- DICT_FLD__SYS_DATAFILES__PATH, "PATH"));
-
- tuple = dtuple_create(heap, 1);
- dfield = dtuple_get_nth_field(tuple, DICT_FLD__SYS_DATAFILES__SPACE);
-
- buf = static_cast<byte*>(mem_heap_alloc(heap, 4));
- mach_write_to_4(buf, space_id);
-
- dfield_set_data(dfield, buf, 4);
- dict_index_copy_types(tuple, sys_index, 1);
-
- btr_pcur_open_on_user_rec(sys_index, tuple, PAGE_CUR_GE,
- BTR_SEARCH_LEAF, &pcur, &mtr);
-
- rec = btr_pcur_get_rec(&pcur);
-
- /* Get the filepath from this SYS_DATAFILES record. */
- if (btr_pcur_is_on_user_rec(&pcur)) {
- field = rec_get_nth_field_old(
- rec, DICT_FLD__SYS_DATAFILES__SPACE, &len);
- ut_a(len == 4);
-
- if (space_id == mach_read_from_4(field)) {
- /* A record for this space ID was found. */
- field = rec_get_nth_field_old(
- rec, DICT_FLD__SYS_DATAFILES__PATH, &len);
-
- ut_ad(len > 0);
- ut_ad(len < OS_FILE_MAX_PATH);
-
- if (len > 0 && len < UNIV_SQL_NULL) {
- filepath = mem_strdupl(
- reinterpret_cast<const char*>(field),
- len);
- ut_ad(filepath != NULL);
-
- /* The dictionary may have been written on
- another OS. */
- os_normalize_path(filepath);
- }
- }
- }
-
- btr_pcur_close(&pcur);
- mtr_commit(&mtr);
- mem_heap_free(heap);
-
- return(filepath);
-}
-
-/** Update the record for space_id in SYS_TABLESPACES to this filepath.
-@param[in] space_id Tablespace ID
-@param[in] filepath Tablespace filepath
-@return DB_SUCCESS if OK, dberr_t if the insert failed */
-dberr_t
-dict_update_filepath(
- ulint space_id,
- const char* filepath)
-{
- if (!srv_sys_tablespaces_open) {
- /* Startup procedure is not yet ready for updates. */
- return(DB_SUCCESS);
- }
-
- dberr_t err = DB_SUCCESS;
- trx_t* trx;
-
- ut_d(dict_sys.assert_locked());
-
- trx = trx_create();
- trx->op_info = "update filepath";
- trx->dict_operation_lock_mode = RW_X_LATCH;
- trx_start_for_ddl(trx, TRX_DICT_OP_INDEX);
-
- pars_info_t* info = pars_info_create();
-
- pars_info_add_int4_literal(info, "space", space_id);
- pars_info_add_str_literal(info, "path", filepath);
-
- err = que_eval_sql(info,
- "PROCEDURE UPDATE_FILEPATH () IS\n"
- "BEGIN\n"
- "UPDATE SYS_DATAFILES"
- " SET PATH = :path\n"
- " WHERE SPACE = :space;\n"
- "END;\n", FALSE, trx);
-
- trx_commit_for_mysql(trx);
- trx->dict_operation_lock_mode = 0;
- trx->free();
-
- if (UNIV_LIKELY(err == DB_SUCCESS)) {
- /* We just updated SYS_DATAFILES due to the contents in
- a link file. Make a note that we did this. */
- ib::info() << "The InnoDB data dictionary table SYS_DATAFILES"
- " for tablespace ID " << space_id
- << " was updated to use file " << filepath << ".";
- } else {
- ib::warn() << "Error occurred while updating InnoDB data"
- " dictionary table SYS_DATAFILES for tablespace ID "
- << space_id << " to file " << filepath << ": "
- << err << ".";
- }
-
- return(err);
-}
-
-/** Replace records in SYS_TABLESPACES and SYS_DATAFILES associated with
-the given space_id using an independent transaction.
-@param[in] space_id Tablespace ID
-@param[in] name Tablespace name
-@param[in] filepath First filepath
-@param[in] fsp_flags Tablespace flags
-@return DB_SUCCESS if OK, dberr_t if the insert failed */
-dberr_t
-dict_replace_tablespace_and_filepath(
- ulint space_id,
- const char* name,
- const char* filepath,
- ulint fsp_flags)
-{
- if (!srv_sys_tablespaces_open) {
- /* Startup procedure is not yet ready for updates.
- Return success since this will likely get updated
- later. */
- return(DB_SUCCESS);
- }
-
- dberr_t err = DB_SUCCESS;
- trx_t* trx;
-
- DBUG_EXECUTE_IF("innodb_fail_to_update_tablespace_dict",
- return(DB_INTERRUPTED););
-
- ut_d(dict_sys.assert_locked());
- ut_ad(filepath);
-
- trx = trx_create();
- trx->op_info = "insert tablespace and filepath";
- trx->dict_operation_lock_mode = RW_X_LATCH;
- trx_start_for_ddl(trx, TRX_DICT_OP_INDEX);
-
- /* A record for this space ID was not found in
- SYS_DATAFILES. Assume the record is also missing in
- SYS_TABLESPACES. Insert records into them both. */
- err = dict_replace_tablespace_in_dictionary(
- space_id, name, fsp_flags, filepath, trx);
-
- trx_commit_for_mysql(trx);
- trx->dict_operation_lock_mode = 0;
- trx->free();
-
- return(err);
-}
-
/** Check the validity of a SYS_TABLES record
Make sure the fields are the right length and that they
do not contain invalid contents.
@@ -976,11 +504,7 @@ dict_sys_tables_rec_check(
const byte* field;
ulint len;
- ut_ad(mutex_own(&dict_sys.mutex));
-
- if (rec_get_deleted_flag(rec, 0)) {
- return("delete-marked record in SYS_TABLES");
- }
+ ut_ad(dict_sys.locked());
if (rec_get_n_fields_old(rec) != DICT_NUM_FIELDS__SYS_TABLES) {
return("wrong number of columns in SYS_TABLES record");
@@ -1046,53 +570,6 @@ err_len:
return(NULL);
}
-/** Read and return the contents of a SYS_TABLESPACES record.
-@param[in] rec A record of SYS_TABLESPACES
-@param[out] id Pointer to the space_id for this table
-@param[in,out] name Buffer for Tablespace Name of length NAME_LEN
-@param[out] flags Pointer to tablespace flags
-@return true if the record was read correctly, false if not. */
-bool
-dict_sys_tablespaces_rec_read(
- const rec_t* rec,
- ulint* id,
- char* name,
- ulint* flags)
-{
- const byte* field;
- ulint len;
-
- field = rec_get_nth_field_old(
- rec, DICT_FLD__SYS_TABLESPACES__SPACE, &len);
- if (len != DICT_FLD_LEN_SPACE) {
- ib::error() << "Wrong field length in SYS_TABLESPACES.SPACE: "
- << len;
- return(false);
- }
- *id = mach_read_from_4(field);
-
- field = rec_get_nth_field_old(
- rec, DICT_FLD__SYS_TABLESPACES__NAME, &len);
- if (len == 0 || len == UNIV_SQL_NULL) {
- ib::error() << "Wrong field length in SYS_TABLESPACES.NAME: "
- << len;
- return(false);
- }
- strncpy(name, reinterpret_cast<const char*>(field), NAME_LEN);
-
- /* read the 4 byte flags from the TYPE field */
- field = rec_get_nth_field_old(
- rec, DICT_FLD__SYS_TABLESPACES__FLAGS, &len);
- if (len != 4) {
- ib::error() << "Wrong field length in SYS_TABLESPACES.FLAGS: "
- << len;
- return(false);
- }
- *flags = mach_read_from_4(field);
-
- return(true);
-}
-
/** Check if SYS_TABLES.TYPE is valid
@param[in] type SYS_TABLES.TYPE
@param[in] not_redundant whether ROW_FORMAT=REDUNDANT is not used
@@ -1155,30 +632,80 @@ dict_sys_tables_type_to_tf(ulint type, bool not_redundant)
return(flags);
}
+/** Outcome of dict_sys_tables_rec_read() */
+enum table_read_status { READ_OK= 0, READ_ERROR, READ_NOT_FOUND };
+
/** Read and return 5 integer fields from a SYS_TABLES record.
@param[in] rec A record of SYS_TABLES
-@param[in] name Table Name, the same as SYS_TABLES.NAME
+@param[in] uncommitted true=use READ UNCOMMITTED, false=READ COMMITTED
+@param[in] mtr mini-transaction
@param[out] table_id Pointer to the table_id for this table
@param[out] space_id Pointer to the space_id for this table
@param[out] n_cols Pointer to number of columns for this table.
@param[out] flags Pointer to table flags
@param[out] flags2 Pointer to table flags2
-@return true if the record was read correctly, false if not. */
+@param[out] trx_id DB_TRX_ID of the committed SYS_TABLES record,
+ or nullptr to perform READ UNCOMMITTED
+@return whether the record was read correctly */
MY_ATTRIBUTE((warn_unused_result))
static
-bool
+table_read_status
dict_sys_tables_rec_read(
const rec_t* rec,
- const table_name_t& table_name,
+ bool uncommitted,
+ mtr_t* mtr,
table_id_t* table_id,
ulint* space_id,
ulint* n_cols,
ulint* flags,
- ulint* flags2)
+ ulint* flags2,
+ trx_id_t* trx_id)
{
const byte* field;
ulint len;
ulint type;
+ mem_heap_t* heap = nullptr;
+
+ field = rec_get_nth_field_old(
+ rec, DICT_FLD__SYS_TABLES__DB_TRX_ID, &len);
+ ut_ad(len == 6 || len == UNIV_SQL_NULL);
+ trx_id_t id = len == 6 ? trx_read_trx_id(field) : 0;
+ if (id && !uncommitted && trx_sys.find(nullptr, id, false)) {
+ const auto savepoint = mtr->get_savepoint();
+ heap = mem_heap_create(1024);
+ dict_index_t* index = UT_LIST_GET_FIRST(
+ dict_sys.sys_tables->indexes);
+ rec_offs* offsets = rec_get_offsets(
+ rec, index, nullptr, true, ULINT_UNDEFINED, &heap);
+ const rec_t* old_vers;
+ row_vers_build_for_semi_consistent_read(
+ nullptr, rec, mtr, index, &offsets, &heap,
+ heap, &old_vers, nullptr);
+ mtr->rollback_to_savepoint(savepoint);
+ rec = old_vers;
+ if (!rec) {
+ mem_heap_free(heap);
+ return READ_NOT_FOUND;
+ }
+ field = rec_get_nth_field_old(
+ rec, DICT_FLD__SYS_TABLES__DB_TRX_ID, &len);
+ if (UNIV_UNLIKELY(len != 6)) {
+ mem_heap_free(heap);
+ return READ_ERROR;
+ }
+ id = trx_read_trx_id(field);
+ }
+
+ if (rec_get_deleted_flag(rec, 0)) {
+ ut_ad(id);
+ if (trx_id) {
+ return READ_NOT_FOUND;
+ }
+ }
+
+ if (trx_id) {
+ *trx_id = id;
+ }
field = rec_get_nth_field_old(
rec, DICT_FLD__SYS_TABLES__ID, &len);
@@ -1283,11 +810,17 @@ dict_sys_tables_rec_read(
const bool not_redundant = 0 != (*n_cols & DICT_N_COLS_COMPACT);
if (!dict_sys_tables_type_valid(type, not_redundant)) {
- ib::error() << "Table " << table_name << " in InnoDB"
- " data dictionary contains invalid flags."
- " SYS_TABLES.TYPE=" << type <<
- " SYS_TABLES.N_COLS=" << *n_cols;
- return(false);
+ sql_print_error("InnoDB: Table %.*s in InnoDB"
+ " data dictionary contains invalid flags."
+ " SYS_TABLES.TYPE=" ULINTPF
+ " SYS_TABLES.N_COLS=" ULINTPF,
+ int(rec_get_field_start_offs(rec, 1)), rec,
+ type, *n_cols);
+err_exit:
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+ return READ_ERROR;
}
*flags = dict_sys_tables_type_to_tf(type, not_redundant);
@@ -1307,11 +840,15 @@ dict_sys_tables_rec_read(
*flags2 = mach_read_from_4(field);
if (!dict_tf2_is_valid(*flags, *flags2)) {
- ib::error() << "Table " << table_name << " in InnoDB"
- " data dictionary contains invalid flags."
- " SYS_TABLES.TYPE=" << type
- << " SYS_TABLES.MIX_LEN=" << *flags2;
- return(false);
+ sql_print_error("InnoDB: Table %.*s in InnoDB"
+ " data dictionary"
+ " contains invalid flags."
+ " SYS_TABLES.TYPE=" ULINTPF
+ " SYS_TABLES.MIX_LEN=" ULINTPF,
+ int(rec_get_field_start_offs(rec, 1)),
+ rec,
+ type, *flags2);
+ goto err_exit;
}
/* DICT_TF2_FTS will be set when indexes are being loaded */
@@ -1323,41 +860,39 @@ dict_sys_tables_rec_read(
*flags2 = 0;
}
- return(true);
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+
+ return READ_OK;
}
-/** Load and check each non-predefined tablespace mentioned in SYS_TABLES.
-Search SYS_TABLES and check each tablespace mentioned that has not
-already been added to the fil_system. If it is valid, add it to the
-file_system list.
-@return the highest space ID found. */
-static ulint dict_check_sys_tables()
+/** Check each tablespace found in the data dictionary.
+Then look at each table defined in SYS_TABLES that has a space_id > 0
+to find all the file-per-table tablespaces.
+
+In a crash recovery we already have some tablespace objects created from
+processing the REDO log. We will compare the
+space_id information in the data dictionary to what we find in the
+tablespace file. In addition, more validation will be done if recovery
+was needed and force_recovery is not set.
+
+We also scan the biggest space id, and store it to fil_system. */
+void dict_check_tablespaces_and_store_max_id()
{
ulint max_space_id = 0;
btr_pcur_t pcur;
- const rec_t* rec;
mtr_t mtr;
- DBUG_ENTER("dict_check_sys_tables");
-
- ut_d(dict_sys.assert_locked());
+ DBUG_ENTER("dict_check_tablespaces_and_store_max_id");
- mtr_start(&mtr);
+ mtr.start();
- /* Before traversing SYS_TABLES, let's make sure we have
- SYS_TABLESPACES and SYS_DATAFILES loaded. */
- dict_table_t* sys_tablespaces;
- dict_table_t* sys_datafiles;
- sys_tablespaces = dict_table_get_low("SYS_TABLESPACES");
- ut_a(sys_tablespaces != NULL);
- sys_datafiles = dict_table_get_low("SYS_DATAFILES");
- ut_a(sys_datafiles != NULL);
+ dict_sys.lock(SRW_LOCK_CALL);
- for (rec = dict_startscan_system(&pcur, &mtr, SYS_TABLES);
- rec != NULL;
- mtr.commit(), mtr.start(),
- rec = dict_getnext_system(&pcur, &mtr)) {
- const byte* field;
+ for (const rec_t *rec = dict_startscan_system(&pcur, &mtr,
+ dict_sys.sys_tables);
+ rec; rec = dict_getnext_system_low(&pcur, &mtr)) {
ulint len;
table_id_t table_id;
ulint space_id;
@@ -1367,39 +902,31 @@ static ulint dict_check_sys_tables()
/* If a table record is not useable, ignore it and continue
on to the next record. Error messages were logged. */
- if (dict_sys_tables_rec_check(rec) != NULL) {
+ if (dict_sys_tables_rec_check(rec)) {
continue;
}
- /* Copy the table name from rec */
- field = rec_get_nth_field_old(
- rec, DICT_FLD__SYS_TABLES__NAME, &len);
+ const char *field = reinterpret_cast<const char*>(
+ rec_get_nth_field_old(rec, DICT_FLD__SYS_TABLES__NAME,
+ &len));
- table_name_t table_name(mem_strdupl((char*) field, len));
DBUG_PRINT("dict_check_sys_tables",
- ("name: %p, '%s'", table_name.m_name,
- table_name.m_name));
+ ("name: %*.s", static_cast<int>(len), field));
- if (!dict_sys_tables_rec_read(rec, table_name,
- &table_id, &space_id,
- &n_cols, &flags, &flags2)
+ if (dict_sys_tables_rec_read(rec, false,
+ &mtr, &table_id, &space_id,
+ &n_cols, &flags, &flags2, nullptr)
+ != READ_OK
|| space_id == TRX_SYS_SPACE) {
-next:
- ut_free(table_name.m_name);
continue;
}
- if (strstr(table_name.m_name, "/" TEMP_FILE_PREFIX "-")) {
- /* This table will be dropped by
- row_mysql_drop_garbage_tables().
- We do not care if the file exists. */
- goto next;
- }
-
if (flags2 & DICT_TF2_DISCARDED) {
- ib::info() << "Ignoring tablespace for " << table_name
- << " because the DISCARD flag is set .";
- goto next;
+ sql_print_information("InnoDB: Ignoring tablespace"
+ " for %.*s because "
+ "the DISCARD flag is set",
+ static_cast<int>(len), field);
+ continue;
}
/* For tables or partitions using .ibd files, the flag
@@ -1410,118 +937,78 @@ next:
newly created or rebuilt tables or partitions, but
will otherwise ignore the flag. */
- /* Now that we have the proper name for this tablespace,
- look to see if it is already in the tablespace cache. */
- if (const fil_space_t* space
- = fil_space_for_table_exists_in_mem(
- space_id, table_name.m_name, flags)) {
- /* Recovery can open a datafile that does not
- match SYS_DATAFILES. If they don't match, update
- SYS_DATAFILES. */
- char *dict_path = dict_get_first_path(space_id);
- const char *fil_path = space->chain.start->name;
- if (dict_path
- && strcmp(dict_path, fil_path)) {
- dict_update_filepath(space_id, fil_path);
- }
- ut_free(dict_path);
- ut_free(table_name.m_name);
+ if (fil_space_for_table_exists_in_mem(space_id, flags)) {
continue;
}
- /* Set the expected filepath from the data dictionary.
- If the file is found elsewhere (from an ISL or the default
- location) or this path is the same file but looks different,
- fil_ibd_open() will update the dictionary with what is
- opened. */
- char* filepath = dict_get_first_path(space_id);
+ const span<const char> name{field, len};
+
+ char* filepath = fil_make_filepath(nullptr, name,
+ IBD, false);
+
+ const bool not_dropped{!rec_get_deleted_flag(rec, 0)};
/* Check that the .ibd file exists. */
- if (!fil_ibd_open(
- false,
- !srv_read_only_mode && srv_log_file_size != 0,
- FIL_TYPE_TABLESPACE,
- space_id, dict_tf_to_fsp_flags(flags),
- table_name, filepath)) {
- ib::warn() << "Ignoring tablespace for "
- << table_name
- << " because it could not be opened.";
+ if (fil_ibd_open(not_dropped, FIL_TYPE_TABLESPACE,
+ space_id, dict_tf_to_fsp_flags(flags),
+ name, filepath)) {
+ } else if (!not_dropped) {
+ } else if (srv_operation == SRV_OPERATION_NORMAL
+ && srv_start_after_restore
+ && srv_force_recovery < SRV_FORCE_NO_BACKGROUND
+ && dict_table_t::is_temporary_name(filepath)) {
+ /* Mariabackup will not copy files whose
+ names start with #sql-. This table ought to
+ be dropped by drop_garbage_tables_after_restore()
+ a little later. */
+ } else {
+ sql_print_warning("InnoDB: Ignoring tablespace for"
+ " %.*s because it"
+ " could not be opened.",
+ static_cast<int>(len), field);
}
max_space_id = ut_max(max_space_id, space_id);
- ut_free(table_name.m_name);
ut_free(filepath);
}
- mtr_commit(&mtr);
-
- DBUG_RETURN(max_space_id);
-}
-
-/** Check each tablespace found in the data dictionary.
-Then look at each table defined in SYS_TABLES that has a space_id > 0
-to find all the file-per-table tablespaces.
-
-In a crash recovery we already have some tablespace objects created from
-processing the REDO log. Any other tablespace in SYS_TABLESPACES not
-previously used in recovery will be opened here. We will compare the
-space_id information in the data dictionary to what we find in the
-tablespace file. In addition, more validation will be done if recovery
-was needed and force_recovery is not set.
-
-We also scan the biggest space id, and store it to fil_system. */
-void dict_check_tablespaces_and_store_max_id()
-{
- mtr_t mtr;
-
- DBUG_ENTER("dict_check_tablespaces_and_store_max_id");
-
- dict_sys_lock();
-
- /* Initialize the max space_id from sys header */
- mtr.start();
- ulint max_space_id = mach_read_from_4(DICT_HDR_MAX_SPACE_ID
- + DICT_HDR
- + dict_hdr_get(&mtr)->frame);
mtr.commit();
fil_set_max_space_id_if_bigger(max_space_id);
- /* Open all tablespaces referenced in SYS_TABLES.
- This will update SYS_TABLESPACES and SYS_DATAFILES if it
- finds any file-per-table tablespaces not already there. */
- max_space_id = dict_check_sys_tables();
- fil_set_max_space_id_if_bigger(max_space_id);
-
- dict_sys_unlock();
+ dict_sys.unlock();
DBUG_VOID_RETURN;
}
/** Error message for a delete-marked record in dict_load_column_low() */
-static const char* dict_load_column_del = "delete-marked record in SYS_COLUMN";
+static const char *dict_load_column_del= "delete-marked record in SYS_COLUMNS";
+/** Error message for a missing record in dict_load_column_low() */
+static const char *dict_load_column_none= "SYS_COLUMNS record not found";
+/** Message for incomplete instant ADD/DROP in dict_load_column_low() */
+static const char *dict_load_column_instant= "incomplete instant ADD/DROP";
/** Load a table column definition from a SYS_COLUMNS record to dict_table_t.
-@return error message
-@retval NULL on success */
-static
-const char*
-dict_load_column_low(
- dict_table_t* table, /*!< in/out: table, could be NULL
- if we just populate a dict_column_t
- struct with information from
- a SYS_COLUMNS record */
- mem_heap_t* heap, /*!< in/out: memory heap
- for temporary storage */
- dict_col_t* column, /*!< out: dict_column_t to fill,
- or NULL if table != NULL */
- table_id_t* table_id, /*!< out: table id */
- const char** col_name, /*!< out: column name */
- const rec_t* rec, /*!< in: SYS_COLUMNS record */
- ulint* nth_v_col) /*!< out: if not NULL, this
- records the "n" of "nth" virtual
- column */
+@param table table, or nullptr if the output will be in column
+@param use_uncommitted 0=READ COMMITTED, 1=detect, 2=READ UNCOMMITTED
+@param heap memory heap for temporary storage
+@param column pointer to output buffer, or nullptr if table!=nullptr
+@param table_id table identifier
+@param col_name column name
+@param rec SYS_COLUMNS record
+@param mtr mini-transaction
+@param nth_v_col nullptr, or pointer to a counter of virtual columns
+@return error message
+@retval nullptr on success */
+static const char *dict_load_column_low(dict_table_t *table,
+ unsigned use_uncommitted,
+ mem_heap_t *heap, dict_col_t *column,
+ table_id_t *table_id,
+ const char **col_name,
+ const rec_t *rec,
+ mtr_t *mtr,
+ ulint *nth_v_col)
{
char* name;
const byte* field;
@@ -1534,10 +1021,6 @@ dict_load_column_low(
ut_ad(!table == !!column);
- if (rec_get_deleted_flag(rec, 0)) {
- return(dict_load_column_del);
- }
-
if (rec_get_n_fields_old(rec) != DICT_NUM_FIELDS__SYS_COLUMNS) {
return("wrong number of columns in SYS_COLUMNS record");
}
@@ -1552,7 +1035,7 @@ err_len:
if (table_id) {
*table_id = mach_read_from_8(field);
} else if (table->id != mach_read_from_8(field)) {
- return("SYS_COLUMNS.TABLE_ID mismatch");
+ return dict_load_column_none;
}
field = rec_get_nth_field_old(
@@ -1563,11 +1046,41 @@ err_len:
pos = mach_read_from_4(field);
- rec_get_nth_field_offs_old(
+ field = rec_get_nth_field_old(
rec, DICT_FLD__SYS_COLUMNS__DB_TRX_ID, &len);
if (len != DATA_TRX_ID_LEN && len != UNIV_SQL_NULL) {
goto err_len;
}
+
+ const trx_id_t trx_id = trx_read_trx_id(field);
+
+ if (trx_id && mtr && use_uncommitted < 2
+ && trx_sys.find(nullptr, trx_id, false)) {
+ if (use_uncommitted) {
+ return dict_load_column_instant;
+ }
+ const auto savepoint = mtr->get_savepoint();
+ dict_index_t* index = UT_LIST_GET_FIRST(
+ dict_sys.sys_columns->indexes);
+ rec_offs* offsets = rec_get_offsets(
+ rec, index, nullptr, true, ULINT_UNDEFINED, &heap);
+ const rec_t* old_vers;
+ row_vers_build_for_semi_consistent_read(
+ nullptr, rec, mtr, index, &offsets, &heap,
+ heap, &old_vers, nullptr);
+ mtr->rollback_to_savepoint(savepoint);
+ rec = old_vers;
+ if (!old_vers) {
+ return dict_load_column_none;
+ }
+ ut_ad(!rec_get_deleted_flag(rec, 0));
+ }
+
+ if (rec_get_deleted_flag(rec, 0)) {
+ ut_ad(trx_id);
+ return dict_load_column_del;
+ }
+
rec_get_nth_field_offs_old(
rec, DICT_FLD__SYS_COLUMNS__DB_ROLL_PTR, &len);
if (len != DATA_ROLL_PTR_LEN && len != UNIV_SQL_NULL) {
@@ -1580,11 +1093,7 @@ err_len:
goto err_len;
}
- name = mem_heap_strdupl(heap, (const char*) field, len);
-
- if (col_name) {
- *col_name = name;
- }
+ *col_name = name = mem_heap_strdupl(heap, (const char*) field, len);
field = rec_get_nth_field_old(
rec, DICT_FLD__SYS_COLUMNS__MTYPE, &len);
@@ -1654,6 +1163,10 @@ err_len:
dict_mem_table_add_col(table, heap, name, mtype,
prtype, col_len);
}
+
+ if (trx_id > table->def_trx_id) {
+ table->def_trx_id = trx_id;
+ }
} else {
dict_mem_fill_column_struct(column, pos, mtype,
prtype, col_len);
@@ -1668,11 +1181,13 @@ err_len:
}
/** Error message for a delete-marked record in dict_load_virtual_low() */
-static const char* dict_load_virtual_del = "delete-marked record in SYS_VIRTUAL";
+static const char *dict_load_virtual_del= "delete-marked record in SYS_VIRTUAL";
+static const char *dict_load_virtual_none= "SYS_VIRTUAL record not found";
/** Load a virtual column "mapping" (to base columns) information
from a SYS_VIRTUAL record
@param[in,out] table table
+@param[in] uncommitted false=READ COMMITTED, true=READ UNCOMMITTED
@param[in,out] column mapped base column's dict_column_t
@param[in,out] table_id table id
@param[in,out] pos virtual column position
@@ -1684,6 +1199,7 @@ static
const char*
dict_load_virtual_low(
dict_table_t* table,
+ bool uncommitted,
dict_col_t** column,
table_id_t* table_id,
ulint* pos,
@@ -1694,10 +1210,6 @@ dict_load_virtual_low(
ulint len;
ulint base;
- if (rec_get_deleted_flag(rec, 0)) {
- return(dict_load_virtual_del);
- }
-
if (rec_get_n_fields_old(rec) != DICT_NUM_FIELDS__SYS_VIRTUAL) {
return("wrong number of columns in SYS_VIRTUAL record");
}
@@ -1712,7 +1224,7 @@ err_len:
if (table_id != NULL) {
*table_id = mach_read_from_8(field);
} else if (table->id != mach_read_from_8(field)) {
- return("SYS_VIRTUAL.TABLE_ID mismatch");
+ return dict_load_virtual_none;
}
field = rec_get_nth_field_old(
@@ -1737,7 +1249,7 @@ err_len:
*base_pos = base;
}
- rec_get_nth_field_offs_old(
+ field = rec_get_nth_field_old(
rec, DICT_FLD__SYS_VIRTUAL__DB_TRX_ID, &len);
if (len != DATA_TRX_ID_LEN && len != UNIV_SQL_NULL) {
goto err_len;
@@ -1749,6 +1261,18 @@ err_len:
goto err_len;
}
+ const trx_id_t trx_id = trx_read_trx_id(field);
+
+ if (trx_id && column && !uncommitted
+ && trx_sys.find(nullptr, trx_id, false)) {
+ if (!rec_get_deleted_flag(rec, 0)) {
+ return dict_load_virtual_none;
+ }
+ } else if (rec_get_deleted_flag(rec, 0)) {
+ ut_ad(trx_id != 0);
+ return dict_load_virtual_del;
+ }
+
if (column != NULL) {
*column = dict_table_get_nth_col(table, base);
}
@@ -1756,74 +1280,85 @@ err_len:
return(NULL);
}
-/********************************************************************//**
-Loads definitions for table columns. */
-static
-void
-dict_load_columns(
-/*==============*/
- dict_table_t* table, /*!< in/out: table */
- mem_heap_t* heap) /*!< in/out: memory heap
- for temporary storage */
+/** Load the definitions for table columns.
+@param table table
+@param use_uncommitted 0=READ COMMITTED, 1=detect, 2=READ UNCOMMITTED
+@param heap memory heap for temporary storage
+@return error code
+@retval DB_SUCCESS on success
+@retval DB_SUCCESS_LOCKED_REC on success if use_uncommitted=1
+and instant ADD/DROP/reorder was detected */
+MY_ATTRIBUTE((nonnull, warn_unused_result))
+static dberr_t dict_load_columns(dict_table_t *table, unsigned use_uncommitted,
+ mem_heap_t *heap)
{
- dict_table_t* sys_columns;
- dict_index_t* sys_index;
btr_pcur_t pcur;
- dtuple_t* tuple;
- dfield_t* dfield;
- const rec_t* rec;
- byte* buf;
- ulint i;
mtr_t mtr;
ulint n_skipped = 0;
- ut_ad(mutex_own(&dict_sys.mutex));
+ ut_ad(dict_sys.locked());
- mtr_start(&mtr);
+ mtr.start();
- sys_columns = dict_table_get_low("SYS_COLUMNS");
- sys_index = UT_LIST_GET_FIRST(sys_columns->indexes);
- ut_ad(!dict_table_is_comp(sys_columns));
+ dict_index_t* sys_index = dict_sys.sys_columns->indexes.start;
+ ut_ad(!dict_sys.sys_columns->not_redundant());
- ut_ad(name_of_col_is(sys_columns, sys_index,
+ ut_ad(name_of_col_is(dict_sys.sys_columns, sys_index,
DICT_FLD__SYS_COLUMNS__NAME, "NAME"));
- ut_ad(name_of_col_is(sys_columns, sys_index,
+ ut_ad(name_of_col_is(dict_sys.sys_columns, sys_index,
DICT_FLD__SYS_COLUMNS__PREC, "PREC"));
- tuple = dtuple_create(heap, 1);
- dfield = dtuple_get_nth_field(tuple, 0);
-
- buf = static_cast<byte*>(mem_heap_alloc(heap, 8));
- mach_write_to_8(buf, table->id);
-
- dfield_set_data(dfield, buf, 8);
- dict_index_copy_types(tuple, sys_index, 1);
-
- btr_pcur_open_on_user_rec(sys_index, tuple, PAGE_CUR_GE,
- BTR_SEARCH_LEAF, &pcur, &mtr);
+ dfield_t dfield;
+ dtuple_t tuple{
+ 0,1,1,&dfield,0,nullptr
+#ifdef UNIV_DEBUG
+ , DATA_TUPLE_MAGIC_N
+#endif
+ };
+ byte table_id[8];
+ mach_write_to_8(table_id, table->id);
+ dfield_set_data(&dfield, table_id, 8);
+ dict_index_copy_types(&tuple, sys_index, 1);
+ pcur.btr_cur.page_cur.index = sys_index;
+
+ dberr_t err = btr_pcur_open_on_user_rec(&tuple,
+ BTR_SEARCH_LEAF, &pcur, &mtr);
+ if (err != DB_SUCCESS) {
+ goto func_exit;
+ }
ut_ad(table->n_t_cols == static_cast<ulint>(
table->n_cols) + static_cast<ulint>(table->n_v_cols));
- for (i = 0;
+ for (ulint i = 0;
i + DATA_N_SYS_COLS < table->n_t_cols + n_skipped;
i++) {
const char* err_msg;
const char* name = NULL;
ulint nth_v_col = ULINT_UNDEFINED;
+ const rec_t* rec = btr_pcur_get_rec(&pcur);
- rec = btr_pcur_get_rec(&pcur);
+ err_msg = btr_pcur_is_on_user_rec(&pcur)
+ ? dict_load_column_low(table, use_uncommitted,
+ heap, NULL, NULL,
+ &name, rec, &mtr, &nth_v_col)
+ : dict_load_column_none;
- ut_a(btr_pcur_is_on_user_rec(&pcur));
-
- err_msg = dict_load_column_low(table, heap, NULL, NULL,
- &name, rec, &nth_v_col);
-
- if (err_msg == dict_load_column_del) {
+ if (!err_msg) {
+ } else if (err_msg == dict_load_column_del) {
n_skipped++;
goto next_rec;
- } else if (err_msg) {
- ib::fatal() << err_msg;
+ } else if (err_msg == dict_load_column_instant) {
+ err = DB_SUCCESS_LOCKED_REC;
+ goto func_exit;
+ } else if (err_msg == dict_load_column_none
+ && strstr(table->name.m_name,
+ "/" TEMP_FILE_PREFIX_INNODB)) {
+ break;
+ } else {
+ ib::error() << err_msg << " for table " << table->name;
+ err = DB_CORRUPTION;
+ goto func_exit;
}
/* Note: Currently we have one DOC_ID column that is
@@ -1866,125 +1401,113 @@ next_rec:
btr_pcur_move_to_next_user_rec(&pcur, &mtr);
}
- btr_pcur_close(&pcur);
- mtr_commit(&mtr);
+func_exit:
+ mtr.commit();
+ return err;
}
/** Loads SYS_VIRTUAL info for one virtual column
-@param[in,out] table table
-@param[in] nth_v_col virtual column sequence num
-@param[in,out] v_col virtual column
-@param[in,out] heap memory heap
-*/
+@param table table definition
+@param uncommitted false=READ COMMITTED, true=READ UNCOMMITTED
+@param nth_v_col virtual column position */
+MY_ATTRIBUTE((nonnull, warn_unused_result))
static
-void
-dict_load_virtual_one_col(
- dict_table_t* table,
- ulint nth_v_col,
- dict_v_col_t* v_col,
- mem_heap_t* heap)
+dberr_t
+dict_load_virtual_col(dict_table_t *table, bool uncommitted, ulint nth_v_col)
{
- dict_table_t* sys_virtual;
- dict_index_t* sys_virtual_index;
- btr_pcur_t pcur;
- dtuple_t* tuple;
- dfield_t* dfield;
- const rec_t* rec;
- byte* buf;
- ulint i = 0;
- mtr_t mtr;
- ulint skipped = 0;
-
- ut_ad(mutex_own(&dict_sys.mutex));
+ const dict_v_col_t* v_col = dict_table_get_nth_v_col(table, nth_v_col);
if (v_col->num_base == 0) {
- return;
+ return DB_SUCCESS;
}
- mtr_start(&mtr);
-
- sys_virtual = dict_table_get_low("SYS_VIRTUAL");
- sys_virtual_index = UT_LIST_GET_FIRST(sys_virtual->indexes);
- ut_ad(!dict_table_is_comp(sys_virtual));
-
- ut_ad(name_of_col_is(sys_virtual, sys_virtual_index,
- DICT_FLD__SYS_VIRTUAL__POS, "POS"));
-
- tuple = dtuple_create(heap, 2);
-
- /* table ID field */
- dfield = dtuple_get_nth_field(tuple, 0);
-
- buf = static_cast<byte*>(mem_heap_alloc(heap, 8));
- mach_write_to_8(buf, table->id);
-
- dfield_set_data(dfield, buf, 8);
+ dict_index_t* sys_virtual_index;
+ btr_pcur_t pcur;
+ mtr_t mtr;
- /* virtual column pos field */
- dfield = dtuple_get_nth_field(tuple, 1);
+ ut_ad(dict_sys.locked());
- buf = static_cast<byte*>(mem_heap_alloc(heap, 4));
- ulint vcol_pos = dict_create_v_col_pos(nth_v_col, v_col->m_col.ind);
- mach_write_to_4(buf, vcol_pos);
+ mtr.start();
- dfield_set_data(dfield, buf, 4);
+ sys_virtual_index = dict_sys.sys_virtual->indexes.start;
+ ut_ad(!dict_sys.sys_virtual->not_redundant());
- dict_index_copy_types(tuple, sys_virtual_index, 2);
+ ut_ad(name_of_col_is(dict_sys.sys_virtual, sys_virtual_index,
+ DICT_FLD__SYS_VIRTUAL__POS, "POS"));
- btr_pcur_open_on_user_rec(sys_virtual_index, tuple, PAGE_CUR_GE,
- BTR_SEARCH_LEAF, &pcur, &mtr);
+ dfield_t dfield[2];
+ dtuple_t tuple{
+ 0,2,2,dfield,0,nullptr
+#ifdef UNIV_DEBUG
+ , DATA_TUPLE_MAGIC_N
+#endif
+ };
+ byte table_id[8], vcol_pos[4];
+ mach_write_to_8(table_id, table->id);
+ dfield_set_data(&dfield[0], table_id, 8);
+ mach_write_to_4(vcol_pos,
+ dict_create_v_col_pos(nth_v_col, v_col->m_col.ind));
+ dfield_set_data(&dfield[1], vcol_pos, 4);
+
+ dict_index_copy_types(&tuple, sys_virtual_index, 2);
+ pcur.btr_cur.page_cur.index = sys_virtual_index;
+
+ dberr_t err = btr_pcur_open_on_user_rec(&tuple,
+ BTR_SEARCH_LEAF, &pcur, &mtr);
+ if (err != DB_SUCCESS) {
+ goto func_exit;
+ }
- for (i = 0; i < unsigned{v_col->num_base} + skipped; i++) {
- const char* err_msg;
+ for (ulint i = 0, skipped = 0;
+ i < unsigned{v_col->num_base} + skipped; i++) {
ulint pos;
-
- ut_ad(btr_pcur_is_on_user_rec(&pcur));
-
- rec = btr_pcur_get_rec(&pcur);
-
- ut_a(btr_pcur_is_on_user_rec(&pcur));
-
- err_msg = dict_load_virtual_low(table,
+ const char* err_msg
+ = btr_pcur_is_on_user_rec(&pcur)
+ ? dict_load_virtual_low(table, uncommitted,
&v_col->base_col[i - skipped],
NULL,
- &pos, NULL, rec);
-
- if (err_msg) {
- if (err_msg != dict_load_virtual_del) {
- ib::fatal() << err_msg;
- } else {
- skipped++;
- }
+ &pos, NULL,
+ btr_pcur_get_rec(&pcur))
+ : dict_load_virtual_none;
+
+ if (!err_msg) {
+ ut_ad(pos == mach_read_from_4(vcol_pos));
+ } else if (err_msg == dict_load_virtual_del) {
+ skipped++;
+ } else if (err_msg == dict_load_virtual_none
+ && strstr(table->name.m_name,
+ "/" TEMP_FILE_PREFIX_INNODB)) {
+ break;
} else {
- ut_ad(pos == vcol_pos);
+ ib::error() << err_msg << " for table " << table->name;
+ err = DB_CORRUPTION;
+ break;
}
btr_pcur_move_to_next_user_rec(&pcur, &mtr);
}
- btr_pcur_close(&pcur);
- mtr_commit(&mtr);
+func_exit:
+ mtr.commit();
+ return err;
}
/** Loads info from SYS_VIRTUAL for virtual columns.
-@param[in,out] table table
-@param[in] heap memory heap
-*/
-static
-void
-dict_load_virtual(
- dict_table_t* table,
- mem_heap_t* heap)
+@param table table definition
+@param uncommitted false=READ COMMITTED, true=READ UNCOMMITTED */
+MY_ATTRIBUTE((nonnull, warn_unused_result))
+static dberr_t dict_load_virtual(dict_table_t *table, bool uncommitted)
{
- for (ulint i = 0; i < table->n_v_cols; i++) {
- dict_v_col_t* v_col = dict_table_get_nth_v_col(table, i);
-
- dict_load_virtual_one_col(table, i, v_col, heap);
- }
+ for (ulint i= 0; i < table->n_v_cols; i++)
+ if (dberr_t err= dict_load_virtual_col(table, uncommitted, i))
+ return err;
+ return DB_SUCCESS;
}
/** Error message for a delete-marked record in dict_load_field_low() */
-static const char* dict_load_field_del = "delete-marked record in SYS_FIELDS";
+static const char *dict_load_field_del= "delete-marked record in SYS_FIELDS";
+
+static const char *dict_load_field_none= "SYS_FIELDS record not found";
/** Load an index field definition from a SYS_FIELDS record to dict_index_t.
@return error message
@@ -1995,6 +1518,8 @@ dict_load_field_low(
byte* index_id, /*!< in/out: index id (8 bytes)
an "in" value if index != NULL
and "out" if index == NULL */
+ bool uncommitted, /*!< in: false=READ COMMITTED,
+ true=READ UNCOMMITTED */
dict_index_t* index, /*!< in/out: index, could be NULL
if we just populate a dict_field_t
struct with information from
@@ -2005,6 +1530,7 @@ dict_load_field_low(
byte* last_index_id, /*!< in: last index id */
mem_heap_t* heap, /*!< in/out: memory heap
for temporary storage */
+ mtr_t* mtr, /*!< in/out: mini-transaction */
const rec_t* rec) /*!< in: SYS_FIELDS record */
{
const byte* field;
@@ -2015,11 +1541,8 @@ dict_load_field_low(
ulint position;
/* Either index or sys_field is supplied, not both */
- ut_a((!index) || (!sys_field));
-
- if (rec_get_deleted_flag(rec, 0)) {
- return(dict_load_field_del);
- }
+ ut_ad((!index) != (!sys_field));
+ ut_ad((!index) == !mtr);
if (rec_get_n_fields_old(rec) != DICT_NUM_FIELDS__SYS_FIELDS) {
return("wrong number of columns in SYS_FIELDS record");
@@ -2039,7 +1562,7 @@ err_len:
} else {
first_field = (index->n_def == 0);
if (memcmp(field, index_id, 8)) {
- return("SYS_FIELDS.INDEX_ID mismatch");
+ return dict_load_field_none;
}
}
@@ -2073,7 +1596,7 @@ err_len:
position = pos_and_prefix_len & 0xFFFFUL;
}
- rec_get_nth_field_offs_old(
+ field = rec_get_nth_field_old(
rec, DICT_FLD__SYS_FIELDS__DB_TRX_ID, &len);
if (len != DATA_TRX_ID_LEN && len != UNIV_SQL_NULL) {
goto err_len;
@@ -2084,6 +1607,32 @@ err_len:
goto err_len;
}
+ const trx_id_t trx_id = trx_read_trx_id(field);
+
+ if (!trx_id) {
+ ut_ad(!rec_get_deleted_flag(rec, 0));
+ } else if (!mtr || uncommitted) {
+ } else if (trx_sys.find(nullptr, trx_id, false)) {
+ const auto savepoint = mtr->get_savepoint();
+ dict_index_t* sys_field = UT_LIST_GET_FIRST(
+ dict_sys.sys_fields->indexes);
+ rec_offs* offsets = rec_get_offsets(
+ rec, sys_field, nullptr, true, ULINT_UNDEFINED, &heap);
+ const rec_t* old_vers;
+ row_vers_build_for_semi_consistent_read(
+ nullptr, rec, mtr, sys_field, &offsets, &heap,
+ heap, &old_vers, nullptr);
+ mtr->rollback_to_savepoint(savepoint);
+ rec = old_vers;
+ if (!old_vers || rec_get_deleted_flag(rec, 0)) {
+ return dict_load_field_none;
+ }
+ }
+
+ if (rec_get_deleted_flag(rec, 0)) {
+ return(dict_load_field_del);
+ }
+
field = rec_get_nth_field_old(
rec, DICT_FLD__SYS_FIELDS__COL_NAME, &len);
if (len == 0 || len == UNIV_SQL_NULL) {
@@ -2095,9 +1644,6 @@ err_len:
index, mem_heap_strdupl(heap, (const char*) field, len),
prefix_len);
} else {
- ut_a(sys_field);
- ut_a(pos);
-
sys_field->name = mem_heap_strdupl(
heap, (const char*) field, len);
sys_field->prefix_len = prefix_len & ((1U << 12) - 1);
@@ -2107,125 +1653,118 @@ err_len:
return(NULL);
}
-/********************************************************************//**
-Loads definitions for index fields.
-@return DB_SUCCESS if ok, DB_CORRUPTION if corruption */
-static
-ulint
-dict_load_fields(
-/*=============*/
- dict_index_t* index, /*!< in/out: index whose fields to load */
- mem_heap_t* heap) /*!< in: memory heap for temporary storage */
+/**
+Load definitions for index fields.
+@param index index whose fields are to be loaded
+@param uncommitted false=READ COMMITTED, true=READ UNCOMMITTED
+@param heap memory heap for temporary storage
+@return error code
+@retval DB_SUCCESS if the fields were loaded successfully */
+static dberr_t dict_load_fields(dict_index_t *index, bool uncommitted,
+ mem_heap_t *heap)
{
- dict_table_t* sys_fields;
- dict_index_t* sys_index;
btr_pcur_t pcur;
- dtuple_t* tuple;
- dfield_t* dfield;
- const rec_t* rec;
- byte* buf;
- ulint i;
mtr_t mtr;
- dberr_t error;
- ut_ad(mutex_own(&dict_sys.mutex));
+ ut_ad(dict_sys.locked());
- mtr_start(&mtr);
+ mtr.start();
- sys_fields = dict_table_get_low("SYS_FIELDS");
- sys_index = UT_LIST_GET_FIRST(sys_fields->indexes);
- ut_ad(!dict_table_is_comp(sys_fields));
- ut_ad(name_of_col_is(sys_fields, sys_index,
+ dict_index_t* sys_index = dict_sys.sys_fields->indexes.start;
+ ut_ad(!dict_sys.sys_fields->not_redundant());
+ ut_ad(name_of_col_is(dict_sys.sys_fields, sys_index,
DICT_FLD__SYS_FIELDS__COL_NAME, "COL_NAME"));
- tuple = dtuple_create(heap, 1);
- dfield = dtuple_get_nth_field(tuple, 0);
-
- buf = static_cast<byte*>(mem_heap_alloc(heap, 8));
- mach_write_to_8(buf, index->id);
-
- dfield_set_data(dfield, buf, 8);
- dict_index_copy_types(tuple, sys_index, 1);
-
- btr_pcur_open_on_user_rec(sys_index, tuple, PAGE_CUR_GE,
- BTR_SEARCH_LEAF, &pcur, &mtr);
- for (i = 0; i < index->n_fields; i++) {
- const char* err_msg;
-
- rec = btr_pcur_get_rec(&pcur);
-
- ut_a(btr_pcur_is_on_user_rec(&pcur));
-
- err_msg = dict_load_field_low(buf, index, NULL, NULL, NULL,
- heap, rec);
-
- if (err_msg == dict_load_field_del) {
+ dfield_t dfield;
+ dtuple_t tuple{
+ 0,1,1,&dfield,0,nullptr
+#ifdef UNIV_DEBUG
+ , DATA_TUPLE_MAGIC_N
+#endif
+ };
+ byte index_id[8];
+ mach_write_to_8(index_id, index->id);
+ dfield_set_data(&dfield, index_id, 8);
+ dict_index_copy_types(&tuple, sys_index, 1);
+ pcur.btr_cur.page_cur.index = sys_index;
+
+ dberr_t error = btr_pcur_open_on_user_rec(&tuple, BTR_SEARCH_LEAF,
+ &pcur, &mtr);
+ if (error != DB_SUCCESS) {
+ goto func_exit;
+ }
+
+ for (ulint i = 0; i < index->n_fields; i++) {
+ const char *err_msg = btr_pcur_is_on_user_rec(&pcur)
+ ? dict_load_field_low(index_id, uncommitted, index,
+ nullptr, nullptr, nullptr,
+ heap, &mtr,
+ btr_pcur_get_rec(&pcur))
+ : dict_load_field_none;
+
+ if (!err_msg) {
+ } else if (err_msg == dict_load_field_del) {
/* There could be delete marked records in
SYS_FIELDS because SYS_FIELDS.INDEX_ID can be
updated by ALTER TABLE ADD INDEX. */
-
- goto next_rec;
- } else if (err_msg) {
- ib::error() << err_msg;
+ } else {
+ if (err_msg != dict_load_field_none
+ || strstr(index->table->name.m_name,
+ "/" TEMP_FILE_PREFIX_INNODB)) {
+ ib::error() << err_msg << " for index "
+ << index->name
+ << " of table "
+ << index->table->name;
+ }
error = DB_CORRUPTION;
- goto func_exit;
+ break;
}
-next_rec:
+
btr_pcur_move_to_next_user_rec(&pcur, &mtr);
}
- error = DB_SUCCESS;
func_exit:
- btr_pcur_close(&pcur);
- mtr_commit(&mtr);
- return(error);
+ mtr.commit();
+ return error;
}
/** Error message for a delete-marked record in dict_load_index_low() */
-static const char* dict_load_index_del = "delete-marked record in SYS_INDEXES";
+static const char *dict_load_index_del= "delete-marked record in SYS_INDEXES";
/** Error message for table->id mismatch in dict_load_index_low() */
-static const char* dict_load_index_id_err = "SYS_INDEXES.TABLE_ID mismatch";
+static const char *dict_load_index_none= "SYS_INDEXES record not found";
/** Error message for SYS_TABLES flags mismatch in dict_load_table_low() */
-static const char* dict_load_table_flags = "incorrect flags in SYS_TABLES";
+static const char *dict_load_table_flags= "incorrect flags in SYS_TABLES";
/** Load an index definition from a SYS_INDEXES record to dict_index_t.
-If allocate=TRUE, we will create a dict_index_t structure and fill it
-accordingly. If allocated=FALSE, the dict_index_t will be supplied by
-the caller and filled with information read from the record.
@return error message
@retval NULL on success */
static
const char*
dict_load_index_low(
byte* table_id, /*!< in/out: table id (8 bytes),
- an "in" value if allocate=TRUE
- and "out" when allocate=FALSE */
+ an "in" value if mtr
+ and "out" when !mtr */
+ bool uncommitted, /*!< in: false=READ COMMITTED,
+ true=READ UNCOMMITTED */
mem_heap_t* heap, /*!< in/out: temporary memory heap */
const rec_t* rec, /*!< in: SYS_INDEXES record */
- ibool allocate, /*!< in: TRUE=allocate *index,
- FALSE=fill in a pre-allocated
- *index */
+ mtr_t* mtr, /*!< in/out: mini-transaction,
+ or nullptr if a pre-allocated
+ *index is to be filled in */
+ dict_table_t* table, /*!< in/out: table, or NULL */
dict_index_t** index) /*!< out,own: index, or NULL */
{
const byte* field;
ulint len;
- ulint name_len;
- char* name_buf;
index_id_t id;
ulint n_fields;
ulint type;
unsigned merge_threshold;
- if (allocate) {
- /* If allocate=TRUE, no dict_index_t will
- be supplied. Initialize "*index" to NULL */
+ if (mtr) {
*index = NULL;
}
- if (rec_get_deleted_flag(rec, 0)) {
- return(dict_load_index_del);
- }
-
if (rec_get_n_fields_old(rec) == DICT_NUM_FIELDS__SYS_INDEXES) {
/* MERGE_THRESHOLD exists */
field = rec_get_nth_field_old(
@@ -2257,13 +1796,13 @@ err_len:
return("incorrect column length in SYS_INDEXES");
}
- if (!allocate) {
+ if (!mtr) {
/* We are reading a SYS_INDEXES record. Copy the table_id */
memcpy(table_id, (const char*) field, 8);
} else if (memcmp(field, table_id, 8)) {
/* Caller supplied table_id, verify it is the same
id as on the index record */
- return(dict_load_index_id_err);
+ return dict_load_index_none;
}
field = rec_get_nth_field_old(
@@ -2274,7 +1813,7 @@ err_len:
id = mach_read_from_8(field);
- rec_get_nth_field_offs_old(
+ field = rec_get_nth_field_old(
rec, DICT_FLD__SYS_INDEXES__DB_TRX_ID, &len);
if (len != DATA_TRX_ID_LEN && len != UNIV_SQL_NULL) {
goto err_len;
@@ -2285,15 +1824,32 @@ err_len:
goto err_len;
}
- field = rec_get_nth_field_old(
- rec, DICT_FLD__SYS_INDEXES__NAME, &name_len);
- if (name_len == UNIV_SQL_NULL) {
- goto err_len;
+ const trx_id_t trx_id = trx_read_trx_id(field);
+ if (!trx_id) {
+ ut_ad(!rec_get_deleted_flag(rec, 0));
+ } else if (!mtr || uncommitted) {
+ } else if (trx_sys.find(nullptr, trx_id, false)) {
+ const auto savepoint = mtr->get_savepoint();
+ dict_index_t* sys_index = UT_LIST_GET_FIRST(
+ dict_sys.sys_indexes->indexes);
+ rec_offs* offsets = rec_get_offsets(
+ rec, sys_index, nullptr, true, ULINT_UNDEFINED, &heap);
+ const rec_t* old_vers;
+ row_vers_build_for_semi_consistent_read(
+ nullptr, rec, mtr, sys_index, &offsets, &heap,
+ heap, &old_vers, nullptr);
+ mtr->rollback_to_savepoint(savepoint);
+ rec = old_vers;
+ if (!old_vers || rec_get_deleted_flag(rec, 0)) {
+ return dict_load_index_none;
+ }
+ } else if (rec_get_deleted_flag(rec, 0)
+ && rec[8 + 8 + DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN]
+ != static_cast<byte>(*TEMP_INDEX_PREFIX_STR)
+ && table->def_trx_id < trx_id) {
+ table->def_trx_id = trx_id;
}
- name_buf = mem_heap_strdupl(heap, (const char*) field,
- name_len);
-
field = rec_get_nth_field_old(
rec, DICT_FLD__SYS_INDEXES__N_FIELDS, &len);
if (len != 4) {
@@ -2317,12 +1873,27 @@ err_len:
goto err_len;
}
- if (allocate) {
- *index = dict_mem_index_create(NULL, name_buf, type, n_fields);
- } else {
- ut_a(*index);
+ ut_d(const auto name_offs =)
+ rec_get_nth_field_offs_old(rec, DICT_FLD__SYS_INDEXES__NAME, &len);
+ ut_ad(name_offs == 8 + 8 + DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN);
+
+ if (len == 0 || len == UNIV_SQL_NULL) {
+ goto err_len;
+ }
- dict_mem_fill_index_struct(*index, NULL, name_buf,
+ if (rec_get_deleted_flag(rec, 0)) {
+ return dict_load_index_del;
+ }
+
+ char* name = mem_heap_strdupl(heap, reinterpret_cast<const char*>(rec)
+ + (8 + 8 + DATA_TRX_ID_LEN
+ + DATA_ROLL_PTR_LEN),
+ len);
+
+ if (mtr) {
+ *index = dict_mem_index_create(table, name, type, n_fields);
+ } else {
+ dict_mem_fill_index_struct(*index, nullptr, name,
type, n_fields);
}
@@ -2334,79 +1905,57 @@ err_len:
return(NULL);
}
-/********************************************************************//**
-Loads definitions for table indexes. Adds them to the data dictionary
-cache.
-@return DB_SUCCESS if ok, DB_CORRUPTION if corruption of dictionary
-table or DB_UNSUPPORTED if table has unknown index type */
+/** Load definitions for table indexes. Adds them to the data dictionary cache.
+@param table table definition
+@param uncommitted false=READ COMMITTED, true=READ UNCOMMITTED
+@param heap memory heap for temporary storage
+@param ignore_err errors to be ignored when loading the index definition
+@return error code
+@retval DB_SUCCESS if all indexes were successfully loaded
+@retval DB_CORRUPTION if corruption of dictionary table
+@retval DB_UNSUPPORTED if table has unknown index type */
static MY_ATTRIBUTE((nonnull))
-dberr_t
-dict_load_indexes(
-/*==============*/
- dict_table_t* table, /*!< in/out: table */
- mem_heap_t* heap, /*!< in: memory heap for temporary storage */
- dict_err_ignore_t ignore_err)
- /*!< in: error to be ignored when
- loading the index definition */
+dberr_t dict_load_indexes(dict_table_t *table, bool uncommitted,
+ mem_heap_t *heap, dict_err_ignore_t ignore_err)
{
- dict_table_t* sys_indexes;
dict_index_t* sys_index;
btr_pcur_t pcur;
- dtuple_t* tuple;
- dfield_t* dfield;
- const rec_t* rec;
- byte* buf;
+ byte table_id[8];
mtr_t mtr;
- dberr_t error = DB_SUCCESS;
- ut_ad(mutex_own(&dict_sys.mutex));
+ ut_ad(dict_sys.locked());
- mtr_start(&mtr);
+ mtr.start();
- sys_indexes = dict_table_get_low("SYS_INDEXES");
- sys_index = UT_LIST_GET_FIRST(sys_indexes->indexes);
- ut_ad(!dict_table_is_comp(sys_indexes));
- ut_ad(name_of_col_is(sys_indexes, sys_index,
+ sys_index = dict_sys.sys_indexes->indexes.start;
+ ut_ad(!dict_sys.sys_indexes->not_redundant());
+ ut_ad(name_of_col_is(dict_sys.sys_indexes, sys_index,
DICT_FLD__SYS_INDEXES__NAME, "NAME"));
- ut_ad(name_of_col_is(sys_indexes, sys_index,
+ ut_ad(name_of_col_is(dict_sys.sys_indexes, sys_index,
DICT_FLD__SYS_INDEXES__PAGE_NO, "PAGE_NO"));
- tuple = dtuple_create(heap, 1);
- dfield = dtuple_get_nth_field(tuple, 0);
-
- buf = static_cast<byte*>(mem_heap_alloc(heap, 8));
- mach_write_to_8(buf, table->id);
+ dfield_t dfield;
+ dtuple_t tuple{
+ 0,1,1,&dfield,0,nullptr
+#ifdef UNIV_DEBUG
+ , DATA_TUPLE_MAGIC_N
+#endif
+ };
+ mach_write_to_8(table_id, table->id);
+ dfield_set_data(&dfield, table_id, 8);
+ dict_index_copy_types(&tuple, sys_index, 1);
+ pcur.btr_cur.page_cur.index = sys_index;
- dfield_set_data(dfield, buf, 8);
- dict_index_copy_types(tuple, sys_index, 1);
+ dberr_t error = btr_pcur_open_on_user_rec(&tuple, BTR_SEARCH_LEAF,
+ &pcur, &mtr);
+ if (error != DB_SUCCESS) {
+ goto func_exit;
+ }
- btr_pcur_open_on_user_rec(sys_index, tuple, PAGE_CUR_GE,
- BTR_SEARCH_LEAF, &pcur, &mtr);
- for (;;) {
+ while (btr_pcur_is_on_user_rec(&pcur)) {
dict_index_t* index = NULL;
const char* err_msg;
-
- if (!btr_pcur_is_on_user_rec(&pcur)) {
-
- /* We should allow the table to open even
- without index when DICT_ERR_IGNORE_CORRUPT is set.
- DICT_ERR_IGNORE_CORRUPT is currently only set
- for drop table */
- if (dict_table_get_first_index(table) == NULL
- && !(ignore_err & DICT_ERR_IGNORE_CORRUPT)) {
- ib::warn() << "Cannot load table "
- << table->name
- << " because it has no indexes in"
- " InnoDB internal data dictionary.";
- error = DB_CORRUPTION;
- goto func_exit;
- }
-
- break;
- }
-
- rec = btr_pcur_get_rec(&pcur);
-
+ const rec_t* rec = btr_pcur_get_rec(&pcur);
if ((ignore_err & DICT_ERR_IGNORE_RECOVER_LOCK)
&& (rec_get_n_fields_old(rec)
== DICT_NUM_FIELDS__SYS_INDEXES
@@ -2430,69 +1979,45 @@ dict_load_indexes(
}
}
- err_msg = dict_load_index_low(buf, heap, rec, TRUE, &index);
- ut_ad((index == NULL && err_msg != NULL)
- || (index != NULL && err_msg == NULL));
-
- if (err_msg == dict_load_index_id_err) {
- /* TABLE_ID mismatch means that we have
- run out of index definitions for the table. */
-
- if (dict_table_get_first_index(table) == NULL
- && !(ignore_err & DICT_ERR_IGNORE_CORRUPT)) {
-
- ib::warn() << "Failed to load the"
- " clustered index for table "
- << table->name
- << " because of the following error: "
- << err_msg << "."
- " Refusing to load the rest of the"
- " indexes (if any) and the whole table"
- " altogether.";
- error = DB_CORRUPTION;
- goto func_exit;
- }
+ err_msg = dict_load_index_low(table_id, uncommitted, heap, rec,
+ &mtr, table, &index);
+ ut_ad(!index == !!err_msg);
+ if (err_msg == dict_load_index_none) {
+		/* We have run out of index definitions for
+		the table. */
break;
- } else if (err_msg == dict_load_index_del) {
- /* Skip delete-marked records. */
+ }
+
+ if (err_msg == dict_load_index_del) {
goto next_rec;
} else if (err_msg) {
ib::error() << err_msg;
- if (ignore_err & DICT_ERR_IGNORE_CORRUPT) {
+ if (ignore_err & DICT_ERR_IGNORE_INDEX) {
goto next_rec;
}
error = DB_CORRUPTION;
goto func_exit;
+ } else if (rec[8 + 8 + DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN]
+ == static_cast<byte>(*TEMP_INDEX_PREFIX_STR)) {
+ dict_mem_index_free(index);
+ goto next_rec;
+ } else {
+ const trx_id_t id = trx_read_trx_id(rec + 8 + 8);
+ if (id > table->def_trx_id) {
+ table->def_trx_id = id;
+ }
}
ut_ad(index);
ut_ad(!dict_index_is_online_ddl(index));
/* Check whether the index is corrupted */
- if (index->is_corrupted()) {
- ib::error() << "Index " << index->name
- << " of table " << table->name
- << " is corrupted";
-
- if (!srv_load_corrupted
- && !(ignore_err & DICT_ERR_IGNORE_CORRUPT)
- && dict_index_is_clust(index)) {
- dict_mem_index_free(index);
-
- error = DB_INDEX_CORRUPT;
- goto func_exit;
- } else {
- /* We will load the index if
- 1) srv_load_corrupted is TRUE
- 2) ignore_err is set with
- DICT_ERR_IGNORE_CORRUPT
- 3) if the index corrupted is a secondary
- index */
- ib::info() << "Load corrupted index "
- << index->name
- << " of table " << table->name;
- }
+ if (ignore_err != DICT_ERR_IGNORE_DROP
+ && index->is_corrupted() && index->is_clust()) {
+ dict_mem_index_free(index);
+ error = DB_TABLE_CORRUPT;
+ goto func_exit;
}
if (index->type & DICT_FTS
@@ -2518,31 +2043,34 @@ dict_load_indexes(
} else if (index->page == FIL_NULL
&& table->is_readable()
&& (!(index->type & DICT_FTS))) {
+ if (!uncommitted
+ && ignore_err != DICT_ERR_IGNORE_DROP) {
+ ib::error_or_warn(!(ignore_err
+ & DICT_ERR_IGNORE_INDEX))
+ << "Index " << index->name
+ << " for table " << table->name
+ << " has been freed!";
+ }
- ib::error() << "Trying to load index " << index->name
- << " for table " << table->name
- << ", but the index tree has been freed!";
-
- if (ignore_err & DICT_ERR_IGNORE_INDEX_ROOT) {
- /* If caller can tolerate this error,
- we will continue to load the index and
- let caller deal with this error. However
- mark the index and table corrupted. We
- only need to mark such in the index
- dictionary cache for such metadata corruption,
- since we would always be able to set it
- when loading the dictionary cache */
- index->table = table;
- dict_set_corrupted_index_cache_only(index);
-
- ib::info() << "Index is corrupt but forcing"
- " load into data dictionary";
- } else {
+ if (!(ignore_err & DICT_ERR_IGNORE_INDEX)) {
corrupted:
dict_mem_index_free(index);
error = DB_CORRUPTION;
goto func_exit;
}
+ /* If caller can tolerate this error,
+ we will continue to load the index and
+ let caller deal with this error. However
+ mark the index and table corrupted. We
+ only need to mark such in the index
+ dictionary cache for such metadata corruption,
+ since we would always be able to set it
+ when loading the dictionary cache */
+ if (index->is_clust()) {
+ index->table->corrupted = true;
+ index->table->file_unreadable = true;
+ }
+ index->type |= DICT_CORRUPT;
} else if (!dict_index_is_clust(index)
&& NULL == dict_table_get_first_index(table)) {
@@ -2560,8 +2088,10 @@ corrupted:
of the database server */
dict_mem_index_free(index);
} else {
- dict_load_fields(index, heap);
- index->table = table;
+ error = dict_load_fields(index, uncommitted, heap);
+ if (error != DB_SUCCESS) {
+ goto func_exit;
+ }
/* The data dictionary tables should never contain
invalid index definitions. If we ignored this error
@@ -2591,6 +2121,13 @@ next_rec:
btr_pcur_move_to_next_user_rec(&pcur, &mtr);
}
+ if (!dict_table_get_first_index(table)
+ && !(ignore_err & DICT_ERR_IGNORE_INDEX)) {
+ ib::warn() << "No indexes found for table " << table->name;
+ error = DB_CORRUPTION;
+ goto func_exit;
+ }
+
ut_ad(table->fts_doc_id_index == NULL);
if (table->fts != NULL) {
@@ -2610,21 +2147,20 @@ next_rec:
}
func_exit:
- btr_pcur_close(&pcur);
- mtr_commit(&mtr);
-
- return(error);
+ mtr.commit();
+ return error;
}
/** Load a table definition from a SYS_TABLES record to dict_table_t.
Do not load any columns or indexes.
-@param[in] name Table name
+@param[in,out] mtr mini-transaction
+@param[in] uncommitted whether to use READ UNCOMMITTED isolation level
@param[in] rec SYS_TABLES record
-@param[out,own] table table, or NULL
+@param[out,own] table table, or nullptr
@return error message
-@retval NULL on success */
-static const char* dict_load_table_low(const table_name_t& name,
- const rec_t* rec, dict_table_t** table)
+@retval nullptr on success */
+const char *dict_load_table_low(mtr_t *mtr, bool uncommitted,
+ const rec_t *rec, dict_table_t **table)
{
table_id_t table_id;
ulint space_id;
@@ -2632,6 +2168,7 @@ static const char* dict_load_table_low(const table_name_t& name,
ulint t_num;
ulint flags;
ulint flags2;
+ trx_id_t trx_id;
ulint n_v_col;
if (const char* error_text = dict_sys_tables_rec_check(rec)) {
@@ -2639,134 +2176,47 @@ static const char* dict_load_table_low(const table_name_t& name,
return(error_text);
}
- if (!dict_sys_tables_rec_read(rec, name, &table_id, &space_id,
- &t_num, &flags, &flags2)) {
+ if (auto r = dict_sys_tables_rec_read(rec, uncommitted, mtr,
+ &table_id, &space_id,
+ &t_num, &flags, &flags2,
+ &trx_id)) {
*table = NULL;
- return(dict_load_table_flags);
+ return r == READ_ERROR ? dict_load_table_flags : nullptr;
}
dict_table_decode_n_col(t_num, &n_cols, &n_v_col);
- *table = dict_mem_table_create(
- name.m_name, NULL, n_cols + n_v_col, n_v_col, flags, flags2);
+ *table = dict_table_t::create(
+ span<const char>(reinterpret_cast<const char*>(rec),
+ rec_get_field_start_offs(rec, 1)),
+ nullptr, n_cols + n_v_col, n_v_col, flags, flags2);
(*table)->space_id = space_id;
(*table)->id = table_id;
(*table)->file_unreadable = !!(flags2 & DICT_TF2_DISCARDED);
-
+ (*table)->def_trx_id = trx_id;
return(NULL);
}
-/********************************************************************//**
-Using the table->heap, copy the null-terminated filepath into
-table->data_dir_path and replace the 'databasename/tablename.ibd'
-portion with 'tablename'.
-This allows SHOW CREATE TABLE to return the correct DATA DIRECTORY path.
-Make this data directory path only if it has not yet been saved. */
-static
-void
-dict_save_data_dir_path(
-/*====================*/
- dict_table_t* table, /*!< in/out: table */
- const char* filepath) /*!< in: filepath of tablespace */
-{
- ut_ad(mutex_own(&dict_sys.mutex));
- ut_a(DICT_TF_HAS_DATA_DIR(table->flags));
-
- ut_a(!table->data_dir_path);
- ut_a(filepath);
-
- /* Be sure this filepath is not the default filepath. */
- char* default_filepath = fil_make_filepath(
- NULL, table->name.m_name, IBD, false);
- if (default_filepath) {
- if (0 != strcmp(filepath, default_filepath)) {
- ulint pathlen = strlen(filepath);
- ut_a(pathlen < OS_FILE_MAX_PATH);
- ut_a(0 == strcmp(filepath + pathlen - 4, DOT_IBD));
-
- table->data_dir_path = mem_heap_strdup(
- table->heap, filepath);
- os_file_make_data_dir_path(table->data_dir_path);
- }
-
- ut_free(default_filepath);
- }
-}
-
-/** Make sure the data_dir_path is saved in dict_table_t if DATA DIRECTORY
-was used. Try to read it from the fil_system first, then from SYS_DATAFILES.
-@param[in] table Table object
-@param[in] dict_mutex_own true if dict_sys.mutex is owned already */
-void
-dict_get_and_save_data_dir_path(
- dict_table_t* table,
- bool dict_mutex_own)
+/** Make sure the data_file_name is saved in dict_table_t if needed.
+@param[in,out] table Table object */
+void dict_get_and_save_data_dir_path(dict_table_t *table)
{
- ut_ad(!table->is_temporary());
- ut_ad(!table->space || table->space->id == table->space_id);
-
- if (!table->data_dir_path && table->space_id && table->space) {
- if (!dict_mutex_own) {
- dict_mutex_enter_for_mysql();
- }
-
- table->flags |= 1 << DICT_TF_POS_DATA_DIR
- & ((1U << DICT_TF_BITS) - 1);
- dict_save_data_dir_path(table,
- table->space->chain.start->name);
-
- if (table->data_dir_path == NULL) {
- /* Since we did not set the table data_dir_path,
- unset the flag. This does not change SYS_DATAFILES
- or SYS_TABLES or FSP_SPACE_FLAGS on the header page
- of the tablespace, but it makes dict_table_t
- consistent. */
- table->flags &= ~DICT_TF_MASK_DATA_DIR
- & ((1U << DICT_TF_BITS) - 1);
- }
-
- if (!dict_mutex_own) {
- dict_mutex_exit_for_mysql();
- }
- }
-}
-
-/** Loads a table definition and also all its index definitions, and also
-the cluster definition if the table is a member in a cluster. Also loads
-all foreign key constraints where the foreign key is in the table or where
-a foreign key references columns in this table.
-@param[in] name Table name in the dbname/tablename format
-@param[in] ignore_err Error to be ignored when loading
- table and its index definition
-@return table, NULL if does not exist; if the table is stored in an
-.ibd file, but the file does not exist, then we set the file_unreadable
-flag in the table object we return. */
-dict_table_t* dict_load_table(const char* name, dict_err_ignore_t ignore_err)
-{
- dict_names_t fk_list;
- dict_table_t* result;
- dict_names_t::iterator i;
-
- DBUG_ENTER("dict_load_table");
- DBUG_PRINT("dict_load_table", ("loading table: '%s'", name));
-
- ut_ad(mutex_own(&dict_sys.mutex));
-
- result = dict_table_check_if_in_cache_low(name);
-
- if (!result) {
- result = dict_load_table_one(const_cast<char*>(name),
- ignore_err, fk_list);
- while (!fk_list.empty()) {
- if (!dict_table_check_if_in_cache_low(fk_list.front()))
- dict_load_table_one(
- const_cast<char*>(fk_list.front()),
- ignore_err, fk_list);
- fk_list.pop_front();
- }
- }
-
- DBUG_RETURN(result);
+ ut_ad(!table->is_temporary());
+ ut_ad(!table->space || table->space->id == table->space_id);
+
+ if (!table->data_dir_path && table->space_id && table->space)
+ {
+ const char *filepath= table->space->chain.start->name;
+ if (strncmp(fil_path_to_mysql_datadir, filepath,
+ strlen(fil_path_to_mysql_datadir)))
+ {
+ table->lock_mutex_lock();
+ table->flags|= 1 << DICT_TF_POS_DATA_DIR & ((1U << DICT_TF_BITS) - 1);
+ table->data_dir_path= mem_heap_strdup(table->heap, filepath);
+ os_file_make_data_dir_path(table->data_dir_path);
+ table->lock_mutex_unlock();
+ }
+ }
}
/** Opens a tablespace for dict_load_table_one()
@@ -2796,13 +2246,13 @@ dict_load_tablespace(
}
/* The tablespace may already be open. */
- table->space = fil_space_for_table_exists_in_mem(
- table->space_id, table->name.m_name, table->flags);
+ table->space = fil_space_for_table_exists_in_mem(table->space_id,
+ table->flags);
if (table->space) {
return;
}
- if (ignore_err == DICT_ERR_IGNORE_DROP) {
+ if (ignore_err >= DICT_ERR_IGNORE_TABLESPACE) {
table->file_unreadable = true;
return;
}
@@ -2819,23 +2269,19 @@ dict_load_tablespace(
from the table->name. */
char* filepath = NULL;
if (DICT_TF_HAS_DATA_DIR(table->flags)) {
- /* This will set table->data_dir_path from either
- fil_system or SYS_DATAFILES */
- dict_get_and_save_data_dir_path(table, true);
+ /* This will set table->data_dir_path from fil_system */
+ dict_get_and_save_data_dir_path(table);
if (table->data_dir_path) {
filepath = fil_make_filepath(
- table->data_dir_path,
- table->name.m_name, IBD, true);
+ table->data_dir_path, table->name, IBD, true);
}
}
- /* Try to open the tablespace. We set the 2nd param (fix_dict) to
- false because we do not have an x-lock on dict_sys.latch */
table->space = fil_ibd_open(
- true, false, FIL_TYPE_TABLESPACE, table->space_id,
+ 2, FIL_TYPE_TABLESPACE, table->space_id,
dict_tf_to_fsp_flags(table->flags),
- table->name, filepath);
+ {table->name.m_name, strlen(table->name.m_name)}, filepath);
if (!table->space) {
/* We failed to find a sensible tablespace file */
@@ -2859,98 +2305,107 @@ key constraints are loaded into memory.
@param[out] fk_tables Related table names that must also be
loaded to ensure that all foreign key
constraints are loaded.
-@return table, NULL if does not exist; if the table is stored in an
-.ibd file, but the file does not exist, then we set the
-file_unreadable flag in the table object we return */
-static
-dict_table_t*
-dict_load_table_one(
- const table_name_t& name,
- dict_err_ignore_t ignore_err,
- dict_names_t& fk_tables)
+@return table, possibly with file_unreadable flag set
+@retval nullptr if the table does not exist */
+static dict_table_t *dict_load_table_one(const span<const char> &name,
+ dict_err_ignore_t ignore_err,
+ dict_names_t &fk_tables)
{
- dberr_t err;
- dict_table_t* sys_tables;
btr_pcur_t pcur;
- dict_index_t* sys_index;
- dtuple_t* tuple;
- mem_heap_t* heap;
- dfield_t* dfield;
- const rec_t* rec;
- const byte* field;
- ulint len;
mtr_t mtr;
DBUG_ENTER("dict_load_table_one");
- DBUG_PRINT("dict_load_table_one", ("table: %s", name.m_name));
-
- ut_ad(mutex_own(&dict_sys.mutex));
-
- heap = mem_heap_create(32000);
+ DBUG_PRINT("dict_load_table_one",
+ ("table: %.*s", int(name.size()), name.data()));
- mtr_start(&mtr);
+ ut_ad(dict_sys.locked());
- sys_tables = dict_table_get_low("SYS_TABLES");
- sys_index = UT_LIST_GET_FIRST(sys_tables->indexes);
- ut_ad(!dict_table_is_comp(sys_tables));
- ut_ad(name_of_col_is(sys_tables, sys_index,
+ dict_index_t *sys_index = dict_sys.sys_tables->indexes.start;
+ ut_ad(!dict_sys.sys_tables->not_redundant());
+ ut_ad(name_of_col_is(dict_sys.sys_tables, sys_index,
DICT_FLD__SYS_TABLES__ID, "ID"));
- ut_ad(name_of_col_is(sys_tables, sys_index,
+ ut_ad(name_of_col_is(dict_sys.sys_tables, sys_index,
DICT_FLD__SYS_TABLES__N_COLS, "N_COLS"));
- ut_ad(name_of_col_is(sys_tables, sys_index,
+ ut_ad(name_of_col_is(dict_sys.sys_tables, sys_index,
DICT_FLD__SYS_TABLES__TYPE, "TYPE"));
- ut_ad(name_of_col_is(sys_tables, sys_index,
+ ut_ad(name_of_col_is(dict_sys.sys_tables, sys_index,
DICT_FLD__SYS_TABLES__MIX_LEN, "MIX_LEN"));
- ut_ad(name_of_col_is(sys_tables, sys_index,
+ ut_ad(name_of_col_is(dict_sys.sys_tables, sys_index,
DICT_FLD__SYS_TABLES__SPACE, "SPACE"));
- tuple = dtuple_create(heap, 1);
- dfield = dtuple_get_nth_field(tuple, 0);
-
- dfield_set_data(dfield, name.m_name, strlen(name.m_name));
- dict_index_copy_types(tuple, sys_index, 1);
+ dfield_t dfield;
+ dtuple_t tuple{
+ 0,1,1,&dfield,0,nullptr
+#ifdef UNIV_DEBUG
+ , DATA_TUPLE_MAGIC_N
+#endif
+ };
+ dfield_set_data(&dfield, name.data(), name.size());
+ dict_index_copy_types(&tuple, sys_index, 1);
+ pcur.btr_cur.page_cur.index = sys_index;
- btr_pcur_open_on_user_rec(sys_index, tuple, PAGE_CUR_GE,
- BTR_SEARCH_LEAF, &pcur, &mtr);
- rec = btr_pcur_get_rec(&pcur);
+ bool uncommitted = false;
+reload:
+ mtr.start();
+ dberr_t err = btr_pcur_open_on_user_rec(&tuple,
+ BTR_SEARCH_LEAF, &pcur, &mtr);
- if (!btr_pcur_is_on_user_rec(&pcur)
- || rec_get_deleted_flag(rec, 0)) {
+ if (err != DB_SUCCESS || !btr_pcur_is_on_user_rec(&pcur)) {
/* Not found */
err_exit:
- btr_pcur_close(&pcur);
- mtr_commit(&mtr);
- mem_heap_free(heap);
-
- DBUG_RETURN(NULL);
+ mtr.commit();
+ DBUG_RETURN(nullptr);
}
- field = rec_get_nth_field_old(
- rec, DICT_FLD__SYS_TABLES__NAME, &len);
+ const rec_t* rec = btr_pcur_get_rec(&pcur);
/* Check if the table name in record is the searched one */
- if (len != strlen(name.m_name)
- || memcmp(name.m_name, field, len)) {
-
+ if (rec_get_field_start_offs(rec, 1) != name.size()
+ || memcmp(name.data(), rec, name.size())) {
goto err_exit;
}
dict_table_t* table;
- if (const char* err_msg = dict_load_table_low(name, rec, &table)) {
+ if (const char* err_msg =
+ dict_load_table_low(&mtr, uncommitted, rec, &table)) {
if (err_msg != dict_load_table_flags) {
ib::error() << err_msg;
}
goto err_exit;
}
+ if (!table) {
+ goto err_exit;
+ }
- btr_pcur_close(&pcur);
- mtr_commit(&mtr);
+ const unsigned use_uncommitted = uncommitted
+ ? 2
+ : table->id == mach_read_from_8(
+ rec + rec_get_field_start_offs(
+ rec, DICT_FLD__SYS_TABLES__ID));
- dict_load_tablespace(table, ignore_err);
+ mtr.commit();
+
+ mem_heap_t* heap = mem_heap_create(32000);
- dict_load_columns(table, heap);
+ dict_load_tablespace(table, ignore_err);
- dict_load_virtual(table, heap);
+ switch (dict_load_columns(table, use_uncommitted, heap)) {
+ case DB_SUCCESS_LOCKED_REC:
+ ut_ad(!uncommitted);
+ uncommitted = true;
+ dict_mem_table_free(table);
+ mem_heap_free(heap);
+ goto reload;
+ case DB_SUCCESS:
+ if (!dict_load_virtual(table, uncommitted)) {
+ break;
+ }
+ /* fall through */
+ default:
+ dict_mem_table_free(table);
+ mem_heap_free(heap);
+ DBUG_RETURN(nullptr);
+ }
dict_table_add_system_columns(table, heap);
@@ -2972,51 +2427,50 @@ err_exit:
? DICT_ERR_IGNORE_ALL
: ignore_err;
- err = dict_load_indexes(table, heap, index_load_err);
+ err = dict_load_indexes(table, uncommitted, heap, index_load_err);
- if (err == DB_INDEX_CORRUPT) {
+ if (err == DB_TABLE_CORRUPT) {
/* Refuse to load the table if the table has a corrupted
cluster index */
- if (!srv_load_corrupted) {
-
- ib::error() << "Load table " << table->name
- << " failed, the table has"
- " corrupted clustered indexes. Turn on"
- " 'innodb_force_load_corrupted' to drop it";
- dict_sys.remove(table);
- table = NULL;
- goto func_exit;
- } else {
- if (table->indexes.start->is_corrupted()) {
- table->corrupted = true;
- }
- }
+ ut_ad(index_load_err != DICT_ERR_IGNORE_DROP);
+ ib::error() << "Refusing to load corrupted table "
+ << table->name;
+evict:
+ dict_sys.remove(table);
+ mem_heap_free(heap);
+ DBUG_RETURN(nullptr);
}
- if (err == DB_SUCCESS && table->is_readable()) {
- const auto root = dict_table_get_first_index(table)->page;
-
- if (root >= table->space->get_size()) {
+ if (err != DB_SUCCESS || !table->is_readable()) {
+ } else if (dict_index_t* pk = dict_table_get_first_index(table)) {
+ ut_ad(pk->is_primary());
+ if (pk->is_corrupted()
+ || pk->page >= table->space->get_size()) {
corrupted:
table->corrupted = true;
table->file_unreadable = true;
- err = DB_CORRUPTION;
+ err = DB_TABLE_CORRUPT;
+ } else if (table->space->id
+ && ignore_err == DICT_ERR_IGNORE_DROP) {
+ /* Do not bother to load data from .ibd files
+ only to delete the .ibd files. */
+ goto corrupted;
} else {
- const page_id_t page_id(table->space->id, root);
+ const page_id_t page_id{table->space->id, pk->page};
mtr.start();
buf_block_t* block = buf_page_get(
page_id, table->space->zip_size(),
RW_S_LATCH, &mtr);
const bool corrupted = !block
- || page_get_space_id(block->frame)
+ || page_get_space_id(block->page.frame)
!= page_id.space()
- || page_get_page_no(block->frame)
+ || page_get_page_no(block->page.frame)
!= page_id.page_no()
|| (mach_read_from_2(FIL_PAGE_TYPE
- + block->frame)
+ + block->page.frame)
!= FIL_PAGE_INDEX
&& mach_read_from_2(FIL_PAGE_TYPE
- + block->frame)
+ + block->page.frame)
!= FIL_PAGE_TYPE_INSTANT);
mtr.commit();
if (corrupted) {
@@ -3027,57 +2481,38 @@ corrupted:
err = btr_cur_instant_init(table);
}
}
+ } else {
+ ut_ad(ignore_err & DICT_ERR_IGNORE_INDEX);
+ if (ignore_err != DICT_ERR_IGNORE_DROP) {
+ err = DB_CORRUPTION;
+ goto evict;
+ }
}
/* Initialize table foreign_child value. Its value could be
changed when dict_load_foreigns() is called below */
table->fk_max_recusive_level = 0;
- /* If the force recovery flag is set, we open the table irrespective
- of the error condition, since the user may want to dump data from the
- clustered index. However we load the foreign key information only if
+ /* We will load the foreign key information only if
all indexes were loaded. */
if (!table->is_readable()) {
/* Don't attempt to load the indexes from disk. */
} else if (err == DB_SUCCESS) {
- err = dict_load_foreigns(table->name.m_name, NULL,
- true, true,
- ignore_err, fk_tables);
+ err = dict_load_foreigns(table->name.m_name, nullptr,
+ 0, true, ignore_err, fk_tables);
if (err != DB_SUCCESS) {
ib::warn() << "Load table " << table->name
<< " failed, the table has missing"
" foreign key indexes. Turn off"
" 'foreign_key_checks' and try again.";
-
- dict_sys.remove(table);
- table = NULL;
+ goto evict;
} else {
dict_mem_table_fill_foreign_vcol_set(table);
table->fk_max_recusive_level = 0;
}
- } else {
- dict_index_t* index;
-
- /* Make sure that at least the clustered index was loaded.
- Otherwise refuse to load the table */
- index = dict_table_get_first_index(table);
-
- if (!srv_force_recovery
- || !index
- || !index->is_primary()) {
- dict_sys.remove(table);
- table = NULL;
- } else if (index->is_corrupted()
- && table->is_readable()) {
- /* It is possible we force to load a corrupted
- clustered index if srv_load_corrupted is set.
- Mark the table as corrupted in this case */
- table->corrupted = true;
- }
}
-func_exit:
mem_heap_free(heap);
ut_ad(!table
@@ -3108,6 +2543,25 @@ func_exit:
DBUG_RETURN(table);
}
+dict_table_t *dict_sys_t::load_table(const span<const char> &name,
+ dict_err_ignore_t ignore)
+{
+ if (dict_table_t *table= find_table(name))
+ return table;
+ dict_names_t fk_list;
+ dict_table_t *table= dict_load_table_one(name, ignore, fk_list);
+ while (!fk_list.empty())
+ {
+ const char *f= fk_list.front();
+ const span<const char> name{f, strlen(f)};
+ if (!find_table(name))
+ dict_load_table_one(name, ignore, fk_list);
+ fk_list.pop_front();
+ }
+
+ return table;
+}
+
/***********************************************************************//**
Loads a table object based on the table id.
@return table; NULL if table does not exist */
@@ -3120,53 +2574,45 @@ dict_load_table_on_id(
{
byte id_buf[8];
btr_pcur_t pcur;
- mem_heap_t* heap;
- dtuple_t* tuple;
- dfield_t* dfield;
- dict_index_t* sys_table_ids;
- dict_table_t* sys_tables;
- const rec_t* rec;
const byte* field;
ulint len;
- dict_table_t* table;
mtr_t mtr;
- ut_ad(mutex_own(&dict_sys.mutex));
-
- table = NULL;
+ ut_ad(dict_sys.locked());
/* NOTE that the operation of this function is protected by
- the dictionary mutex, and therefore no deadlocks can occur
+ dict_sys.latch, and therefore no deadlocks can occur
with other dictionary operations. */
- mtr_start(&mtr);
+ mtr.start();
/*---------------------------------------------------*/
/* Get the secondary index based on ID for table SYS_TABLES */
- sys_tables = dict_sys.sys_tables;
- sys_table_ids = dict_table_get_next_index(
- dict_table_get_first_index(sys_tables));
- ut_ad(!dict_table_is_comp(sys_tables));
- ut_ad(!dict_index_is_clust(sys_table_ids));
- heap = mem_heap_create(256);
+ dict_index_t *sys_table_ids =
+ dict_sys.sys_tables->indexes.start->indexes.next;
- tuple = dtuple_create(heap, 1);
- dfield = dtuple_get_nth_field(tuple, 0);
+ dfield_t dfield;
+ dtuple_t tuple{
+ 0,1,1,&dfield,0,nullptr
+#ifdef UNIV_DEBUG
+ , DATA_TUPLE_MAGIC_N
+#endif
+ };
/* Write the table id in byte format to id_buf */
mach_write_to_8(id_buf, table_id);
+ dfield_set_data(&dfield, id_buf, 8);
+ dict_index_copy_types(&tuple, sys_table_ids, 1);
+ pcur.btr_cur.page_cur.index = sys_table_ids;
- dfield_set_data(dfield, id_buf, 8);
- dict_index_copy_types(tuple, sys_table_ids, 1);
-
- btr_pcur_open_on_user_rec(sys_table_ids, tuple, PAGE_CUR_GE,
- BTR_SEARCH_LEAF, &pcur, &mtr);
-
- rec = btr_pcur_get_rec(&pcur);
+ dict_table_t* table = nullptr;
- if (page_rec_is_user_rec(rec)) {
+ if (btr_pcur_open_on_user_rec(&tuple, BTR_SEARCH_LEAF, &pcur, &mtr)
+ == DB_SUCCESS
+ && btr_pcur_is_on_user_rec(&pcur)) {
/*---------------------------------------------------*/
/* Now we have the record in the secondary index
containing the table ID and NAME */
+ const rec_t* rec = btr_pcur_get_rec(&pcur);
check_rec:
field = rec_get_nth_field_old(
rec, DICT_FLD__SYS_TABLE_IDS__ID, &len);
@@ -3174,11 +2620,16 @@ check_rec:
/* Check if the table id in record is the one searched for */
if (table_id == mach_read_from_8(field)) {
- if (rec_get_deleted_flag(rec, 0)) {
- /* Until purge has completed, there
- may be delete-marked duplicate records
- for the same SYS_TABLES.ID, but different
- SYS_TABLES.NAME. */
+ field = rec_get_nth_field_old(rec,
+ DICT_FLD__SYS_TABLE_IDS__NAME, &len);
+ table = dict_sys.load_table(
+ {reinterpret_cast<const char*>(field),
+ len}, ignore_err);
+ if (table && table->id != table_id) {
+ ut_ad(rec_get_deleted_flag(rec, 0));
+ table = nullptr;
+ }
+ if (!table) {
while (btr_pcur_move_to_next(&pcur, &mtr)) {
rec = btr_pcur_get_rec(&pcur);
@@ -3186,23 +2637,12 @@ check_rec:
goto check_rec;
}
}
- } else {
- /* Now we get the table name from the record */
- field = rec_get_nth_field_old(rec,
- DICT_FLD__SYS_TABLE_IDS__NAME, &len);
- /* Load the table definition to memory */
- char* table_name = mem_heap_strdupl(
- heap, (char*) field, len);
- table = dict_load_table(table_name, ignore_err);
}
}
}
- btr_pcur_close(&pcur);
- mtr_commit(&mtr);
- mem_heap_free(heap);
-
- return(table);
+ mtr.commit();
+ return table;
}
/********************************************************************//**
@@ -3216,15 +2656,16 @@ dict_load_sys_table(
{
mem_heap_t* heap;
- ut_ad(mutex_own(&dict_sys.mutex));
+ ut_ad(dict_sys.locked());
heap = mem_heap_create(1000);
- dict_load_indexes(table, heap, DICT_ERR_IGNORE_NONE);
+ dict_load_indexes(table, false, heap, DICT_ERR_IGNORE_NONE);
mem_heap_free(heap);
}
+MY_ATTRIBUTE((nonnull, warn_unused_result))
/********************************************************************//**
Loads foreign key constraint col names (also for the referenced table).
Members that must be set (and valid) in foreign:
@@ -3235,25 +2676,13 @@ Members that will be created and set by this function:
foreign->foreign_col_names[i]
foreign->referenced_col_names[i]
(for i=0..foreign->n_fields-1) */
-static
-void
-dict_load_foreign_cols(
-/*===================*/
- dict_foreign_t* foreign)/*!< in/out: foreign constraint object */
+static dberr_t dict_load_foreign_cols(dict_foreign_t *foreign, trx_id_t trx_id)
{
- dict_table_t* sys_foreign_cols;
- dict_index_t* sys_index;
btr_pcur_t pcur;
- dtuple_t* tuple;
- dfield_t* dfield;
- const rec_t* rec;
- const byte* field;
- ulint len;
- ulint i;
mtr_t mtr;
size_t id_len;
- ut_ad(mutex_own(&dict_sys.mutex));
+ ut_ad(dict_sys.locked());
id_len = strlen(foreign->id);
@@ -3265,27 +2694,64 @@ dict_load_foreign_cols(
mem_heap_alloc(foreign->heap,
foreign->n_fields * sizeof(void*)));
- mtr_start(&mtr);
+ mtr.start();
- sys_foreign_cols = dict_table_get_low("SYS_FOREIGN_COLS");
+ dict_index_t* sys_index = dict_sys.sys_foreign_cols->indexes.start;
+ ut_ad(!dict_sys.sys_foreign_cols->not_redundant());
- sys_index = UT_LIST_GET_FIRST(sys_foreign_cols->indexes);
- ut_ad(!dict_table_is_comp(sys_foreign_cols));
+ dfield_t dfield;
+ dtuple_t tuple{
+ 0,1,1,&dfield,0,nullptr
+#ifdef UNIV_DEBUG
+ , DATA_TUPLE_MAGIC_N
+#endif
+ };
- tuple = dtuple_create(foreign->heap, 1);
- dfield = dtuple_get_nth_field(tuple, 0);
+ dfield_set_data(&dfield, foreign->id, id_len);
+ dict_index_copy_types(&tuple, sys_index, 1);
+ pcur.btr_cur.page_cur.index = sys_index;
- dfield_set_data(dfield, foreign->id, id_len);
- dict_index_copy_types(tuple, sys_index, 1);
+ mem_heap_t* heap = nullptr;
+ dberr_t err = btr_pcur_open_on_user_rec(&tuple,
+ BTR_SEARCH_LEAF, &pcur, &mtr);
+ if (err != DB_SUCCESS) {
+ goto func_exit;
+ }
+ for (ulint i = 0; i < foreign->n_fields; i++) {
+ ut_a(btr_pcur_is_on_user_rec(&pcur));
- btr_pcur_open_on_user_rec(sys_index, tuple, PAGE_CUR_GE,
- BTR_SEARCH_LEAF, &pcur, &mtr);
- for (i = 0; i < foreign->n_fields; i++) {
+ const rec_t* rec = btr_pcur_get_rec(&pcur);
+ ulint len;
+ const byte* field = rec_get_nth_field_old(
+ rec, DICT_FLD__SYS_FOREIGN_COLS__DB_TRX_ID, &len);
+ ut_a(len == DATA_TRX_ID_LEN);
- rec = btr_pcur_get_rec(&pcur);
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_empty(heap);
+ }
- ut_a(btr_pcur_is_on_user_rec(&pcur));
- ut_a(!rec_get_deleted_flag(rec, 0));
+ const trx_id_t id = trx_read_trx_id(field);
+ if (!id) {
+ } else if (id != trx_id && trx_sys.find(nullptr, id, false)) {
+ const auto savepoint = mtr.get_savepoint();
+ rec_offs* offsets = rec_get_offsets(
+ rec, sys_index, nullptr, true, ULINT_UNDEFINED,
+ &heap);
+ const rec_t* old_vers;
+ row_vers_build_for_semi_consistent_read(
+ nullptr, rec, &mtr, sys_index, &offsets, &heap,
+ heap, &old_vers, nullptr);
+ mtr.rollback_to_savepoint(savepoint);
+ rec = old_vers;
+ if (!rec || rec_get_deleted_flag(rec, 0)) {
+ goto next;
+ }
+ }
+
+ if (rec_get_deleted_flag(rec, 0)) {
+ ut_ad(id);
+ goto next;
+ }
field = rec_get_nth_field_old(
rec, DICT_FLD__SYS_FOREIGN_COLS__ID, &len);
@@ -3310,7 +2776,7 @@ dict_load_foreign_cols(
rec, DICT_FLD__SYS_FOREIGN_COLS__REF_COL_NAME,
&ref_col_name_len);
- ib::fatal sout;
+ ib::error sout;
sout << "Unable to load column names for foreign"
" key '" << foreign->id
@@ -3325,6 +2791,9 @@ dict_load_foreign_cols(
sout << "', REF_COL_NAME='";
sout.write(ref_col_name, ref_col_name_len);
sout << "')";
+
+ err = DB_CORRUPTION;
+ break;
}
field = rec_get_nth_field_old(
@@ -3342,27 +2811,33 @@ dict_load_foreign_cols(
foreign->referenced_col_names[i] = mem_heap_strdupl(
foreign->heap, (char*) field, len);
+next:
btr_pcur_move_to_next_user_rec(&pcur, &mtr);
}
-
- btr_pcur_close(&pcur);
- mtr_commit(&mtr);
+func_exit:
+ mtr.commit();
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+ return err;
}
/***********************************************************************//**
Loads a foreign key constraint to the dictionary cache. If the referenced
table is not yet loaded, it is added in the output parameter (fk_tables).
@return DB_SUCCESS or error code */
-static MY_ATTRIBUTE((nonnull(1), warn_unused_result))
+static MY_ATTRIBUTE((warn_unused_result))
dberr_t
dict_load_foreign(
/*==============*/
- const char* id,
- /*!< in: foreign constraint id, must be
- '\0'-terminated */
+ const char* table_name, /*!< in: table name */
+ bool uncommitted, /*!< in: use READ UNCOMMITTED
+ transaction isolation level */
const char** col_names,
/*!< in: column names, or NULL
to use foreign->foreign_table->col_names */
+ trx_id_t trx_id,
+ /*!< in: current transaction id, or 0 */
bool check_recursive,
/*!< in: whether to record the foreign table
parent count to avoid unlimited recursive
@@ -3370,6 +2845,8 @@ dict_load_foreign(
bool check_charsets,
/*!< in: whether to check charset
compatibility */
+ span<const char> id,
+ /*!< in: foreign constraint id */
dict_err_ignore_t ignore_err,
/*!< in: error to be ignored */
dict_names_t& fk_tables)
@@ -3381,86 +2858,93 @@ dict_load_foreign(
stack. */
{
dict_foreign_t* foreign;
- dict_table_t* sys_foreign;
btr_pcur_t pcur;
- dict_index_t* sys_index;
- dtuple_t* tuple;
- mem_heap_t* heap2;
- dfield_t* dfield;
- const rec_t* rec;
const byte* field;
ulint len;
mtr_t mtr;
dict_table_t* for_table;
dict_table_t* ref_table;
- size_t id_len;
DBUG_ENTER("dict_load_foreign");
DBUG_PRINT("dict_load_foreign",
- ("id: '%s', check_recursive: %d", id, check_recursive));
-
- ut_ad(mutex_own(&dict_sys.mutex));
-
- id_len = strlen(id);
-
- heap2 = mem_heap_create(1000);
-
- mtr_start(&mtr);
-
- sys_foreign = dict_table_get_low("SYS_FOREIGN");
-
- sys_index = UT_LIST_GET_FIRST(sys_foreign->indexes);
- ut_ad(!dict_table_is_comp(sys_foreign));
+ ("id: '%.*s', check_recursive: %d",
+ int(id.size()), id.data(), check_recursive));
- tuple = dtuple_create(heap2, 1);
- dfield = dtuple_get_nth_field(tuple, 0);
+ ut_ad(dict_sys.locked());
- dfield_set_data(dfield, id, id_len);
- dict_index_copy_types(tuple, sys_index, 1);
+ dict_index_t* sys_index = dict_sys.sys_foreign->indexes.start;
+ ut_ad(!dict_sys.sys_foreign->not_redundant());
- btr_pcur_open_on_user_rec(sys_index, tuple, PAGE_CUR_GE,
- BTR_SEARCH_LEAF, &pcur, &mtr);
- rec = btr_pcur_get_rec(&pcur);
-
- if (!btr_pcur_is_on_user_rec(&pcur)
- || rec_get_deleted_flag(rec, 0)) {
- /* Not found */
+ dfield_t dfield;
+ dtuple_t tuple{
+ 0,1,1,&dfield,0,nullptr
+#ifdef UNIV_DEBUG
+ , DATA_TUPLE_MAGIC_N
+#endif
+ };
+ dfield_set_data(&dfield, id.data(), id.size());
+ dict_index_copy_types(&tuple, sys_index, 1);
+ pcur.btr_cur.page_cur.index = sys_index;
- ib::error() << "Cannot load foreign constraint " << id
- << ": could not find the relevant record in "
- << "SYS_FOREIGN";
+ mtr.start();
- btr_pcur_close(&pcur);
- mtr_commit(&mtr);
- mem_heap_free(heap2);
+ mem_heap_t* heap = nullptr;
+ dberr_t err = btr_pcur_open_on_user_rec(&tuple,
+ BTR_SEARCH_LEAF, &pcur, &mtr);
+ if (err != DB_SUCCESS) {
+ goto err_exit;
+ }
- DBUG_RETURN(DB_ERROR);
+ if (!btr_pcur_is_on_user_rec(&pcur)) {
+not_found:
+ err = DB_NOT_FOUND;
+err_exit:
+ mtr.commit();
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+ DBUG_RETURN(err);
}
+ const rec_t* rec = btr_pcur_get_rec(&pcur);
+ static_assert(DICT_FLD__SYS_FOREIGN__ID == 0, "compatibility");
field = rec_get_nth_field_old(rec, DICT_FLD__SYS_FOREIGN__ID, &len);
/* Check if the id in record is the searched one */
- if (len != id_len || memcmp(id, field, len)) {
- {
- ib::error err;
- err << "Cannot load foreign constraint " << id
- << ": found ";
- err.write(field, len);
- err << " instead in SYS_FOREIGN";
- }
+ if (len != id.size() || memcmp(id.data(), field, id.size())) {
+ goto not_found;
+ }
- btr_pcur_close(&pcur);
- mtr_commit(&mtr);
- mem_heap_free(heap2);
+ field = rec_get_nth_field_old(
+ rec, DICT_FLD__SYS_FOREIGN__DB_TRX_ID, &len);
+ ut_a(len == DATA_TRX_ID_LEN);
+
+ const trx_id_t tid = trx_read_trx_id(field);
+
+ if (tid && tid != trx_id && !uncommitted
+ && trx_sys.find(nullptr, tid, false)) {
+ const auto savepoint = mtr.get_savepoint();
+ rec_offs* offsets = rec_get_offsets(
+ rec, sys_index, nullptr, true, ULINT_UNDEFINED, &heap);
+ const rec_t* old_vers;
+ row_vers_build_for_semi_consistent_read(
+ nullptr, rec, &mtr, sys_index, &offsets, &heap,
+ heap, &old_vers, nullptr);
+ mtr.rollback_to_savepoint(savepoint);
+ rec = old_vers;
+ if (!rec) {
+ goto not_found;
+ }
+ }
- DBUG_RETURN(DB_ERROR);
+ if (rec_get_deleted_flag(rec, 0)) {
+ ut_ad(tid);
+ goto not_found;
}
/* Read the table names and the number of columns associated
with the constraint */
- mem_heap_free(heap2);
-
foreign = dict_mem_foreign_create();
uint32_t n_fields_and_type = mach_read_from_4(
@@ -3474,7 +2958,7 @@ dict_load_foreign(
foreign->type = (n_fields_and_type >> 24) & ((1U << 6) - 1);
foreign->n_fields = n_fields_and_type & dict_index_t::MAX_N_FIELDS;
- foreign->id = mem_heap_strdupl(foreign->heap, id, id_len);
+ foreign->id = mem_heap_strdupl(foreign->heap, id.data(), id.size());
field = rec_get_nth_field_old(
rec, DICT_FLD__SYS_FOREIGN__FOR_NAME, &len);
@@ -3483,23 +2967,43 @@ dict_load_foreign(
foreign->heap, (char*) field, len);
dict_mem_foreign_table_name_lookup_set(foreign, TRUE);
- const ulint foreign_table_name_len = len;
+ const size_t foreign_table_name_len = len;
+ const size_t table_name_len = strlen(table_name);
field = rec_get_nth_field_old(
rec, DICT_FLD__SYS_FOREIGN__REF_NAME, &len);
+
+ if (!my_charset_latin1.strnncoll(table_name, table_name_len,
+ foreign->foreign_table_name,
+ foreign_table_name_len)) {
+ } else if (!check_recursive
+ && !my_charset_latin1.strnncoll(table_name, table_name_len,
+ (const char*) field, len)) {
+ } else {
+ dict_foreign_free(foreign);
+ goto not_found;
+ }
+
foreign->referenced_table_name = mem_heap_strdupl(
- foreign->heap, (char*) field, len);
+ foreign->heap, (const char*) field, len);
dict_mem_referenced_table_name_lookup_set(foreign, TRUE);
- btr_pcur_close(&pcur);
- mtr_commit(&mtr);
+ mtr.commit();
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
- dict_load_foreign_cols(foreign);
+ err = dict_load_foreign_cols(foreign, trx_id);
+ if (err != DB_SUCCESS) {
+ goto load_error;
+ }
- ref_table = dict_table_check_if_in_cache_low(
- foreign->referenced_table_name_lookup);
- for_table = dict_table_check_if_in_cache_low(
- foreign->foreign_table_name_lookup);
+ ref_table = dict_sys.find_table(
+ {foreign->referenced_table_name_lookup,
+ strlen(foreign->referenced_table_name_lookup)});
+ for_table = dict_sys.find_table(
+ {foreign->foreign_table_name_lookup,
+ strlen(foreign->foreign_table_name_lookup)});
if (!for_table) {
/* To avoid recursively loading the tables related through
@@ -3512,9 +3016,9 @@ dict_load_foreign(
mem_heap_strdupl(ref_table->heap,
foreign->foreign_table_name_lookup,
foreign_table_name_len));
-
+load_error:
dict_foreign_remove_from_cache(foreign);
- DBUG_RETURN(DB_SUCCESS);
+ DBUG_RETURN(err);
}
ut_a(for_table || ref_table);
@@ -3547,7 +3051,8 @@ dict_load_foreigns(
const char* table_name, /*!< in: table name */
const char** col_names, /*!< in: column names, or NULL
to use table->col_names */
- bool check_recursive,/*!< in: Whether to check
+ trx_id_t trx_id, /*!< in: DDL transaction id,
+ or 0 to check
recursive load of tables
chained by FK */
bool check_charsets, /*!< in: whether to check
@@ -3559,53 +3064,51 @@ dict_load_foreigns(
subsequently to load all the
foreign key constraints. */
{
- ulint tuple_buf[(DTUPLE_EST_ALLOC(1) + sizeof(ulint) - 1)
- / sizeof(ulint)];
btr_pcur_t pcur;
- dtuple_t* tuple;
- dfield_t* dfield;
- dict_index_t* sec_index;
- dict_table_t* sys_foreign;
- const rec_t* rec;
- const byte* field;
- ulint len;
- dberr_t err;
mtr_t mtr;
DBUG_ENTER("dict_load_foreigns");
- ut_ad(mutex_own(&dict_sys.mutex));
-
- sys_foreign = dict_table_get_low("SYS_FOREIGN");
-
- if (sys_foreign == NULL) {
- /* No foreign keys defined yet in this database */
+ ut_ad(dict_sys.locked());
- ib::info() << "No foreign key system tables in the database";
+ if (!dict_sys.sys_foreign || !dict_sys.sys_foreign_cols) {
+ if (ignore_err & DICT_ERR_IGNORE_FK_NOKEY) {
+ DBUG_RETURN(DB_SUCCESS);
+ }
+ sql_print_information("InnoDB: No foreign key system tables"
+ " in the database");
DBUG_RETURN(DB_ERROR);
}
- ut_ad(!dict_table_is_comp(sys_foreign));
- mtr_start(&mtr);
+ ut_ad(!dict_sys.sys_foreign->not_redundant());
- /* Get the secondary index based on FOR_NAME from table
- SYS_FOREIGN */
+ dict_index_t *sec_index = dict_table_get_next_index(
+ dict_table_get_first_index(dict_sys.sys_foreign));
+ ut_ad(!strcmp(sec_index->fields[0].name, "FOR_NAME"));
+ bool check_recursive = !trx_id;
+ dfield_t dfield;
+ dtuple_t tuple{
+ 0,1,1,&dfield,0,nullptr
+#ifdef UNIV_DEBUG
+ , DATA_TUPLE_MAGIC_N
+#endif
+ };
- sec_index = dict_table_get_next_index(
- dict_table_get_first_index(sys_foreign));
- ut_ad(!dict_index_is_clust(sec_index));
start_load:
+ mtr.start();
+ dfield_set_data(&dfield, table_name, strlen(table_name));
+ dict_index_copy_types(&tuple, sec_index, 1);
+ pcur.btr_cur.page_cur.index = sec_index;
- tuple = dtuple_create_from_mem(tuple_buf, sizeof(tuple_buf), 1, 0);
- dfield = dtuple_get_nth_field(tuple, 0);
-
- dfield_set_data(dfield, table_name, strlen(table_name));
- dict_index_copy_types(tuple, sec_index, 1);
-
- btr_pcur_open_on_user_rec(sec_index, tuple, PAGE_CUR_GE,
- BTR_SEARCH_LEAF, &pcur, &mtr);
+ dberr_t err = btr_pcur_open_on_user_rec(&tuple,
+ BTR_SEARCH_LEAF, &pcur, &mtr);
+ if (err != DB_SUCCESS) {
+ DBUG_RETURN(err);
+ }
loop:
- rec = btr_pcur_get_rec(&pcur);
+ const rec_t* rec = btr_pcur_get_rec(&pcur);
+ const byte* field;
+ const auto maybe_deleted = rec_get_deleted_flag(rec, 0);
if (!btr_pcur_is_on_user_rec(&pcur)) {
/* End of index */
@@ -3616,6 +3119,7 @@ loop:
/* Now we have the record in the secondary index containing a table
name and a foreign constraint ID */
+ ulint len;
field = rec_get_nth_field_old(
rec, DICT_FLD__SYS_FOREIGN_FOR_NAME__NAME, &len);
@@ -3623,11 +3127,10 @@ loop:
following call does the comparison in the latin1_swedish_ci
charset-collation, in a case-insensitive way. */
- if (0 != cmp_data_data(dfield_get_type(dfield)->mtype,
- dfield_get_type(dfield)->prtype,
- static_cast<const byte*>(
- dfield_get_data(dfield)),
- dfield_get_len(dfield),
+ if (0 != cmp_data_data(dfield_get_type(&dfield)->mtype,
+ dfield_get_type(&dfield)->prtype,
+ reinterpret_cast<const byte*>(table_name),
+ dfield_get_len(&dfield),
field, len)) {
goto load_next_index;
@@ -3640,12 +3143,7 @@ loop:
may not be the same case, but the previous comparison showed that they
match with no-case. */
- if (rec_get_deleted_flag(rec, 0)) {
- goto next_rec;
- }
-
- if (innobase_get_lower_case_table_names() != 2
- && memcmp(field, table_name, len)) {
+ if (lower_case_table_names != 2 && memcmp(field, table_name, len)) {
goto next_rec;
}
@@ -3654,59 +3152,64 @@ loop:
rec, DICT_FLD__SYS_FOREIGN_FOR_NAME__ID, &len);
/* Copy the string because the page may be modified or evicted
- after mtr_commit() below. */
- char fk_id[MAX_TABLE_NAME_LEN + NAME_LEN + 1];
+ after mtr.commit() below. */
+ char fk_id[MAX_TABLE_NAME_LEN + NAME_LEN];
err = DB_SUCCESS;
if (UNIV_LIKELY(len < sizeof fk_id)) {
memcpy(fk_id, field, len);
- fk_id[len] = '\0';
- } else {
- err = DB_CORRUPTION;
}
btr_pcur_store_position(&pcur, &mtr);
- mtr_commit(&mtr);
+ mtr.commit();
/* Load the foreign constraint definition to the dictionary cache */
- if (err == DB_SUCCESS) {
- err = dict_load_foreign(fk_id, col_names,
- check_recursive, check_charsets,
- ignore_err, fk_tables);
- }
-
- if (err != DB_SUCCESS) {
- btr_pcur_close(&pcur);
-
+ err = len < sizeof fk_id
+ ? dict_load_foreign(table_name, false, col_names, trx_id,
+ check_recursive, check_charsets,
+ {fk_id, len}, ignore_err, fk_tables)
+ : DB_CORRUPTION;
+
+ switch (err) {
+ case DB_SUCCESS:
+ break;
+ case DB_NOT_FOUND:
+ if (maybe_deleted) {
+ break;
+ }
+ sql_print_error("InnoDB: Cannot load foreign constraint %.*s:"
+ " could not find the relevant record in "
+ "SYS_FOREIGN", int(len), fk_id);
+ /* fall through */
+ default:
+corrupted:
+ ut_free(pcur.old_rec_buf);
DBUG_RETURN(err);
}
- mtr_start(&mtr);
-
- btr_pcur_restore_position(BTR_SEARCH_LEAF, &pcur, &mtr);
+ mtr.start();
+ if (pcur.restore_position(BTR_SEARCH_LEAF, &mtr)
+ == btr_pcur_t::CORRUPTED) {
+ mtr.commit();
+ goto corrupted;
+ }
next_rec:
btr_pcur_move_to_next_user_rec(&pcur, &mtr);
goto loop;
load_next_index:
- btr_pcur_close(&pcur);
- mtr_commit(&mtr);
-
- sec_index = dict_table_get_next_index(sec_index);
-
- if (sec_index != NULL) {
-
- mtr_start(&mtr);
+ mtr.commit();
+ if ((sec_index = dict_table_get_next_index(sec_index))) {
/* Switch to scan index on REF_NAME, fk_max_recusive_level
already been updated when scanning FOR_NAME index, no need to
update again */
- check_recursive = FALSE;
-
+ check_recursive = false;
goto start_load;
}
+ ut_free(pcur.old_rec_buf);
DBUG_RETURN(DB_SUCCESS);
}
diff --git a/storage/innobase/dict/dict0mem.cc b/storage/innobase/dict/dict0mem.cc
index bbdab865e3a..59189d3e053 100644
--- a/storage/innobase/dict/dict0mem.cc
+++ b/storage/innobase/dict/dict0mem.cc
@@ -35,7 +35,6 @@ Created 1/8/1996 Heikki Tuuri
#include "dict0dict.h"
#include "fts0priv.h"
#include "lock0lock.h"
-#include "sync0sync.h"
#include "row0row.h"
#include "sql_string.h"
#include <iostream>
@@ -124,85 +123,74 @@ bool dict_col_t::same_encoding(uint16_t a, uint16_t b)
return false;
}
-/** Create a table memory object.
+/** Create metadata.
@param name table name
@param space tablespace
@param n_cols total number of columns (both virtual and non-virtual)
@param n_v_cols number of virtual columns
@param flags table flags
@param flags2 table flags2
-@return own: table object */
-dict_table_t *dict_mem_table_create(const char *name, fil_space_t *space,
- ulint n_cols, ulint n_v_cols, ulint flags,
- ulint flags2)
+@return newly allocated table object */
+dict_table_t *dict_table_t::create(const span<const char> &name,
+ fil_space_t *space,
+ ulint n_cols, ulint n_v_cols, ulint flags,
+ ulint flags2)
{
- dict_table_t* table;
- mem_heap_t* heap;
-
- ut_ad(name);
- ut_ad(!space
- || space->purpose == FIL_TYPE_TABLESPACE
- || space->purpose == FIL_TYPE_TEMPORARY
- || space->purpose == FIL_TYPE_IMPORT);
- ut_a(dict_tf2_is_valid(flags, flags2));
- ut_a(!(flags2 & DICT_TF2_UNUSED_BIT_MASK));
+ ut_ad(!space || space->purpose == FIL_TYPE_TABLESPACE ||
+ space->purpose == FIL_TYPE_TEMPORARY ||
+ space->purpose == FIL_TYPE_IMPORT);
+ ut_a(dict_tf2_is_valid(flags, flags2));
+ ut_a(!(flags2 & DICT_TF2_UNUSED_BIT_MASK));
- heap = mem_heap_create(DICT_HEAP_SIZE);
+ mem_heap_t *heap= mem_heap_create(DICT_HEAP_SIZE);
- table = static_cast<dict_table_t*>(
- mem_heap_zalloc(heap, sizeof(*table)));
+ dict_table_t *table= static_cast<dict_table_t*>
+ (mem_heap_zalloc(heap, sizeof(*table)));
- lock_table_lock_list_init(&table->locks);
-
- UT_LIST_INIT(table->indexes, &dict_index_t::indexes);
+ lock_table_lock_list_init(&table->locks);
+ UT_LIST_INIT(table->indexes, &dict_index_t::indexes);
#ifdef BTR_CUR_HASH_ADAPT
- UT_LIST_INIT(table->freed_indexes, &dict_index_t::indexes);
+ UT_LIST_INIT(table->freed_indexes, &dict_index_t::indexes);
#endif /* BTR_CUR_HASH_ADAPT */
+ table->heap= heap;
+
+ ut_d(table->magic_n= DICT_TABLE_MAGIC_N);
+
+ table->flags= static_cast<unsigned>(flags) & ((1U << DICT_TF_BITS) - 1);
+ table->flags2= static_cast<unsigned>(flags2) & ((1U << DICT_TF2_BITS) - 1);
+ table->name.m_name= mem_strdupl(name.data(), name.size());
+ table->mdl_name.m_name= table->name.m_name;
+ table->is_system_db= dict_mem_table_is_system(table->name.m_name);
+ table->space= space;
+ table->space_id= space ? space->id : ULINT_UNDEFINED;
+ table->n_t_cols= static_cast<unsigned>(n_cols + DATA_N_SYS_COLS) &
+ dict_index_t::MAX_N_FIELDS;
+ table->n_v_cols= static_cast<unsigned>(n_v_cols) &
+ dict_index_t::MAX_N_FIELDS;
+ table->n_cols= static_cast<unsigned>(table->n_t_cols - table->n_v_cols) &
+ dict_index_t::MAX_N_FIELDS;
+ table->cols= static_cast<dict_col_t*>
+ (mem_heap_alloc(heap, table->n_cols * sizeof *table->cols));
+ table->v_cols= static_cast<dict_v_col_t*>
+ (mem_heap_alloc(heap, n_v_cols * sizeof *table->v_cols));
+ for (ulint i = n_v_cols; i--; )
+ new (&table->v_cols[i]) dict_v_col_t();
+ table->autoinc_lock= static_cast<ib_lock_t*>
+ (mem_heap_alloc(heap, sizeof *table->autoinc_lock));
+ /* If the table has an FTS index or we are in the process
+ of building one, create the table->fts */
+ if (dict_table_has_fts_index(table) ||
+ DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS_HAS_DOC_ID |
+ DICT_TF2_FTS_ADD_DOC_ID))
+ {
+ table->fts= fts_create(table);
+ table->fts->cache= fts_cache_create(table);
+ }
- table->heap = heap;
-
- ut_d(table->magic_n = DICT_TABLE_MAGIC_N);
-
- table->flags = static_cast<unsigned>(flags)
- & ((1U << DICT_TF_BITS) - 1);
- table->flags2 = static_cast<unsigned>(flags2)
- & ((1U << DICT_TF2_BITS) - 1);
- table->name.m_name = mem_strdup(name);
- table->is_system_db = dict_mem_table_is_system(table->name.m_name);
- table->space = space;
- table->space_id = space ? space->id : ULINT_UNDEFINED;
- table->n_t_cols = static_cast<unsigned>(n_cols + DATA_N_SYS_COLS)
- & dict_index_t::MAX_N_FIELDS;
- table->n_v_cols = static_cast<unsigned>(n_v_cols)
- & dict_index_t::MAX_N_FIELDS;
- table->n_cols = static_cast<unsigned>(
- table->n_t_cols - table->n_v_cols)
- & dict_index_t::MAX_N_FIELDS;
-
- table->cols = static_cast<dict_col_t*>(
- mem_heap_alloc(heap, table->n_cols * sizeof(dict_col_t)));
- table->v_cols = static_cast<dict_v_col_t*>(
- mem_heap_alloc(heap, n_v_cols * sizeof(*table->v_cols)));
- for (ulint i = n_v_cols; i--; ) {
- new (&table->v_cols[i]) dict_v_col_t();
- }
-
- table->autoinc_lock = static_cast<ib_lock_t*>(
- mem_heap_alloc(heap, lock_get_size()));
-
- /* If the table has an FTS index or we are in the process
- of building one, create the table->fts */
- if (dict_table_has_fts_index(table)
- || DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS_HAS_DOC_ID)
- || DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS_ADD_DOC_ID)) {
- table->fts = fts_create(table);
- table->fts->cache = fts_cache_create(table);
- }
-
- new(&table->foreign_set) dict_foreign_set();
- new(&table->referenced_set) dict_foreign_set();
+ new (&table->foreign_set) dict_foreign_set();
+ new (&table->referenced_set) dict_foreign_set();
- return(table);
+ return table;
}
/****************************************************************//**
@@ -234,7 +222,6 @@ dict_mem_table_free(
table->referenced_set.~dict_foreign_set();
ut_free(table->name.m_name);
- table->name.m_name = NULL;
/* Clean up virtual index info structures that are registered
with virtual columns */
@@ -791,8 +778,8 @@ dict_mem_index_create(
index->rtr_track = new
(mem_heap_alloc(heap, sizeof *index->rtr_track))
rtr_info_track_t();
- mutex_create(LATCH_ID_RTR_ACTIVE_MUTEX,
- &index->rtr_track->rtr_active_mutex);
+ mysql_mutex_init(rtr_active_mutex_key,
+ &index->rtr_track->rtr_active_mutex, nullptr);
}
return(index);
@@ -834,7 +821,7 @@ dict_mem_foreign_table_name_lookup_set(
dict_foreign_t* foreign, /*!< in/out: foreign struct */
ibool do_alloc) /*!< in: is an alloc needed */
{
- if (innobase_get_lower_case_table_names() == 2) {
+ if (lower_case_table_names == 2) {
if (do_alloc) {
ulint len;
@@ -864,7 +851,7 @@ dict_mem_referenced_table_name_lookup_set(
dict_foreign_t* foreign, /*!< in/out: foreign struct */
ibool do_alloc) /*!< in: is an alloc needed */
{
- if (innobase_get_lower_case_table_names() == 2) {
+ if (lower_case_table_names == 2) {
if (do_alloc) {
ulint len;
@@ -1101,7 +1088,7 @@ dict_mem_index_free(
rtr_info->index = NULL;
}
- mutex_destroy(&index->rtr_track->rtr_active_mutex);
+ mysql_mutex_destroy(&index->rtr_track->rtr_active_mutex);
index->rtr_track->~rtr_info_track_t();
}
@@ -1126,7 +1113,7 @@ dict_mem_create_temporary_tablename(
ut_ad(dbend);
size_t dblen = size_t(dbend - dbtab) + 1;
- size = dblen + (sizeof(TEMP_FILE_PREFIX) + 3 + 20);
+ size = dblen + (sizeof(TEMP_FILE_PREFIX_INNODB) + 20);
name = static_cast<char*>(mem_heap_alloc(heap, size));
memcpy(name, dbtab, dblen);
snprintf(name + dblen, size - dblen,
diff --git a/storage/innobase/dict/dict0stats.cc b/storage/innobase/dict/dict0stats.cc
index ed8f9561a90..7bdccd899b8 100644
--- a/storage/innobase/dict/dict0stats.cc
+++ b/storage/innobase/dict/dict0stats.cc
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 2009, 2019, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2015, 2022, MariaDB Corporation.
+Copyright (c) 2015, 2023, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -25,19 +25,20 @@ Created Jan 06, 2010 Vasil Dimov
*******************************************************/
#include "dict0stats.h"
-#include "ut0ut.h"
-#include "ut0rnd.h"
#include "dyn0buf.h"
#include "row0sel.h"
#include "trx0trx.h"
+#include "lock0lock.h"
#include "pars0pars.h"
#include <mysql_com.h>
+#include "log.h"
#include "btr0btr.h"
-#include "sync0sync.h"
+#include "que0que.h"
#include <algorithm>
#include <map>
#include <vector>
+#include <thread>
/* Sampling algorithm description @{
@@ -141,6 +142,20 @@ typedef ut_allocator<std::pair<const char* const, dict_index_t*> >
typedef std::map<const char*, dict_index_t*, ut_strcmp_functor,
index_map_t_allocator> index_map_t;
+bool dict_table_t::is_stats_table() const
+{
+ return !strcmp(name.m_name, TABLE_STATS_NAME) ||
+ !strcmp(name.m_name, INDEX_STATS_NAME);
+}
+
+bool trx_t::has_stats_table_lock() const
+{
+ for (const lock_t *l : lock.table_locks)
+ if (l && l->un_member.tab_lock.table->is_stats_table())
+ return true;
+ return false;
+}
+
/*********************************************************************//**
Checks whether an index should be ignored in stats manipulations:
* stats fetch
@@ -153,93 +168,324 @@ dict_stats_should_ignore_index(
/*===========================*/
const dict_index_t* index) /*!< in: index */
{
- return((index->type & (DICT_FTS | DICT_SPATIAL))
- || index->is_corrupted()
- || index->to_be_dropped
- || !index->is_committed());
+ return !index->is_btree() || index->to_be_dropped || !index->is_committed();
+}
+
+
+/** expected column definition */
+struct dict_col_meta_t
+{
+ /** column name */
+ const char *name;
+ /** main type */
+ unsigned mtype;
+ /** prtype mask; all these bits have to be set in prtype */
+ unsigned prtype_mask;
+ /** column length in bytes */
+ unsigned len;
+};
+
+/** For checking whether a table exists and has a predefined schema */
+struct dict_table_schema_t
+{
+ /** table name */
+ span<const char> table_name;
+ /** table name in SQL */
+ const char *table_name_sql;
+ /** number of columns */
+ unsigned n_cols;
+ /** columns */
+ const dict_col_meta_t columns[8];
+};
+
+static const dict_table_schema_t table_stats_schema =
+{
+ {C_STRING_WITH_LEN(TABLE_STATS_NAME)}, TABLE_STATS_NAME_PRINT, 6,
+ {
+ {"database_name", DATA_VARMYSQL, DATA_NOT_NULL, 192},
+ {"table_name", DATA_VARMYSQL, DATA_NOT_NULL, 597},
+ {"last_update", DATA_INT, DATA_NOT_NULL | DATA_UNSIGNED, 4},
+ {"n_rows", DATA_INT, DATA_NOT_NULL | DATA_UNSIGNED, 8},
+ {"clustered_index_size", DATA_INT, DATA_NOT_NULL | DATA_UNSIGNED, 8},
+ {"sum_of_other_index_sizes", DATA_INT, DATA_NOT_NULL | DATA_UNSIGNED, 8},
+ }
+};
+
+static const dict_table_schema_t index_stats_schema =
+{
+ {C_STRING_WITH_LEN(INDEX_STATS_NAME)}, INDEX_STATS_NAME_PRINT, 8,
+ {
+ {"database_name", DATA_VARMYSQL, DATA_NOT_NULL, 192},
+ {"table_name", DATA_VARMYSQL, DATA_NOT_NULL, 597},
+ {"index_name", DATA_VARMYSQL, DATA_NOT_NULL, 192},
+ {"last_update", DATA_INT, DATA_NOT_NULL | DATA_UNSIGNED, 4},
+ {"stat_name", DATA_VARMYSQL, DATA_NOT_NULL, 64*3},
+ {"stat_value", DATA_INT, DATA_NOT_NULL | DATA_UNSIGNED, 8},
+ {"sample_size", DATA_INT, DATA_UNSIGNED, 8},
+ {"stat_description", DATA_VARMYSQL, DATA_NOT_NULL, 1024*3}
+ }
+};
+
+/** Construct the type's SQL name (e.g. BIGINT UNSIGNED)
+@param mtype InnoDB main type
+@param prtype InnoDB precise type
+@param len length of the column
+@param name the SQL name
+@param name_sz size of the name buffer
+@return number of bytes written (excluding the terminating NUL byte) */
+static int dtype_sql_name(unsigned mtype, unsigned prtype, unsigned len,
+ char *name, size_t name_sz)
+{
+ const char *Unsigned= "";
+ const char *Main= "UNKNOWN";
+
+ switch (mtype) {
+ case DATA_INT:
+ switch (len) {
+ case 1:
+ Main= "TINYINT";
+ break;
+ case 2:
+ Main= "SMALLINT";
+ break;
+ case 3:
+ Main= "MEDIUMINT";
+ break;
+ case 4:
+ Main= "INT";
+ break;
+ case 8:
+ Main= "BIGINT";
+ break;
+ }
+
+ append_unsigned:
+ if (prtype & DATA_UNSIGNED)
+ Unsigned= " UNSIGNED";
+ len= 0;
+ break;
+ case DATA_FLOAT:
+ Main= "FLOAT";
+ goto append_unsigned;
+ case DATA_DOUBLE:
+ Main= "DOUBLE";
+ goto append_unsigned;
+ case DATA_FIXBINARY:
+ Main= "BINARY";
+ break;
+ case DATA_CHAR:
+ case DATA_MYSQL:
+ Main= "CHAR";
+ break;
+ case DATA_VARCHAR:
+ case DATA_VARMYSQL:
+ Main= "VARCHAR";
+ break;
+ case DATA_BINARY:
+ Main= "VARBINARY";
+ break;
+ case DATA_GEOMETRY:
+ Main= "GEOMETRY";
+ len= 0;
+ break;
+ case DATA_BLOB:
+ switch (len) {
+ case 9:
+ Main= "TINYBLOB";
+ break;
+ case 10:
+ Main= "BLOB";
+ break;
+ case 11:
+ Main= "MEDIUMBLOB";
+ break;
+ case 12:
+ Main= "LONGBLOB";
+ break;
+ }
+ len= 0;
+ }
+
+ const char* Not_null= (prtype & DATA_NOT_NULL) ? " NOT NULL" : "";
+ if (len)
+ return snprintf(name, name_sz, "%s(%u)%s%s", Main, len, Unsigned,
+ Not_null);
+ else
+ return snprintf(name, name_sz, "%s%s%s", Main, Unsigned, Not_null);
+}
+
+static bool innodb_table_stats_not_found;
+static bool innodb_index_stats_not_found;
+static bool innodb_table_stats_not_found_reported;
+static bool innodb_index_stats_not_found_reported;
+
+/*********************************************************************//**
+Checks whether a table exists and whether it has the given structure.
+The table must have the same number of columns with the same names and
+types. The order of the columns does not matter.
+dict_table_schema_check() @{
+@return DB_SUCCESS if the table exists and contains the necessary columns */
+static
+dberr_t
+dict_table_schema_check(
+/*====================*/
+ const dict_table_schema_t* req_schema, /*!< in: required table
+ schema */
+ char* errstr, /*!< out: human readable error
+ message if != DB_SUCCESS is
+ returned */
+ size_t errstr_sz) /*!< in: errstr size */
+{
+ const dict_table_t* table= dict_sys.load_table(req_schema->table_name);
+
+ if (!table) {
+ if (opt_bootstrap)
+ return DB_TABLE_NOT_FOUND;
+ if (req_schema == &table_stats_schema) {
+ if (innodb_table_stats_not_found_reported) {
+ return DB_STATS_DO_NOT_EXIST;
+ }
+ innodb_table_stats_not_found = true;
+ innodb_table_stats_not_found_reported = true;
+ } else {
+ ut_ad(req_schema == &index_stats_schema);
+ if (innodb_index_stats_not_found_reported) {
+ return DB_STATS_DO_NOT_EXIST;
+ }
+ innodb_index_stats_not_found = true;
+ innodb_index_stats_not_found_reported = true;
+ }
+
+ snprintf(errstr, errstr_sz, "Table %s not found.",
+ req_schema->table_name_sql);
+ return DB_TABLE_NOT_FOUND;
+ }
+
+ if (!table->is_readable() && !table->space) {
+ /* missing tablespace */
+ snprintf(errstr, errstr_sz,
+ "Tablespace for table %s is missing.",
+ req_schema->table_name_sql);
+ return DB_TABLE_NOT_FOUND;
+ }
+
+ if (unsigned(table->n_def - DATA_N_SYS_COLS) != req_schema->n_cols) {
+ /* the table has a different number of columns than required */
+ snprintf(errstr, errstr_sz,
+ "%s has %d columns but should have %u.",
+ req_schema->table_name_sql,
+ table->n_def - DATA_N_SYS_COLS,
+ req_schema->n_cols);
+ return DB_ERROR;
+ }
+
+ /* For each column from req_schema->columns[] search
+ whether it is present in table->cols[].
+ The following algorithm is O(n_cols^2), but is optimized to
+ be O(n_cols) if the columns are in the same order in both arrays. */
+
+ for (unsigned i = 0; i < req_schema->n_cols; i++) {
+ ulint j = dict_table_has_column(
+ table, req_schema->columns[i].name, i);
+
+ if (j == table->n_def) {
+ snprintf(errstr, errstr_sz,
+ "required column %s"
+ " not found in table %s.",
+ req_schema->columns[i].name,
+ req_schema->table_name_sql);
+
+ return(DB_ERROR);
+ }
+
+ /* we found a column with the same name on j'th position,
+ compare column types and flags */
+
+ /* check length for exact match */
+ if (req_schema->columns[i].len != table->cols[j].len) {
+ sql_print_warning("InnoDB: Table %s has"
+ " length mismatch in the"
+ " column name %s."
+ " Please run mariadb-upgrade",
+ req_schema->table_name_sql,
+ req_schema->columns[i].name);
+ }
+
+ /*
+ check mtype for exact match.
+ This check is relaxed to allow use to use TIMESTAMP
+ (ie INT) for last_update instead of DATA_BINARY.
+ We have to test for both values as the innodb_table_stats
+ table may come from MySQL and have the old type.
+ */
+ if (req_schema->columns[i].mtype != table->cols[j].mtype &&
+ !(req_schema->columns[i].mtype == DATA_INT &&
+ table->cols[j].mtype == DATA_FIXBINARY)) {
+ } else if ((~table->cols[j].prtype
+ & req_schema->columns[i].prtype_mask)) {
+ } else {
+ continue;
+ }
+
+ int s = snprintf(errstr, errstr_sz,
+ "Column %s in table %s is ",
+ req_schema->columns[i].name,
+ req_schema->table_name_sql);
+ if (s < 0 || static_cast<size_t>(s) >= errstr_sz) {
+ return DB_ERROR;
+ }
+ errstr += s;
+ errstr_sz -= s;
+ s = dtype_sql_name(table->cols[j].mtype, table->cols[j].prtype,
+ table->cols[j].len, errstr, errstr_sz);
+ if (s < 0 || static_cast<size_t>(s) + sizeof " but should be "
+ >= errstr_sz) {
+ return DB_ERROR;
+ }
+ errstr += s;
+ memcpy(errstr, " but should be ", sizeof " but should be ");
+ errstr += (sizeof " but should be ") - 1;
+ errstr_sz -= s + (sizeof " but should be ") - 1;
+ s = dtype_sql_name(req_schema->columns[i].mtype,
+ req_schema->columns[i].prtype_mask,
+ req_schema->columns[i].len,
+ errstr, errstr_sz);
+ return DB_ERROR;
+ }
+
+ if (size_t n_foreign = table->foreign_set.size()) {
+ snprintf(errstr, errstr_sz,
+ "Table %s has %zu foreign key(s) pointing"
+ " to other tables, but it must have 0.",
+ req_schema->table_name_sql, n_foreign);
+ return DB_ERROR;
+ }
+
+ if (size_t n_referenced = table->referenced_set.size()) {
+ snprintf(errstr, errstr_sz,
+ "There are %zu foreign key(s) pointing to %s, "
+ "but there must be 0.", n_referenced,
+ req_schema->table_name_sql);
+ return DB_ERROR;
+ }
+
+ return DB_SUCCESS;
}
/*********************************************************************//**
Checks whether the persistent statistics storage exists and that all
tables have the proper structure.
@return true if exists and all tables are ok */
-static
-bool
-dict_stats_persistent_storage_check(
-/*================================*/
- bool caller_has_dict_sys_mutex) /*!< in: true if the caller
- owns dict_sys.mutex */
+static bool dict_stats_persistent_storage_check(bool dict_already_locked)
{
- /* definition for the table TABLE_STATS_NAME */
- dict_col_meta_t table_stats_columns[] = {
- {"database_name", DATA_VARMYSQL,
- DATA_NOT_NULL, 192},
-
- {"table_name", DATA_VARMYSQL,
- DATA_NOT_NULL, 597},
-
- {"last_update", DATA_INT,
- DATA_NOT_NULL | DATA_UNSIGNED, 4},
-
- {"n_rows", DATA_INT,
- DATA_NOT_NULL | DATA_UNSIGNED, 8},
-
- {"clustered_index_size", DATA_INT,
- DATA_NOT_NULL | DATA_UNSIGNED, 8},
-
- {"sum_of_other_index_sizes", DATA_INT,
- DATA_NOT_NULL | DATA_UNSIGNED, 8}
- };
- dict_table_schema_t table_stats_schema = {
- TABLE_STATS_NAME,
- UT_ARR_SIZE(table_stats_columns),
- table_stats_columns,
- 0 /* n_foreign */,
- 0 /* n_referenced */
- };
-
- /* definition for the table INDEX_STATS_NAME */
- dict_col_meta_t index_stats_columns[] = {
- {"database_name", DATA_VARMYSQL,
- DATA_NOT_NULL, 192},
-
- {"table_name", DATA_VARMYSQL,
- DATA_NOT_NULL, 597},
-
- {"index_name", DATA_VARMYSQL,
- DATA_NOT_NULL, 192},
-
- {"last_update", DATA_INT,
- DATA_NOT_NULL | DATA_UNSIGNED, 4},
-
- {"stat_name", DATA_VARMYSQL,
- DATA_NOT_NULL, 64*3},
-
- {"stat_value", DATA_INT,
- DATA_NOT_NULL | DATA_UNSIGNED, 8},
-
- {"sample_size", DATA_INT,
- DATA_UNSIGNED, 8},
-
- {"stat_description", DATA_VARMYSQL,
- DATA_NOT_NULL, 1024*3}
- };
- dict_table_schema_t index_stats_schema = {
- INDEX_STATS_NAME,
- UT_ARR_SIZE(index_stats_columns),
- index_stats_columns,
- 0 /* n_foreign */,
- 0 /* n_referenced */
- };
-
char errstr[512];
dberr_t ret;
- if (!caller_has_dict_sys_mutex) {
- mutex_enter(&dict_sys.mutex);
+ if (!dict_already_locked) {
+ dict_sys.lock(SRW_LOCK_CALL);
}
- ut_ad(mutex_own(&dict_sys.mutex));
+ ut_ad(dict_sys.locked());
/* first check table_stats */
ret = dict_table_schema_check(&table_stats_schema, errstr,
@@ -250,8 +496,8 @@ dict_stats_persistent_storage_check(
sizeof(errstr));
}
- if (!caller_has_dict_sys_mutex) {
- mutex_exit(&dict_sys.mutex);
+ if (!dict_already_locked) {
+ dict_sys.unlock();
}
if (ret != DB_SUCCESS && ret != DB_STATS_DO_NOT_EXIST) {
@@ -270,66 +516,20 @@ This function will free the pinfo object.
@param[in,out] pinfo pinfo to pass to que_eval_sql() must already
have any literals bound to it
@param[in] sql SQL string to execute
-@param[in,out] trx in case of NULL the function will allocate and
-free the trx object. If it is not NULL then it will be rolled back
-only in the case of error, but not freed.
+@param[in,out] trx transaction
@return DB_SUCCESS or error code */
static
-dberr_t
-dict_stats_exec_sql(
- pars_info_t* pinfo,
- const char* sql,
- trx_t* trx)
+dberr_t dict_stats_exec_sql(pars_info_t *pinfo, const char* sql, trx_t *trx)
{
- dberr_t err;
- bool trx_started = false;
+ ut_ad(dict_sys.locked());
- ut_d(dict_sys.assert_locked());
-
- if (!dict_stats_persistent_storage_check(true)) {
- pars_info_free(pinfo);
- return(DB_STATS_DO_NOT_EXIST);
- }
-
- if (trx == NULL) {
- trx = trx_create();
- trx_started = true;
-
- if (srv_read_only_mode) {
- trx_start_internal_read_only(trx);
- } else {
- trx_start_internal(trx);
- }
- }
-
- err = que_eval_sql(pinfo, sql, FALSE, trx); /* pinfo is freed here */
-
- DBUG_EXECUTE_IF("stats_index_error",
- if (!trx_started) {
- err = DB_STATS_DO_NOT_EXIST;
- trx->error_state = DB_STATS_DO_NOT_EXIST;
- });
-
- if (!trx_started && err == DB_SUCCESS) {
- return(DB_SUCCESS);
- }
-
- if (err == DB_SUCCESS) {
- trx_commit_for_mysql(trx);
- } else {
- trx->op_info = "rollback of internal trx on stats tables";
- trx->dict_operation_lock_mode = RW_X_LATCH;
- trx->rollback();
- trx->dict_operation_lock_mode = 0;
- trx->op_info = "";
- ut_a(trx->error_state == DB_SUCCESS);
- }
-
- if (trx_started) {
- trx->free();
- }
+ if (!dict_stats_persistent_storage_check(true))
+ {
+ pars_info_free(pinfo);
+ return DB_STATS_DO_NOT_EXIST;
+ }
- return(err);
+ return que_eval_sql(pinfo, sql, trx);
}
/*********************************************************************//**
@@ -407,7 +607,9 @@ dict_stats_table_clone_create(
dict_table_t* t;
- t = (dict_table_t*) mem_heap_alloc(heap, sizeof(*t));
+ t = (dict_table_t*) mem_heap_zalloc(heap, sizeof(*t));
+
+ t->stats_mutex_init();
MEM_CHECK_DEFINED(&table->id, sizeof(table->id));
t->id = table->id;
@@ -415,6 +617,7 @@ dict_stats_table_clone_create(
t->heap = heap;
t->name.m_name = mem_heap_strdup(heap, table->name.m_name);
+ t->mdl_name.m_name = t->name.m_name;
t->corrupted = table->corrupted;
@@ -435,7 +638,7 @@ dict_stats_table_clone_create(
dict_index_t* idx;
- idx = (dict_index_t*) mem_heap_alloc(heap, sizeof(*idx));
+ idx = (dict_index_t*) mem_heap_zalloc(heap, sizeof(*idx));
MEM_CHECK_DEFINED(&index->id, sizeof(index->id));
idx->id = index->id;
@@ -453,7 +656,7 @@ dict_stats_table_clone_create(
idx->n_uniq = index->n_uniq;
- idx->fields = (dict_field_t*) mem_heap_alloc(
+ idx->fields = (dict_field_t*) mem_heap_zalloc(
heap, idx->n_uniq * sizeof(idx->fields[0]));
for (ulint i = 0; i < idx->n_uniq; i++) {
@@ -464,15 +667,15 @@ dict_stats_table_clone_create(
/* hook idx into t->indexes */
UT_LIST_ADD_LAST(t->indexes, idx);
- idx->stat_n_diff_key_vals = (ib_uint64_t*) mem_heap_alloc(
+ idx->stat_n_diff_key_vals = (ib_uint64_t*) mem_heap_zalloc(
heap,
idx->n_uniq * sizeof(idx->stat_n_diff_key_vals[0]));
- idx->stat_n_sample_sizes = (ib_uint64_t*) mem_heap_alloc(
+ idx->stat_n_sample_sizes = (ib_uint64_t*) mem_heap_zalloc(
heap,
idx->n_uniq * sizeof(idx->stat_n_sample_sizes[0]));
- idx->stat_n_non_null_key_vals = (ib_uint64_t*) mem_heap_alloc(
+ idx->stat_n_non_null_key_vals = (ib_uint64_t*) mem_heap_zalloc(
heap,
idx->n_uniq * sizeof(idx->stat_n_non_null_key_vals[0]));
ut_d(idx->magic_n = DICT_INDEX_MAGIC_N);
@@ -495,6 +698,7 @@ dict_stats_table_clone_free(
/*========================*/
dict_table_t* t) /*!< in: dummy table object to free */
{
+ t->stats_mutex_destroy();
mem_heap_free(t->heap);
}
@@ -511,7 +715,7 @@ dict_stats_empty_index(
{
ut_ad(!(index->type & DICT_FTS));
ut_ad(!dict_index_is_ibuf(index));
- ut_ad(mutex_own(&dict_sys.mutex));
+ ut_ad(index->table->stats_mutex_is_owner());
ulint n_uniq = index->n_uniq;
@@ -541,7 +745,9 @@ dict_stats_empty_table(
bool empty_defrag_stats)
/*!< in: whether to empty defrag stats */
{
- mutex_enter(&dict_sys.mutex);
+ /* Initialize table/index level stats is now protected by
+ table level lock_mutex.*/
+ table->stats_mutex_lock();
/* Zero the stats members */
table->stat_n_rows = 0;
@@ -567,7 +773,7 @@ dict_stats_empty_table(
}
table->stat_initialized = TRUE;
- mutex_exit(&dict_sys.mutex);
+ table->stats_mutex_unlock();
}
/*********************************************************************//**
@@ -633,9 +839,6 @@ dict_stats_assert_initialized(
MEM_CHECK_DEFINED(&table->stat_modified_counter,
sizeof table->stat_modified_counter);
- MEM_CHECK_DEFINED(&table->stats_bg_flag,
- sizeof table->stats_bg_flag);
-
for (dict_index_t* index = dict_table_get_first_index(table);
index != NULL;
index = dict_table_get_next_index(index)) {
@@ -666,7 +869,8 @@ dict_stats_copy(
to have the same statistics as if
the table was empty */
{
- ut_ad(mutex_own(&dict_sys.mutex));
+ ut_ad(src->stats_mutex_is_owner());
+ ut_ad(dst->stats_mutex_is_owner());
dst->stats_last_recalc = src->stats_last_recalc;
dst->stat_n_rows = src->stat_n_rows;
@@ -783,7 +987,7 @@ dict_table_t*
dict_stats_snapshot_create(
dict_table_t* table)
{
- mutex_enter(&dict_sys.mutex);
+ dict_sys.lock(SRW_LOCK_CALL);
dict_stats_assert_initialized(table);
@@ -791,14 +995,19 @@ dict_stats_snapshot_create(
t = dict_stats_table_clone_create(table);
+ table->stats_mutex_lock();
+ ut_d(t->stats_mutex_lock());
+
dict_stats_copy(t, table, false);
+ ut_d(t->stats_mutex_unlock());
+ table->stats_mutex_unlock();
+
t->stat_persistent = table->stat_persistent;
t->stats_auto_recalc = table->stats_auto_recalc;
t->stats_sample_pages = table->stats_sample_pages;
- t->stats_bg_flag = table->stats_bg_flag;
- mutex_exit(&dict_sys.mutex);
+ dict_sys.unlock();
return(t);
}
@@ -815,19 +1024,430 @@ dict_stats_snapshot_free(
dict_stats_table_clone_free(t);
}
+/** Statistics for one field of an index. */
+struct index_field_stats_t
+{
+ ib_uint64_t n_diff_key_vals;
+ ib_uint64_t n_sample_sizes;
+ ib_uint64_t n_non_null_key_vals;
+
+ index_field_stats_t(ib_uint64_t n_diff_key_vals= 0,
+ ib_uint64_t n_sample_sizes= 0,
+ ib_uint64_t n_non_null_key_vals= 0)
+ : n_diff_key_vals(n_diff_key_vals), n_sample_sizes(n_sample_sizes),
+ n_non_null_key_vals(n_non_null_key_vals)
+ {
+ }
+
+ bool is_bulk_operation() const
+ {
+ return n_diff_key_vals == UINT64_MAX &&
+ n_sample_sizes == UINT64_MAX && n_non_null_key_vals == UINT64_MAX;
+ }
+};
+
+/*******************************************************************//**
+Record the number of non_null key values in a given index for
+each n-column prefix of the index where 1 <= n <= dict_index_get_n_unique(index).
+The estimates are eventually stored in the array:
+index->stat_n_non_null_key_vals[], which is indexed from 0 to n-1. */
+static
+void
+btr_record_not_null_field_in_rec(
+/*=============================*/
+ ulint n_unique, /*!< in: dict_index_get_n_unique(index),
+ number of columns uniquely determine
+ an index entry */
+ const rec_offs* offsets, /*!< in: rec_get_offsets(rec, index),
+ its size could be for all fields or
+ that of "n_unique" */
+ ib_uint64_t* n_not_null) /*!< in/out: array to record number of
+ not null rows for n-column prefix */
+{
+ ulint i;
+
+ ut_ad(rec_offs_n_fields(offsets) >= n_unique);
+
+ if (n_not_null == NULL) {
+ return;
+ }
+
+ for (i = 0; i < n_unique; i++) {
+ if (rec_offs_nth_sql_null(offsets, i)) {
+ break;
+ }
+
+ n_not_null[i]++;
+ }
+}
+
+inline dberr_t
+btr_cur_t::open_random_leaf(rec_offs *&offsets, mem_heap_t *&heap, mtr_t &mtr)
+{
+ ut_ad(!index()->is_spatial());
+ ut_ad(!mtr.get_savepoint());
+
+ mtr_s_lock_index(index(), &mtr);
+
+ if (index()->page == FIL_NULL)
+ return DB_CORRUPTION;
+
+ dberr_t err;
+ auto offset= index()->page;
+ bool merge= false;
+ ulint height= ULINT_UNDEFINED;
+
+ while (buf_block_t *block=
+ btr_block_get(*index(), offset, RW_S_LATCH, merge, &mtr, &err))
+ {
+ page_cur.block= block;
+
+ if (height == ULINT_UNDEFINED)
+ {
+ height= btr_page_get_level(block->page.frame);
+ if (height > BTR_MAX_LEVELS)
+ return DB_CORRUPTION;
+
+ if (height == 0)
+ goto got_leaf;
+ }
+
+ if (height == 0)
+ {
+ mtr.rollback_to_savepoint(0, mtr.get_savepoint() - 1);
+ got_leaf:
+ page_cur.rec= page_get_infimum_rec(block->page.frame);
+ return DB_SUCCESS;
+ }
+
+ if (!--height)
+ merge= !index()->is_clust();
+
+ page_cur_open_on_rnd_user_rec(&page_cur);
+
+ offsets= rec_get_offsets(page_cur.rec, page_cur.index, offsets, 0,
+ ULINT_UNDEFINED, &heap);
+
+ /* Go to the child node */
+ offset= btr_node_ptr_get_child_page_no(page_cur.rec, offsets);
+ }
+
+ return err;
+}
+
+/** Estimated table level stats from sampled value.
+@param value sampled stats
+@param index index being sampled
+@param sample number of sampled rows
+@param ext_size external stored data size
+@param not_empty table not empty
+@return estimated table wide stats from sampled value */
+#define BTR_TABLE_STATS_FROM_SAMPLE(value, index, sample, ext_size, not_empty) \
+ (((value) * static_cast<ib_uint64_t>(index->stat_n_leaf_pages) \
+ + (sample) - 1 + (ext_size) + (not_empty)) / ((sample) + (ext_size)))
+
+/** Estimates the number of different key values in a given index, for
+each n-column prefix of the index where 1 <= n <= dict_index_get_n_unique(index).
+The estimates are stored in the array index->stat_n_diff_key_vals[] (indexed
+0..n_uniq-1) and the number of pages that were sampled is saved in
+result.n_sample_sizes[].
+If innodb_stats_method is nulls_ignored, we also record the number of
+non-null values for each prefix and stored the estimates in
+array result.n_non_null_key_vals.
+@param index B-tree index
+@param bulk_trx_id the value of index->table->bulk_trx_id at the start
+@return vector with statistics information
+empty vector if the index is unavailable. */
+static
+std::vector<index_field_stats_t>
+btr_estimate_number_of_different_key_vals(dict_index_t* index,
+ trx_id_t bulk_trx_id)
+{
+ page_t* page;
+ rec_t* rec;
+ ulint n_cols;
+ ib_uint64_t* n_diff;
+ ib_uint64_t* n_not_null;
+ ibool stats_null_not_equal;
+ uintmax_t n_sample_pages=1; /* number of pages to sample */
+ ulint not_empty_flag = 0;
+ ulint total_external_size = 0;
+ uintmax_t add_on;
+ mtr_t mtr;
+ mem_heap_t* heap = NULL;
+ rec_offs* offsets_rec = NULL;
+ rec_offs* offsets_next_rec = NULL;
+
+ std::vector<index_field_stats_t> result;
+
+ ut_ad(!index->is_spatial());
+
+ n_cols = dict_index_get_n_unique(index);
+
+ heap = mem_heap_create((sizeof *n_diff + sizeof *n_not_null)
+ * n_cols
+ + dict_index_get_n_fields(index)
+ * (sizeof *offsets_rec
+ + sizeof *offsets_next_rec));
+
+ n_diff = (ib_uint64_t*) mem_heap_zalloc(
+ heap, n_cols * sizeof(n_diff[0]));
+
+ n_not_null = NULL;
+
+ /* Check srv_innodb_stats_method setting, and decide whether we
+ need to record non-null value and also decide if NULL is
+ considered equal (by setting stats_null_not_equal value) */
+ switch (srv_innodb_stats_method) {
+ case SRV_STATS_NULLS_IGNORED:
+ n_not_null = (ib_uint64_t*) mem_heap_zalloc(
+ heap, n_cols * sizeof *n_not_null);
+ /* fall through */
+
+ case SRV_STATS_NULLS_UNEQUAL:
+ /* for both SRV_STATS_NULLS_IGNORED and SRV_STATS_NULLS_UNEQUAL
+ case, we will treat NULLs as unequal value */
+ stats_null_not_equal = TRUE;
+ break;
+
+ case SRV_STATS_NULLS_EQUAL:
+ stats_null_not_equal = FALSE;
+ break;
+
+ default:
+ ut_error;
+ }
+
+ if (srv_stats_sample_traditional) {
+ /* It makes no sense to test more pages than are contained
+ in the index, thus we lower the number if it is too high */
+ if (srv_stats_transient_sample_pages > index->stat_index_size) {
+ if (index->stat_index_size > 0) {
+ n_sample_pages = index->stat_index_size;
+ }
+ } else {
+ n_sample_pages = srv_stats_transient_sample_pages;
+ }
+ } else {
+ /* New logaritmic number of pages that are estimated.
+ Number of pages estimated should be between 1 and
+ index->stat_index_size.
+
+ If we have only 0 or 1 index pages then we can only take 1
+ sample. We have already initialized n_sample_pages to 1.
+
+ So taking index size as I and sample as S and log(I)*S as L
+
+ requirement 1) we want the out limit of the expression to not exceed I;
+ requirement 2) we want the ideal pages to be at least S;
+ so the current expression is min(I, max( min(S,I), L)
+
+ looking for simplifications:
+
+ case 1: assume S < I
+ min(I, max( min(S,I), L) -> min(I , max( S, L))
+
+ but since L=LOG2(I)*S and log2(I) >=1 L>S always so max(S,L) = L.
+
+ so we have: min(I , L)
+
+ case 2: assume I < S
+ min(I, max( min(S,I), L) -> min(I, max( I, L))
+
+ case 2a: L > I
+ min(I, max( I, L)) -> min(I, L) -> I
+
+ case 2b: when L < I
+ min(I, max( I, L)) -> min(I, I ) -> I
+
+ so taking all case2 paths is I, our expression is:
+ n_pages = S < I? min(I,L) : I
+ */
+ if (index->stat_index_size > 1) {
+ n_sample_pages = (srv_stats_transient_sample_pages < index->stat_index_size)
+ ? ut_min(index->stat_index_size,
+ static_cast<ulint>(
+ log2(double(index->stat_index_size))
+ * double(srv_stats_transient_sample_pages)))
+ : index->stat_index_size;
+ }
+ }
+
+ /* Sanity check */
+ ut_ad(n_sample_pages > 0 && n_sample_pages <= (index->stat_index_size <= 1 ? 1 : index->stat_index_size));
+
+ /* We sample some pages in the index to get an estimate */
+ btr_cur_t cursor;
+ cursor.page_cur.index = index;
+
+ for (ulint i = 0; i < n_sample_pages; i++) {
+ mtr.start();
+
+ if (cursor.open_random_leaf(offsets_rec, heap, mtr) !=
+ DB_SUCCESS
+ || index->table->bulk_trx_id != bulk_trx_id) {
+ mtr.commit();
+ goto exit_loop;
+ }
+
+ /* Count the number of different key values for each prefix of
+ the key on this index page. If the prefix does not determine
+ the index record uniquely in the B-tree, then we subtract one
+ because otherwise our algorithm would give a wrong estimate
+ for an index where there is just one key value. */
+
+ page = btr_cur_get_page(&cursor);
+
+ rec = page_rec_get_next(cursor.page_cur.rec);
+ const ulint n_core = index->n_core_fields;
+
+ if (rec && !page_rec_is_supremum(rec)) {
+ not_empty_flag = 1;
+ offsets_rec = rec_get_offsets(rec, index, offsets_rec,
+ n_core,
+ ULINT_UNDEFINED, &heap);
+
+ if (n_not_null != NULL) {
+ btr_record_not_null_field_in_rec(
+ n_cols, offsets_rec, n_not_null);
+ }
+ }
+
+ while (!page_rec_is_supremum(rec)) {
+ ulint matched_fields;
+ rec_t* next_rec = page_rec_get_next(rec);
+ if (!next_rec || page_rec_is_supremum(next_rec)) {
+ total_external_size +=
+ btr_rec_get_externally_stored_len(
+ rec, offsets_rec);
+ break;
+ }
+
+ offsets_next_rec = rec_get_offsets(next_rec, index,
+ offsets_next_rec,
+ n_core,
+ ULINT_UNDEFINED,
+ &heap);
+
+ cmp_rec_rec(rec, next_rec,
+ offsets_rec, offsets_next_rec,
+ index, stats_null_not_equal,
+ &matched_fields);
+
+ for (ulint j = matched_fields; j < n_cols; j++) {
+ /* We add one if this index record has
+ a different prefix from the previous */
+
+ n_diff[j]++;
+ }
+
+ if (n_not_null != NULL) {
+ btr_record_not_null_field_in_rec(
+ n_cols, offsets_next_rec, n_not_null);
+ }
+
+ total_external_size
+ += btr_rec_get_externally_stored_len(
+ rec, offsets_rec);
+
+ rec = next_rec;
+ /* Initialize offsets_rec for the next round
+ and assign the old offsets_rec buffer to
+ offsets_next_rec. */
+ {
+ rec_offs* offsets_tmp = offsets_rec;
+ offsets_rec = offsets_next_rec;
+ offsets_next_rec = offsets_tmp;
+ }
+ }
+
+ if (n_cols == dict_index_get_n_unique_in_tree(index)
+ && page_has_siblings(page)) {
+
+ /* If there is more than one leaf page in the tree,
+ we add one because we know that the first record
+ on the page certainly had a different prefix than the
+ last record on the previous index page in the
+ alphabetical order. Before this fix, if there was
+ just one big record on each clustered index page, the
+ algorithm grossly underestimated the number of rows
+ in the table. */
+
+ n_diff[n_cols - 1]++;
+ }
+
+ mtr.commit();
+ }
+
+exit_loop:
+ /* If we saw k borders between different key values on
+ n_sample_pages leaf pages, we can estimate how many
+ there will be in index->stat_n_leaf_pages */
+
+ /* We must take into account that our sample actually represents
+ also the pages used for external storage of fields (those pages are
+ included in index->stat_n_leaf_pages) */
+
+ result.reserve(n_cols);
+
+ for (ulint j = 0; j < n_cols; j++) {
+ index_field_stats_t stat;
+
+ stat.n_diff_key_vals
+ = BTR_TABLE_STATS_FROM_SAMPLE(
+ n_diff[j], index, n_sample_pages,
+ total_external_size, not_empty_flag);
+
+ /* If the tree is small, smaller than
+ 10 * n_sample_pages + total_external_size, then
+ the above estimate is ok. For bigger trees it is common that we
+ do not see any borders between key values in the few pages
+ we pick. But still there may be n_sample_pages
+ different key values, or even more. Let us try to approximate
+ that: */
+
+ add_on = index->stat_n_leaf_pages
+ / (10 * (n_sample_pages
+ + total_external_size));
+
+ if (add_on > n_sample_pages) {
+ add_on = n_sample_pages;
+ }
+
+ stat.n_diff_key_vals += add_on;
+
+ stat.n_sample_sizes = n_sample_pages;
+
+ if (n_not_null != NULL) {
+ stat.n_non_null_key_vals =
+ BTR_TABLE_STATS_FROM_SAMPLE(
+ n_not_null[j], index, n_sample_pages,
+ total_external_size, not_empty_flag);
+ }
+
+ result.push_back(stat);
+ }
+
+ mem_heap_free(heap);
+ return result;
+}
+
/*********************************************************************//**
Calculates new estimates for index statistics. This function is
relatively quick and is used to calculate transient statistics that
are not saved on disk. This was the only way to calculate statistics
before the Persistent Statistics feature was introduced.
This function doesn't update the defragmentation related stats.
-Only persistent statistics supports defragmentation stats. */
+Only persistent statistics supports defragmentation stats.
+@return error code
+@retval DB_SUCCESS_LOCKED_REC if the table under bulk insert operation */
static
-void
+dberr_t
dict_stats_update_transient_for_index(
/*==================================*/
dict_index_t* index) /*!< in/out: index */
{
+ dberr_t err = DB_SUCCESS;
if (srv_force_recovery >= SRV_FORCE_NO_TRX_UNDO
&& (srv_force_recovery >= SRV_FORCE_NO_LOG_REDO
|| !dict_index_is_clust(index))) {
@@ -837,55 +1457,65 @@ dict_stats_update_transient_for_index(
Initialize some bogus index cardinality
statistics, so that the data can be queried in
various means, also via secondary indexes. */
- mutex_enter(&dict_sys.mutex);
+dummy_empty:
+ index->table->stats_mutex_lock();
dict_stats_empty_index(index, false);
- mutex_exit(&dict_sys.mutex);
+ index->table->stats_mutex_unlock();
+ return err;
#if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG
} else if (ibuf_debug && !dict_index_is_clust(index)) {
- mutex_enter(&dict_sys.mutex);
- dict_stats_empty_index(index, false);
- mutex_exit(&dict_sys.mutex);
+ goto dummy_empty;
#endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */
+ } else if (dict_index_is_online_ddl(index) || !index->is_committed()
+ || !index->table->space) {
+ goto dummy_empty;
} else {
mtr_t mtr;
- ulint size;
mtr.start();
mtr_sx_lock_index(index, &mtr);
- size = btr_get_size(index, BTR_TOTAL_SIZE, &mtr);
- if (size != ULINT_UNDEFINED) {
- index->stat_index_size = size;
+ dberr_t err;
+ buf_block_t* root = btr_root_block_get(index, RW_SX_LATCH,
+ &mtr, &err);
+ if (!root) {
+invalid:
+ mtr.commit();
+ goto dummy_empty;
+ }
- size = btr_get_size(
- index, BTR_N_LEAF_PAGES, &mtr);
+ const auto bulk_trx_id = index->table->bulk_trx_id;
+ if (bulk_trx_id && trx_sys.find(nullptr, bulk_trx_id, false)) {
+ err= DB_SUCCESS_LOCKED_REC;
+ goto invalid;
}
- mtr.commit();
+ mtr.x_lock_space(index->table->space);
- switch (size) {
- case ULINT_UNDEFINED:
- mutex_enter(&dict_sys.mutex);
- dict_stats_empty_index(index, false);
- mutex_exit(&dict_sys.mutex);
- return;
- case 0:
- /* The root node of the tree is a leaf */
- size = 1;
- }
+ ulint dummy, size;
+ index->stat_index_size
+ = fseg_n_reserved_pages(*root, PAGE_HEADER
+ + PAGE_BTR_SEG_LEAF
+ + root->page.frame, &size,
+ &mtr)
+ + fseg_n_reserved_pages(*root, PAGE_HEADER
+ + PAGE_BTR_SEG_TOP
+ + root->page.frame, &dummy,
+ &mtr);
- index->stat_n_leaf_pages = size;
+ mtr.commit();
+
+ index->stat_n_leaf_pages = size ? size : 1;
/* Do not continue if table decryption has failed or
table is already marked as corrupted. */
if (index->is_readable()) {
std::vector<index_field_stats_t> stats
= btr_estimate_number_of_different_key_vals(
- index);
+ index, bulk_trx_id);
if (!stats.empty()) {
- ut_ad(!mutex_own(&dict_sys.mutex));
- mutex_enter(&dict_sys.mutex);
+ index->table->stats_mutex_lock();
for (size_t i = 0; i < stats.size(); ++i) {
index->stat_n_diff_key_vals[i]
= stats[i].n_diff_key_vals;
@@ -894,10 +1524,12 @@ dict_stats_update_transient_for_index(
index->stat_n_non_null_key_vals[i]
= stats[i].n_non_null_key_vals;
}
- mutex_exit(&dict_sys.mutex);
+ index->table->stats_mutex_unlock();
}
}
}
+
+ return err;
}
/*********************************************************************//**
@@ -905,17 +1537,20 @@ Calculates new estimates for table and index statistics. This function
is relatively quick and is used to calculate transient statistics that
are not saved on disk.
This was the only way to calculate statistics before the
-Persistent Statistics feature was introduced. */
+Persistent Statistics feature was introduced.
+@return error code
+@retval DB_SUCCESS_LOCKED REC if the table under bulk insert operation */
static
-void
+dberr_t
dict_stats_update_transient(
/*========================*/
dict_table_t* table) /*!< in/out: table */
{
- ut_ad(!mutex_own(&dict_sys.mutex));
+ ut_ad(!table->stats_mutex_is_owner());
dict_index_t* index;
ulint sum_of_index_sizes = 0;
+ dberr_t err = DB_SUCCESS;
/* Find out the sizes of the indexes and how many different values
for the key they approximately have */
@@ -924,15 +1559,15 @@ dict_stats_update_transient(
if (!table->space) {
/* Nothing to do. */
+empty_table:
dict_stats_empty_table(table, true);
- return;
+ return err;
} else if (index == NULL) {
/* Table definition is corrupt */
ib::warn() << "Table " << table->name
<< " has no indexes. Cannot calculate statistics.";
- dict_stats_empty_table(table, true);
- return;
+ goto empty_table;
}
for (; index != NULL; index = dict_table_get_next_index(index)) {
@@ -944,19 +1579,20 @@ dict_stats_update_transient(
}
if (dict_stats_should_ignore_index(index)
- || !index->is_readable()) {
- mutex_enter(&dict_sys.mutex);
+ || !index->is_readable()
+ || err == DB_SUCCESS_LOCKED_REC) {
+ index->table->stats_mutex_lock();
dict_stats_empty_index(index, false);
- mutex_exit(&dict_sys.mutex);
+ index->table->stats_mutex_unlock();
continue;
}
- dict_stats_update_transient_for_index(index);
+ err = dict_stats_update_transient_for_index(index);
sum_of_index_sizes += index->stat_index_size;
}
- mutex_enter(&dict_sys.mutex);
+ table->stats_mutex_lock();
index = dict_table_get_first_index(table);
@@ -974,9 +1610,101 @@ dict_stats_update_transient(
table->stat_initialized = TRUE;
- mutex_exit(&dict_sys.mutex);
+ table->stats_mutex_unlock();
+
+ return err;
}
+/** Open a cursor at the first page in a tree level.
+@param page_cur cursor
+@param level level to search for (0=leaf)
+@param mtr mini-transaction */
+static dberr_t page_cur_open_level(page_cur_t *page_cur, ulint level,
+ mtr_t *mtr)
+{
+ mem_heap_t *heap= nullptr;
+ rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
+ rec_offs *offsets= offsets_;
+ dberr_t err;
+
+ dict_index_t *const index= page_cur->index;
+
+ rec_offs_init(offsets_);
+ ut_ad(level != ULINT_UNDEFINED);
+ ut_ad(mtr->memo_contains_flagged(&index->lock, MTR_MEMO_SX_LOCK));
+ ut_ad(mtr->get_savepoint() == 1);
+
+ uint32_t page= index->page;
+
+ for (ulint height = ULINT_UNDEFINED;; height--)
+ {
+ buf_block_t* block=
+ btr_block_get(*index, page, RW_S_LATCH,
+ !height && !index->is_clust(), mtr, &err);
+ if (!block)
+ break;
+
+ const uint32_t l= btr_page_get_level(block->page.frame);
+
+ if (height == ULINT_UNDEFINED)
+ {
+ ut_ad(!heap);
+ /* We are in the root node */
+ height= l;
+ if (UNIV_UNLIKELY(height < level))
+ return DB_CORRUPTION;
+ }
+ else if (UNIV_UNLIKELY(height != l) || page_has_prev(block->page.frame))
+ {
+ err= DB_CORRUPTION;
+ break;
+ }
+
+ page_cur_set_before_first(block, page_cur);
+
+ if (height == level)
+ break;
+
+ ut_ad(height);
+
+ if (!page_cur_move_to_next(page_cur))
+ {
+ err= DB_CORRUPTION;
+ break;
+ }
+
+ offsets= rec_get_offsets(page_cur->rec, index, offsets, 0, ULINT_UNDEFINED,
+ &heap);
+ page= btr_node_ptr_get_child_page_no(page_cur->rec, offsets);
+ }
+
+ if (UNIV_LIKELY_NULL(heap))
+ mem_heap_free(heap);
+
+ /* Release all page latches except the one on the desired page. */
+ const auto end= mtr->get_savepoint();
+ if (end > 1)
+ mtr->rollback_to_savepoint(1, end - 1);
+
+ return err;
+}
+
+/** Open a cursor at the first page in a tree level.
+@param page_cur cursor
+@param level level to search for (0=leaf)
+@param mtr mini-transaction
+@param index index tree */
+static dberr_t btr_pcur_open_level(btr_pcur_t *pcur, ulint level, mtr_t *mtr,
+ dict_index_t *index)
+{
+ pcur->latch_mode= BTR_SEARCH_LEAF;
+ pcur->search_mode= PAGE_CUR_G;
+ pcur->pos_state= BTR_PCUR_IS_POSITIONED;
+ pcur->btr_cur.page_cur.index= index;
+ return page_cur_open_level(&pcur->btr_cur.page_cur, level, mtr);
+}
+
+
/* @{ Pseudo code about the relation between the following functions
let N = N_SAMPLE_PAGES(index)
@@ -1033,7 +1761,8 @@ dict_stats_analyze_index_level(
DEBUG_PRINTF(" %s(table=%s, index=%s, level=" ULINTPF ")\n",
__func__, index->table->name, index->name, level);
- ut_ad(mtr->memo_contains(index->lock, MTR_MEMO_SX_LOCK));
+ *total_recs = 0;
+ *total_pages = 0;
n_uniq = dict_index_get_n_unique(index);
@@ -1066,24 +1795,19 @@ dict_stats_analyze_index_level(
/* Position pcur on the leftmost record on the leftmost page
on the desired level. */
- btr_pcur_open_at_index_side(
- true, index, BTR_SEARCH_TREE_ALREADY_S_LATCHED,
- &pcur, true, level, mtr);
- btr_pcur_move_to_next_on_page(&pcur);
+ if (btr_pcur_open_level(&pcur, level, mtr, index) != DB_SUCCESS
+ || !btr_pcur_move_to_next_on_page(&pcur)) {
+ goto func_exit;
+ }
page = btr_pcur_get_page(&pcur);
/* The page must not be empty, except when
it is the root page (and the whole index is empty). */
ut_ad(btr_pcur_is_on_user_rec(&pcur) || page_is_leaf(page));
- ut_ad(btr_pcur_get_rec(&pcur)
- == page_rec_get_next_const(page_get_infimum_rec(page)));
-
- /* check that we are indeed on the desired level */
- ut_a(btr_page_get_level(page) == level);
- /* there should not be any pages on the left */
- ut_a(!page_has_prev(page));
+ prev_rec = NULL;
+ prev_rec_is_copied = false;
if (REC_INFO_MIN_REC_FLAG & rec_get_info_bits(
btr_pcur_get_rec(&pcur), page_is_comp(page))) {
@@ -1093,20 +1817,12 @@ dict_stats_analyze_index_level(
ut_ad(index->is_instant());
btr_pcur_move_to_next_user_rec(&pcur, mtr);
}
- } else {
+ } else if (UNIV_UNLIKELY(level != 0)) {
/* The first record on the leftmost page must be
marked as such on each level except the leaf level. */
- ut_a(level == 0);
+ goto func_exit;
}
- prev_rec = NULL;
- prev_rec_is_copied = false;
-
- /* no records by default */
- *total_recs = 0;
-
- *total_pages = 0;
-
/* iterate over all user records on this level
and compare each two adjacent ones, even the last on page
X and the fist on page X+1 */
@@ -1145,10 +1861,7 @@ dict_stats_analyze_index_level(
if (level == 0
&& !srv_stats_include_delete_marked
- && rec_get_deleted_flag(
- rec,
- page_is_comp(btr_pcur_get_page(&pcur)))) {
-
+ && rec_get_deleted_flag(rec, page_rec_is_comp(rec))) {
if (rec_is_last_on_page
&& !prev_rec_is_copied
&& prev_rec != NULL) {
@@ -1228,7 +1941,7 @@ dict_stats_analyze_index_level(
records on this level at some point we will jump from
one page to the next and then rec and prev_rec will
be on different pages and
- btr_pcur_move_to_next_user_rec() will release the
+ btr_cur_move_to_next_user_rec() will release the
latch on the page that prev_rec is on */
prev_rec = rec_copy_prefix_to_buf(
rec, index, n_uniq,
@@ -1237,7 +1950,7 @@ dict_stats_analyze_index_level(
} else {
/* still on the same page, the next call to
- btr_pcur_move_to_next_user_rec() will not jump
+ btr_cur_move_to_next_user_rec() will not jump
on the next page, we can simply assign pointers
instead of copying the records like above */
@@ -1308,15 +2021,41 @@ dict_stats_analyze_index_level(
}
#endif /* UNIV_STATS_DEBUG */
- /* Release the latch on the last page, because that is not done by
- btr_pcur_close(). This function works also for non-leaf pages. */
- btr_leaf_page_release(btr_pcur_get_block(&pcur), BTR_SEARCH_LEAF, mtr);
-
- btr_pcur_close(&pcur);
+func_exit:
ut_free(prev_rec_buf);
mem_heap_free(heap);
}
+
+/************************************************************//**
+Gets the pointer to the next non delete-marked record on the page.
+If all subsequent records are delete-marked, then this function
+will return the supremum record.
+@return pointer to next non delete-marked record or pointer to supremum */
+static
+const rec_t*
+page_rec_get_next_non_del_marked(
+/*=============================*/
+ const rec_t* rec) /*!< in: pointer to record */
+{
+ const page_t *const page= page_align(rec);
+
+ if (page_is_comp(page))
+ {
+ for (rec= page_rec_get_next_low(rec, TRUE);
+ rec && rec_get_deleted_flag(rec, TRUE);
+ rec= page_rec_get_next_low(rec, TRUE));
+ return rec ? rec : page + PAGE_NEW_SUPREMUM;
+ }
+ else
+ {
+ for (rec= page_rec_get_next_low(rec, FALSE);
+ rec && rec_get_deleted_flag(rec, FALSE);
+ rec= page_rec_get_next_low(rec, FALSE));
+ return rec ? rec : page + PAGE_OLD_SUPREMUM;
+ }
+}
+
/** Scan a page, reading records from left to right and counting the number
of distinct records (looking only at the first n_prefix
columns) and the number of external pages pointed by records from this page.
@@ -1374,7 +2113,7 @@ dict_stats_scan_page(
rec = get_next(page_get_infimum_rec(page));
- if (page_rec_is_supremum(rec)) {
+ if (!rec || page_rec_is_supremum(rec)) {
/* the page is empty or contains only delete-marked records */
*n_diff = 0;
*out_rec = NULL;
@@ -1393,7 +2132,7 @@ dict_stats_scan_page(
*n_diff = 1;
- while (!page_rec_is_supremum(next_rec)) {
+ while (next_rec && !page_rec_is_supremum(next_rec)) {
ulint matched_fields;
@@ -1515,16 +2254,18 @@ dict_stats_analyze_index_below_cur(
/* descend to the leaf level on the B-tree */
for (;;) {
-
- dberr_t err = DB_SUCCESS;
+ dberr_t err;
block = buf_page_get_gen(page_id, zip_size,
RW_S_LATCH, NULL, BUF_GET,
- __FILE__, __LINE__, &mtr, &err,
+ &mtr, &err,
!index->is_clust()
&& 1 == btr_page_get_level(page));
+ if (!block) {
+ goto func_exit;
+ }
- page = buf_block_get_frame(block);
+ page = block->page.frame;
if (page_is_leaf(page)) {
/* leaf level */
@@ -1588,6 +2329,7 @@ dict_stats_analyze_index_below_cur(
__func__, page_no, n_diff);
#endif
+func_exit:
mtr_commit(&mtr);
mem_heap_free(heap);
}
@@ -1667,49 +2409,40 @@ dict_stats_analyze_index_for_n_prefix(
n_prefix, n_diff_data->n_diff_on_level);
#endif
- ut_ad(mtr->memo_contains(index->lock, MTR_MEMO_SX_LOCK));
+ ut_ad(n_diff_data->level);
/* Position pcur on the leftmost record on the leftmost page
on the desired level. */
- btr_pcur_open_at_index_side(
- true, index, BTR_SEARCH_TREE_ALREADY_S_LATCHED,
- &pcur, true, n_diff_data->level, mtr);
- btr_pcur_move_to_next_on_page(&pcur);
+ n_diff_data->n_diff_all_analyzed_pages = 0;
+ n_diff_data->n_external_pages_sum = 0;
+
+ if (btr_pcur_open_level(&pcur, n_diff_data->level, mtr, index)
+ != DB_SUCCESS
+ || !btr_pcur_move_to_next_on_page(&pcur)) {
+ return;
+ }
page = btr_pcur_get_page(&pcur);
const rec_t* first_rec = btr_pcur_get_rec(&pcur);
- /* We shouldn't be scanning the leaf level. The caller of this function
- should have stopped the descend on level 1 or higher. */
- ut_ad(n_diff_data->level > 0);
- ut_ad(!page_is_leaf(page));
-
/* The page must not be empty, except when
it is the root page (and the whole index is empty). */
- ut_ad(btr_pcur_is_on_user_rec(&pcur));
- ut_ad(first_rec == page_rec_get_next_const(page_get_infimum_rec(page)));
-
- /* check that we are indeed on the desired level */
- ut_a(btr_page_get_level(page) == n_diff_data->level);
-
- /* there should not be any pages on the left */
- ut_a(!page_has_prev(page));
-
- /* check whether the first record on the leftmost page is marked
- as such; we are on a non-leaf level */
- ut_a(rec_get_info_bits(first_rec, page_is_comp(page))
- & REC_INFO_MIN_REC_FLAG);
+ if (page_has_prev(page)
+ || !btr_pcur_is_on_user_rec(&pcur)
+ || btr_page_get_level(page) != n_diff_data->level
+ || first_rec != page_rec_get_next_const(page_get_infimum_rec(page))
+ || !(rec_get_info_bits(first_rec, page_is_comp(page))
+ & REC_INFO_MIN_REC_FLAG)) {
+ return;
+ }
const ib_uint64_t last_idx_on_level = boundaries->at(
static_cast<unsigned>(n_diff_data->n_diff_on_level - 1));
rec_idx = 0;
- n_diff_data->n_diff_all_analyzed_pages = 0;
- n_diff_data->n_external_pages_sum = 0;
-
for (i = 0; i < n_diff_data->n_leaf_pages_to_analyze; i++) {
/* there are n_diff_on_level elements
in 'boundaries' and we divide those elements
@@ -1818,8 +2551,6 @@ dict_stats_analyze_index_for_n_prefix(
n_diff_data->n_external_pages_sum += n_external_pages;
}
-
- btr_pcur_close(&pcur);
}
/** statistics for an index */
@@ -1833,7 +2564,20 @@ struct index_stats_t
{
stats.reserve(n_uniq);
for (ulint i= 0; i < n_uniq; ++i)
- stats.push_back(index_field_stats_t(0, 1, 0));
+ stats.push_back(index_field_stats_t{0, 1, 0});
+ }
+
+ void set_bulk_operation()
+ {
+ memset((void*) &stats[0], 0xff, stats.size() * sizeof stats[0]);
+ }
+
+ bool is_bulk_operation() const
+ {
+ for (auto &s : stats)
+ if (!s.is_bulk_operation())
+ return false;
+ return true;
}
};
@@ -1919,22 +2663,19 @@ stat_n_leaf_pages. This function can be slow.
@return index stats */
static index_stats_t dict_stats_analyze_index(dict_index_t* index)
{
- ulint root_level;
- ulint level;
bool level_is_analyzed;
ulint n_uniq;
ulint n_prefix;
ib_uint64_t total_recs;
ib_uint64_t total_pages;
mtr_t mtr;
- ulint size;
index_stats_t result(index->n_uniq);
DBUG_ENTER("dict_stats_analyze_index");
DBUG_PRINT("info", ("index: %s, online status: %d", index->name(),
dict_index_get_online_status(index)));
- ut_ad(!mutex_own(&dict_sys.mutex)); // because this function is slow
+ ut_ad(!index->table->stats_mutex_is_owner());
ut_ad(index->table->get_ref_count());
/* Disable update statistic for Rtree */
@@ -1946,30 +2687,35 @@ static index_stats_t dict_stats_analyze_index(dict_index_t* index)
mtr.start();
mtr_sx_lock_index(index, &mtr);
- size = btr_get_size(index, BTR_TOTAL_SIZE, &mtr);
-
- if (size != ULINT_UNDEFINED) {
- result.index_size = size;
- size = btr_get_size(index, BTR_N_LEAF_PAGES, &mtr);
- }
-
- /* Release the X locks on the root page taken by btr_get_size() */
- mtr.commit();
-
- switch (size) {
- case ULINT_UNDEFINED:
+ dberr_t err;
+ buf_block_t* root = btr_root_block_get(index, RW_SX_LATCH, &mtr, &err);
+ if (!root) {
+empty_index:
+ mtr.commit();
dict_stats_assert_initialized_index(index);
DBUG_RETURN(result);
- case 0:
- /* The root node of the tree is a leaf */
- size = 1;
}
- result.n_leaf_pages = size;
+ uint16_t root_level = btr_page_get_level(root->page.frame);
+ mtr.x_lock_space(index->table->space);
+ ulint dummy, size;
+ result.index_size
+ = fseg_n_reserved_pages(*root, PAGE_HEADER + PAGE_BTR_SEG_LEAF
+ + root->page.frame, &size, &mtr)
+ + fseg_n_reserved_pages(*root, PAGE_HEADER + PAGE_BTR_SEG_TOP
+ + root->page.frame, &dummy, &mtr);
+ result.n_leaf_pages = size ? size : 1;
+
+ const auto bulk_trx_id = index->table->bulk_trx_id;
+ if (bulk_trx_id && trx_sys.find(nullptr, bulk_trx_id, false)) {
+ result.set_bulk_operation();
+ goto empty_index;
+ }
+
+ mtr.commit();
mtr.start();
mtr_sx_lock_index(index, &mtr);
- root_level = btr_height_get(index, &mtr);
n_uniq = dict_index_get_n_unique(index);
@@ -2006,14 +2752,14 @@ static index_stats_t dict_stats_analyze_index(dict_index_t* index)
mtr.commit();
- mutex_enter(&dict_sys.mutex);
+ index->table->stats_mutex_lock();
for (ulint i = 0; i < n_uniq; i++) {
result.stats[i].n_diff_key_vals = index->stat_n_diff_key_vals[i];
result.stats[i].n_sample_sizes = total_pages;
result.stats[i].n_non_null_key_vals = index->stat_n_non_null_key_vals[i];
}
result.n_leaf_pages = index->stat_n_leaf_pages;
- mutex_exit(&dict_sys.mutex);
+ index->table->stats_mutex_unlock();
DBUG_RETURN(result);
}
@@ -2047,7 +2793,7 @@ static index_stats_t dict_stats_analyze_index(dict_index_t* index)
So if we find that the first level containing D distinct
keys (on n_prefix columns) is L, we continue from L when
searching for D distinct keys on n_prefix-1 columns. */
- level = root_level;
+ auto level = root_level;
level_is_analyzed = false;
for (n_prefix = n_uniq; n_prefix >= 1; n_prefix--) {
@@ -2061,7 +2807,11 @@ static index_stats_t dict_stats_analyze_index(dict_index_t* index)
mtr.commit();
mtr.start();
mtr_sx_lock_index(index, &mtr);
- if (root_level != btr_height_get(index, &mtr)) {
+ ut_ad(mtr.get_savepoint() == 1);
+ buf_block_t *root = btr_root_block_get(index, RW_S_LATCH,
+ &mtr, &err);
+ if (!root || root_level != btr_page_get_level(root->page.frame)
+ || index->table->bulk_trx_id != bulk_trx_id) {
/* Just quit if the tree has changed beyond
recognition here. The old stats from previous
runs will remain in the values that we have
@@ -2073,6 +2823,8 @@ static index_stats_t dict_stats_analyze_index(dict_index_t* index)
break;
}
+ mtr.rollback_to_savepoint(1);
+
/* check whether we should pick the current level;
we pick level 1 even if it does not have enough
distinct records because we do not want to scan the
@@ -2133,6 +2885,7 @@ static index_stats_t dict_stats_analyze_index(dict_index_t* index)
break;
}
+ mtr.rollback_to_savepoint(1);
dict_stats_analyze_index_level(index,
level,
n_diff_on_level,
@@ -2140,7 +2893,7 @@ static index_stats_t dict_stats_analyze_index(dict_index_t* index)
&total_pages,
n_diff_boundaries,
&mtr);
-
+ mtr.rollback_to_savepoint(1);
level_is_analyzed = true;
if (level == 1
@@ -2219,7 +2972,8 @@ found_level:
Calculates new estimates for table and index statistics. This function
is relatively slow and is used to calculate persistent statistics that
will be saved on disk.
-@return DB_SUCCESS or error code */
+@return DB_SUCCESS or error code
+@retval DB_SUCCESS_LOCKED_REC if the table under bulk insert operation */
static
dberr_t
dict_stats_update_persistent(
@@ -2247,13 +3001,18 @@ dict_stats_update_persistent(
}
ut_ad(!dict_index_is_ibuf(index));
- mutex_enter(&dict_sys.mutex);
+ table->stats_mutex_lock();
dict_stats_empty_index(index, false);
- mutex_exit(&dict_sys.mutex);
+ table->stats_mutex_unlock();
index_stats_t stats = dict_stats_analyze_index(index);
- mutex_enter(&dict_sys.mutex);
+ if (stats.is_bulk_operation()) {
+ dict_stats_empty_table(table, false);
+ return DB_SUCCESS_LOCKED_REC;
+ }
+
+ table->stats_mutex_lock();
index->stat_index_size = stats.index_size;
index->stat_n_leaf_pages = stats.n_leaf_pages;
for (size_t i = 0; i < stats.stats.size(); ++i) {
@@ -2288,21 +3047,26 @@ dict_stats_update_persistent(
continue;
}
- if (!(table->stats_bg_flag & BG_STAT_SHOULD_QUIT)) {
- mutex_exit(&dict_sys.mutex);
- stats = dict_stats_analyze_index(index);
- mutex_enter(&dict_sys.mutex);
+ table->stats_mutex_unlock();
+ stats = dict_stats_analyze_index(index);
+ table->stats_mutex_lock();
- index->stat_index_size = stats.index_size;
- index->stat_n_leaf_pages = stats.n_leaf_pages;
- for (size_t i = 0; i < stats.stats.size(); ++i) {
- index->stat_n_diff_key_vals[i]
- = stats.stats[i].n_diff_key_vals;
- index->stat_n_sample_sizes[i]
- = stats.stats[i].n_sample_sizes;
- index->stat_n_non_null_key_vals[i]
- = stats.stats[i].n_non_null_key_vals;
- }
+ if (stats.is_bulk_operation()) {
+ table->stats_mutex_unlock();
+ dict_stats_empty_table(table, false);
+ return DB_SUCCESS_LOCKED_REC;
+ }
+
+ index->stat_index_size = stats.index_size;
+ index->stat_n_leaf_pages = stats.n_leaf_pages;
+
+ for (size_t i = 0; i < stats.stats.size(); ++i) {
+ index->stat_n_diff_key_vals[i]
+ = stats.stats[i].n_diff_key_vals;
+ index->stat_n_sample_sizes[i]
+ = stats.stats[i].n_sample_sizes;
+ index->stat_n_non_null_key_vals[i]
+ = stats.stats[i].n_non_null_key_vals;
}
table->stat_sum_of_other_index_sizes
@@ -2317,7 +3081,7 @@ dict_stats_update_persistent(
dict_stats_assert_initialized(table);
- mutex_exit(&dict_sys.mutex);
+ table->stats_mutex_unlock();
return(DB_SUCCESS);
}
@@ -2331,9 +3095,7 @@ storage.
@param[in] stat_value value of the stat
@param[in] sample_size n pages sampled or NULL
@param[in] stat_description description of the stat
-@param[in,out] trx in case of NULL the function will
-allocate and free the trx object. If it is not NULL then it will be
-rolled back only in the case of error, but not freed.
+@param[in,out] trx transaction
@return DB_SUCCESS or error code */
dberr_t
dict_stats_save_index_stat(
@@ -2350,8 +3112,7 @@ dict_stats_save_index_stat(
char db_utf8[MAX_DB_UTF8_LEN];
char table_utf8[MAX_TABLE_UTF8_LEN];
- ut_ad(!trx || trx->internal || trx->mysql_thd);
- ut_d(dict_sys.assert_locked());
+ ut_ad(dict_sys.locked());
dict_fs2utf8(index->table->name.m_name, db_utf8, sizeof(db_utf8),
table_utf8, sizeof(table_utf8));
@@ -2465,8 +3226,6 @@ dict_stats_save(
const index_id_t* only_for_index)
{
pars_info_t* pinfo;
- dberr_t ret;
- dict_table_t* table;
char db_utf8[MAX_DB_UTF8_LEN];
char table_utf8[MAX_TABLE_UTF8_LEN];
@@ -2478,13 +3237,61 @@ dict_stats_save(
return (dict_stats_report_error(table_orig));
}
- table = dict_stats_snapshot_create(table_orig);
+ THD* thd = current_thd;
+ MDL_ticket *mdl_table = nullptr, *mdl_index = nullptr;
+ dict_table_t* table_stats = dict_table_open_on_name(
+ TABLE_STATS_NAME, false, DICT_ERR_IGNORE_NONE);
+ if (table_stats) {
+ dict_sys.freeze(SRW_LOCK_CALL);
+ table_stats = dict_acquire_mdl_shared<false>(table_stats, thd,
+ &mdl_table);
+ dict_sys.unfreeze();
+ }
+ if (!table_stats
+ || strcmp(table_stats->name.m_name, TABLE_STATS_NAME)) {
+release_and_exit:
+ if (table_stats) {
+ dict_table_close(table_stats, false, thd, mdl_table);
+ }
+ return DB_STATS_DO_NOT_EXIST;
+ }
+
+ dict_table_t* index_stats = dict_table_open_on_name(
+ INDEX_STATS_NAME, false, DICT_ERR_IGNORE_NONE);
+ if (index_stats) {
+ dict_sys.freeze(SRW_LOCK_CALL);
+ index_stats = dict_acquire_mdl_shared<false>(index_stats, thd,
+ &mdl_index);
+ dict_sys.unfreeze();
+ }
+ if (!index_stats) {
+ goto release_and_exit;
+ }
+ if (strcmp(index_stats->name.m_name, INDEX_STATS_NAME)) {
+ dict_table_close(index_stats, false, thd, mdl_index);
+ goto release_and_exit;
+ }
+
+ dict_table_t* table = dict_stats_snapshot_create(table_orig);
dict_fs2utf8(table->name.m_name, db_utf8, sizeof(db_utf8),
table_utf8, sizeof(table_utf8));
-
const time_t now = time(NULL);
- dict_sys_lock();
+ trx_t* trx = trx_create();
+ trx->mysql_thd = thd;
+ trx_start_internal(trx);
+ dberr_t ret = trx->read_only
+ ? DB_READ_ONLY
+ : lock_table_for_trx(table_stats, trx, LOCK_X);
+ if (ret == DB_SUCCESS) {
+ ret = lock_table_for_trx(index_stats, trx, LOCK_X);
+ }
+ if (ret != DB_SUCCESS) {
+ if (trx->state != TRX_STATE_NOT_STARTED) {
+ trx->commit();
+ }
+ goto unlocked_free_and_exit;
+ }
pinfo = pars_info_create();
@@ -2497,6 +3304,9 @@ dict_stats_save(
pars_info_add_ull_literal(pinfo, "sum_of_other_index_sizes",
table->stat_sum_of_other_index_sizes);
+ dict_sys.lock(SRW_LOCK_CALL);
+ trx->dict_operation_lock_mode = true;
+
ret = dict_stats_exec_sql(
pinfo,
"PROCEDURE TABLE_STATS_SAVE () IS\n"
@@ -2517,20 +3327,24 @@ dict_stats_save(
":clustered_index_size,\n"
":sum_of_other_index_sizes\n"
");\n"
- "END;", NULL);
+ "END;", trx);
if (UNIV_UNLIKELY(ret != DB_SUCCESS)) {
ib::error() << "Cannot save table statistics for table "
<< table->name << ": " << ret;
-func_exit:
- dict_sys_unlock();
+rollback_and_exit:
+ trx->rollback();
+free_and_exit:
+ trx->dict_operation_lock_mode = false;
+ dict_sys.unlock();
+unlocked_free_and_exit:
+ trx->free();
dict_stats_snapshot_free(table);
+ dict_table_close(table_stats, false, thd, mdl_table);
+ dict_table_close(index_stats, false, thd, mdl_index);
return ret;
}
- trx_t* trx = trx_create();
- trx_start_internal(trx);
-
dict_index_t* index;
index_map_t indexes(
(ut_strcmp_functor()),
@@ -2599,7 +3413,7 @@ func_exit:
stat_description, trx);
if (ret != DB_SUCCESS) {
- goto end;
+ goto rollback_and_exit;
}
}
@@ -2609,7 +3423,7 @@ func_exit:
"Number of leaf pages "
"in the index", trx);
if (ret != DB_SUCCESS) {
- goto end;
+ goto rollback_and_exit;
}
ret = dict_stats_save_index_stat(index, now, "size",
@@ -2618,15 +3432,12 @@ func_exit:
"Number of pages "
"in the index", trx);
if (ret != DB_SUCCESS) {
- goto end;
+ goto rollback_and_exit;
}
}
- trx_commit_for_mysql(trx);
-
-end:
- trx->free();
- goto func_exit;
+ trx->commit();
+ goto free_and_exit;
}
/*********************************************************************//**
@@ -2993,27 +3804,50 @@ dict_stats_fetch_from_ps(
char db_utf8[MAX_DB_UTF8_LEN];
char table_utf8[MAX_TABLE_UTF8_LEN];
- ut_ad(!mutex_own(&dict_sys.mutex));
-
/* Initialize all stats to dummy values before fetching because if
the persistent storage contains incomplete stats (e.g. missing stats
for some index) then we would end up with (partially) uninitialized
stats. */
dict_stats_empty_table(table, true);
- trx = trx_create();
+ THD* thd = current_thd;
+ MDL_ticket *mdl_table = nullptr, *mdl_index = nullptr;
+ dict_table_t* table_stats = dict_table_open_on_name(
+ TABLE_STATS_NAME, false, DICT_ERR_IGNORE_NONE);
+ if (table_stats) {
+ dict_sys.freeze(SRW_LOCK_CALL);
+ table_stats = dict_acquire_mdl_shared<false>(table_stats, thd,
+ &mdl_table);
+ dict_sys.unfreeze();
+ }
+ if (!table_stats
+ || strcmp(table_stats->name.m_name, TABLE_STATS_NAME)) {
+release_and_exit:
+ if (table_stats) {
+ dict_table_close(table_stats, false, thd, mdl_table);
+ }
+ return DB_STATS_DO_NOT_EXIST;
+ }
- /* Use 'read-uncommitted' so that the SELECTs we execute
- do not get blocked in case some user has locked the rows we
- are SELECTing */
+ dict_table_t* index_stats = dict_table_open_on_name(
+ INDEX_STATS_NAME, false, DICT_ERR_IGNORE_NONE);
+ if (index_stats) {
+ dict_sys.freeze(SRW_LOCK_CALL);
+ index_stats = dict_acquire_mdl_shared<false>(index_stats, thd,
+ &mdl_index);
+ dict_sys.unfreeze();
+ }
+ if (!index_stats) {
+ goto release_and_exit;
+ }
+ if (strcmp(index_stats->name.m_name, INDEX_STATS_NAME)) {
+ dict_table_close(index_stats, false, thd, mdl_index);
+ goto release_and_exit;
+ }
- trx->isolation_level = TRX_ISO_READ_UNCOMMITTED;
+ trx = trx_create();
- if (srv_read_only_mode) {
- trx_start_internal_read_only(trx);
- } else {
- trx_start_internal(trx);
- }
+ trx_start_internal_read_only(trx);
dict_fs2utf8(table->name.m_name, db_utf8, sizeof(db_utf8),
table_utf8, sizeof(table_utf8));
@@ -3035,7 +3869,7 @@ dict_stats_fetch_from_ps(
"fetch_index_stats_step",
dict_stats_fetch_index_stats_step,
&index_fetch_arg);
-
+ dict_sys.lock(SRW_LOCK_CALL); /* FIXME: remove this */
ret = que_eval_sql(pinfo,
"PROCEDURE FETCH_STATS () IS\n"
"found INT;\n"
@@ -3089,9 +3923,12 @@ dict_stats_fetch_from_ps(
"END LOOP;\n"
"CLOSE index_stats_cur;\n"
- "END;",
- TRUE, trx);
+ "END;", trx);
/* pinfo is freed by que_eval_sql() */
+ dict_sys.unlock();
+
+ dict_table_close(table_stats, false, thd, mdl_table);
+ dict_table_close(index_stats, false, thd, mdl_index);
trx_commit_for_mysql(trx);
@@ -3129,13 +3966,11 @@ dict_stats_update_for_index(
{
DBUG_ENTER("dict_stats_update_for_index");
- ut_ad(!mutex_own(&dict_sys.mutex));
-
if (dict_stats_is_persistent_enabled(index->table)) {
if (dict_stats_persistent_storage_check(false)) {
index_stats_t stats = dict_stats_analyze_index(index);
- mutex_enter(&dict_sys.mutex);
+ index->table->stats_mutex_lock();
index->stat_index_size = stats.index_size;
index->stat_n_leaf_pages = stats.n_leaf_pages;
for (size_t i = 0; i < stats.stats.size(); ++i) {
@@ -3148,7 +3983,7 @@ dict_stats_update_for_index(
}
index->table->stat_sum_of_other_index_sizes
+= index->stat_index_size;
- mutex_exit(&dict_sys.mutex);
+ index->table->stats_mutex_unlock();
dict_stats_save(index->table, &index->id);
DBUG_VOID_RETURN;
@@ -3178,7 +4013,8 @@ dict_stats_update_for_index(
/*********************************************************************//**
Calculates new estimates for table and index statistics. The statistics
are used in query optimization.
-@return DB_SUCCESS or error code */
+@return DB_SUCCESS or error code
+@retval DB_SUCCESS_LOCKED_REC if the table under bulk insert operation */
dberr_t
dict_stats_update(
/*==============*/
@@ -3189,11 +4025,11 @@ dict_stats_update(
the persistent statistics
storage */
{
- ut_ad(!mutex_own(&dict_sys.mutex));
+ ut_ad(!table->stats_mutex_is_owner());
if (!table->is_readable()) {
return (dict_stats_report_error(table));
- } else if (srv_force_recovery > SRV_FORCE_NO_IBUF_MERGE) {
+ } else if (srv_force_recovery >= SRV_FORCE_NO_UNDO_LOG_SCAN) {
/* If we have set a high innodb_force_recovery level, do
not calculate statistics, as a badly corrupted index can
cause a crash in it. */
@@ -3325,7 +4161,10 @@ dict_stats_update(
switch (err) {
case DB_SUCCESS:
- mutex_enter(&dict_sys.mutex);
+ table->stats_mutex_lock();
+ /* t is localized to this thread so no need to
+ take stats mutex lock (limiting it to debug only) */
+ ut_d(t->stats_mutex_lock());
/* Pass reset_ignored_indexes=true as parameter
to dict_stats_copy. This will cause statictics
@@ -3334,7 +4173,8 @@ dict_stats_update(
dict_stats_assert_initialized(table);
- mutex_exit(&dict_sys.mutex);
+ ut_d(t->stats_mutex_unlock());
+ table->stats_mutex_unlock();
dict_stats_table_clone_free(t);
@@ -3388,654 +4228,195 @@ dict_stats_update(
}
transient:
- dict_stats_update_transient(table);
-
- return(DB_SUCCESS);
+ return dict_stats_update_transient(table);
}
-/** Remove the information for a particular index's stats from the persistent
-storage if it exists and if there is data stored for this index.
-This function creates its own trx and commits it.
-
-We must modify system tables in a separate transaction in order to
-adhere to the InnoDB design constraint that dict_sys.latch prevents
-lock waits on system tables. If we modified system and user tables in
-the same transaction, we should exclusively hold dict_sys.latch until
-the transaction is committed, and effectively block other transactions
-that will attempt to open any InnoDB tables. Because we have no
-guarantee that user transactions will be committed fast, we cannot
-afford to keep the system tables locked in a user transaction.
+/** Execute DELETE FROM mysql.innodb_table_stats
+@param database_name database name
+@param table_name table name
+@param trx transaction (nullptr=start and commit a new one)
@return DB_SUCCESS or error code */
-dberr_t
-dict_stats_drop_index(
-/*==================*/
- const char* db_and_table,/*!< in: db and table, e.g. 'db/table' */
- const char* iname, /*!< in: index name */
- char* errstr, /*!< out: error message if != DB_SUCCESS
- is returned */
- ulint errstr_sz)/*!< in: size of the errstr buffer */
+dberr_t dict_stats_delete_from_table_stats(const char *database_name,
+ const char *table_name, trx_t *trx)
{
- char db_utf8[MAX_DB_UTF8_LEN];
- char table_utf8[MAX_TABLE_UTF8_LEN];
pars_info_t* pinfo;
- dberr_t ret;
-
- ut_ad(!mutex_own(&dict_sys.mutex));
-
- /* skip indexes whose table names do not contain a database name
- e.g. if we are dropping an index from SYS_TABLES */
- if (strchr(db_and_table, '/') == NULL) {
-
- return(DB_SUCCESS);
- }
- dict_fs2utf8(db_and_table, db_utf8, sizeof(db_utf8),
- table_utf8, sizeof(table_utf8));
-
- pinfo = pars_info_create();
-
- pars_info_add_str_literal(pinfo, "database_name", db_utf8);
-
- pars_info_add_str_literal(pinfo, "table_name", table_utf8);
-
- pars_info_add_str_literal(pinfo, "index_name", iname);
-
- dict_sys_lock();
-
- ret = dict_stats_exec_sql(
- pinfo,
- "PROCEDURE DROP_INDEX_STATS () IS\n"
- "BEGIN\n"
- "DELETE FROM \"" INDEX_STATS_NAME "\" WHERE\n"
- "database_name = :database_name AND\n"
- "table_name = :table_name AND\n"
- "index_name = :index_name;\n"
- "END;\n", NULL);
-
- dict_sys_unlock();
-
- if (ret == DB_STATS_DO_NOT_EXIST) {
- ret = DB_SUCCESS;
- }
-
- if (ret != DB_SUCCESS) {
- snprintf(errstr, errstr_sz,
- "Unable to delete statistics for index %s"
- " from %s%s: %s. They can be deleted later using"
- " DELETE FROM %s WHERE"
- " database_name = '%s' AND"
- " table_name = '%s' AND"
- " index_name = '%s';",
- iname,
- INDEX_STATS_NAME_PRINT,
- (ret == DB_LOCK_WAIT_TIMEOUT
- ? " because the rows are locked"
- : ""),
- ut_strerr(ret),
- INDEX_STATS_NAME_PRINT,
- db_utf8,
- table_utf8,
- iname);
-
- ut_print_timestamp(stderr);
- fprintf(stderr, " InnoDB: %s\n", errstr);
- }
-
- return(ret);
-}
-
-/*********************************************************************//**
-Executes
-DELETE FROM mysql.innodb_table_stats
-WHERE database_name = '...' AND table_name = '...';
-Creates its own transaction and commits it.
-@return DB_SUCCESS or error code */
-UNIV_INLINE
-dberr_t
-dict_stats_delete_from_table_stats(
-/*===============================*/
- const char* database_name, /*!< in: database name, e.g. 'db' */
- const char* table_name) /*!< in: table name, e.g. 'table' */
-{
- pars_info_t* pinfo;
- dberr_t ret;
-
- ut_d(dict_sys.assert_locked());
+ ut_ad(dict_sys.locked());
pinfo = pars_info_create();
pars_info_add_str_literal(pinfo, "database_name", database_name);
pars_info_add_str_literal(pinfo, "table_name", table_name);
- ret = dict_stats_exec_sql(
+ return dict_stats_exec_sql(
pinfo,
"PROCEDURE DELETE_FROM_TABLE_STATS () IS\n"
"BEGIN\n"
"DELETE FROM \"" TABLE_STATS_NAME "\" WHERE\n"
"database_name = :database_name AND\n"
"table_name = :table_name;\n"
- "END;\n", NULL);
-
- return(ret);
+ "END;\n", trx);
}
-/*********************************************************************//**
-Executes
-DELETE FROM mysql.innodb_index_stats
-WHERE database_name = '...' AND table_name = '...';
-Creates its own transaction and commits it.
+/** Execute DELETE FROM mysql.innodb_index_stats
+@param database_name database name
+@param table_name table name
+@param trx transaction
@return DB_SUCCESS or error code */
-UNIV_INLINE
-dberr_t
-dict_stats_delete_from_index_stats(
-/*===============================*/
- const char* database_name, /*!< in: database name, e.g. 'db' */
- const char* table_name) /*!< in: table name, e.g. 'table' */
+dberr_t dict_stats_delete_from_index_stats(const char *database_name,
+ const char *table_name, trx_t *trx)
{
pars_info_t* pinfo;
- dberr_t ret;
- ut_d(dict_sys.assert_locked());
+ ut_ad(dict_sys.locked());
pinfo = pars_info_create();
pars_info_add_str_literal(pinfo, "database_name", database_name);
pars_info_add_str_literal(pinfo, "table_name", table_name);
- ret = dict_stats_exec_sql(
+ return dict_stats_exec_sql(
pinfo,
"PROCEDURE DELETE_FROM_INDEX_STATS () IS\n"
"BEGIN\n"
"DELETE FROM \"" INDEX_STATS_NAME "\" WHERE\n"
"database_name = :database_name AND\n"
"table_name = :table_name;\n"
- "END;\n", NULL);
-
- return(ret);
-}
-
-/*********************************************************************//**
-Removes the statistics for a table and all of its indexes from the
-persistent statistics storage if it exists and if there is data stored for
-the table. This function creates its own transaction and commits it.
-@return DB_SUCCESS or error code */
-dberr_t
-dict_stats_drop_table(
-/*==================*/
- const char* db_and_table, /*!< in: db and table, e.g. 'db/table' */
- char* errstr, /*!< out: error message
- if != DB_SUCCESS is returned */
- ulint errstr_sz) /*!< in: size of errstr buffer */
-{
- char db_utf8[MAX_DB_UTF8_LEN];
- char table_utf8[MAX_TABLE_UTF8_LEN];
- dberr_t ret;
-
- ut_d(dict_sys.assert_locked());
-
- /* skip tables that do not contain a database name
- e.g. if we are dropping SYS_TABLES */
- if (strchr(db_and_table, '/') == NULL) {
-
- return(DB_SUCCESS);
- }
-
- /* skip innodb_table_stats and innodb_index_stats themselves */
- if (strcmp(db_and_table, TABLE_STATS_NAME) == 0
- || strcmp(db_and_table, INDEX_STATS_NAME) == 0) {
-
- return(DB_SUCCESS);
- }
-
- dict_fs2utf8(db_and_table, db_utf8, sizeof(db_utf8),
- table_utf8, sizeof(table_utf8));
-
- ret = dict_stats_delete_from_table_stats(db_utf8, table_utf8);
-
- if (ret == DB_SUCCESS) {
- ret = dict_stats_delete_from_index_stats(db_utf8, table_utf8);
- }
-
- if (ret == DB_STATS_DO_NOT_EXIST) {
- ret = DB_SUCCESS;
- }
-
- if (ret != DB_SUCCESS) {
-
- snprintf(errstr, errstr_sz,
- "Unable to delete statistics for table %s.%s: %s."
- " They can be deleted later using"
-
- " DELETE FROM %s WHERE"
- " database_name = '%s' AND"
- " table_name = '%s';"
-
- " DELETE FROM %s WHERE"
- " database_name = '%s' AND"
- " table_name = '%s';",
-
- db_utf8, table_utf8,
- ut_strerr(ret),
-
- INDEX_STATS_NAME_PRINT,
- db_utf8, table_utf8,
-
- TABLE_STATS_NAME_PRINT,
- db_utf8, table_utf8);
- }
-
- return(ret);
+ "END;\n", trx);
}
-/*********************************************************************//**
-Executes
-UPDATE mysql.innodb_table_stats SET
-database_name = '...', table_name = '...'
-WHERE database_name = '...' AND table_name = '...';
-Creates its own transaction and commits it.
+/** Execute DELETE FROM mysql.innodb_index_stats
+@param database_name database name
+@param table_name table name
+@param index_name name of the index
+@param trx transaction
@return DB_SUCCESS or error code */
-UNIV_INLINE
-dberr_t
-dict_stats_rename_table_in_table_stats(
-/*===================================*/
- const char* old_dbname_utf8,/*!< in: database name, e.g. 'olddb' */
- const char* old_tablename_utf8,/*!< in: table name, e.g. 'oldtable' */
- const char* new_dbname_utf8,/*!< in: database name, e.g. 'newdb' */
- const char* new_tablename_utf8)/*!< in: table name, e.g. 'newtable' */
+dberr_t dict_stats_delete_from_index_stats(const char *database_name,
+ const char *table_name,
+ const char *index_name, trx_t *trx)
{
pars_info_t* pinfo;
- dberr_t ret;
- ut_d(dict_sys.assert_locked());
+ ut_ad(dict_sys.locked());
pinfo = pars_info_create();
- pars_info_add_str_literal(pinfo, "old_dbname_utf8", old_dbname_utf8);
- pars_info_add_str_literal(pinfo, "old_tablename_utf8", old_tablename_utf8);
- pars_info_add_str_literal(pinfo, "new_dbname_utf8", new_dbname_utf8);
- pars_info_add_str_literal(pinfo, "new_tablename_utf8", new_tablename_utf8);
+ pars_info_add_str_literal(pinfo, "database_name", database_name);
+ pars_info_add_str_literal(pinfo, "table_name", table_name);
+ pars_info_add_str_literal(pinfo, "index_name", index_name);
- ret = dict_stats_exec_sql(
+ return dict_stats_exec_sql(
pinfo,
- "PROCEDURE RENAME_TABLE_IN_TABLE_STATS () IS\n"
+ "PROCEDURE DELETE_FROM_INDEX_STATS () IS\n"
"BEGIN\n"
- "UPDATE \"" TABLE_STATS_NAME "\" SET\n"
- "database_name = :new_dbname_utf8,\n"
- "table_name = :new_tablename_utf8\n"
- "WHERE\n"
- "database_name = :old_dbname_utf8 AND\n"
- "table_name = :old_tablename_utf8;\n"
- "END;\n", NULL);
-
- return(ret);
+ "DELETE FROM \"" INDEX_STATS_NAME "\" WHERE\n"
+ "database_name = :database_name AND\n"
+ "table_name = :table_name AND\n"
+ "index_name = :index_name;\n"
+ "END;\n", trx);
}
-/*********************************************************************//**
-Executes
-UPDATE mysql.innodb_index_stats SET
-database_name = '...', table_name = '...'
-WHERE database_name = '...' AND table_name = '...';
-Creates its own transaction and commits it.
+/** Rename a table in InnoDB persistent stats storage.
+@param old_name old table name
+@param new_name new table name
+@param trx transaction
@return DB_SUCCESS or error code */
-UNIV_INLINE
-dberr_t
-dict_stats_rename_table_in_index_stats(
-/*===================================*/
- const char* old_dbname_utf8,/*!< in: database name, e.g. 'olddb' */
- const char* old_tablename_utf8,/*!< in: table name, e.g. 'oldtable' */
- const char* new_dbname_utf8,/*!< in: database name, e.g. 'newdb' */
- const char* new_tablename_utf8)/*!< in: table name, e.g. 'newtable' */
+dberr_t dict_stats_rename_table(const char *old_name, const char *new_name,
+ trx_t *trx)
{
- pars_info_t* pinfo;
- dberr_t ret;
-
- ut_d(dict_sys.assert_locked());
-
- pinfo = pars_info_create();
-
- pars_info_add_str_literal(pinfo, "old_dbname_utf8", old_dbname_utf8);
- pars_info_add_str_literal(pinfo, "old_tablename_utf8", old_tablename_utf8);
- pars_info_add_str_literal(pinfo, "new_dbname_utf8", new_dbname_utf8);
- pars_info_add_str_literal(pinfo, "new_tablename_utf8", new_tablename_utf8);
-
- ret = dict_stats_exec_sql(
- pinfo,
- "PROCEDURE RENAME_TABLE_IN_INDEX_STATS () IS\n"
- "BEGIN\n"
- "UPDATE \"" INDEX_STATS_NAME "\" SET\n"
- "database_name = :new_dbname_utf8,\n"
- "table_name = :new_tablename_utf8\n"
- "WHERE\n"
- "database_name = :old_dbname_utf8 AND\n"
- "table_name = :old_tablename_utf8;\n"
- "END;\n", NULL);
+ /* skip the statistics tables themselves */
+ if (!strcmp(old_name, TABLE_STATS_NAME) ||
+ !strcmp(old_name, INDEX_STATS_NAME) ||
+ !strcmp(new_name, TABLE_STATS_NAME) ||
+ !strcmp(new_name, INDEX_STATS_NAME))
+ return DB_SUCCESS;
+
+ char old_db[MAX_DB_UTF8_LEN];
+ char new_db[MAX_DB_UTF8_LEN];
+ char old_table[MAX_TABLE_UTF8_LEN];
+ char new_table[MAX_TABLE_UTF8_LEN];
+
+ dict_fs2utf8(old_name, old_db, sizeof old_db, old_table, sizeof old_table);
+ dict_fs2utf8(new_name, new_db, sizeof new_db, new_table, sizeof new_table);
+
+ if (dict_table_t::is_temporary_name(old_name) ||
+ dict_table_t::is_temporary_name(new_name))
+ {
+ if (dberr_t e= dict_stats_delete_from_table_stats(old_db, old_table, trx))
+ return e;
+ return dict_stats_delete_from_index_stats(old_db, old_table, trx);
+ }
- return(ret);
+ pars_info_t *pinfo= pars_info_create();
+ pars_info_add_str_literal(pinfo, "old_db", old_db);
+ pars_info_add_str_literal(pinfo, "old_table", old_table);
+ pars_info_add_str_literal(pinfo, "new_db", new_db);
+ pars_info_add_str_literal(pinfo, "new_table", new_table);
+
+ static const char sql[]=
+ "PROCEDURE RENAME_TABLE_IN_STATS() IS\n"
+ "BEGIN\n"
+ "UPDATE \"" TABLE_STATS_NAME "\" SET\n"
+ "database_name=:new_db, table_name=:new_table\n"
+ "WHERE database_name=:old_db AND table_name=:old_table;\n"
+ "UPDATE \"" INDEX_STATS_NAME "\" SET\n"
+ "database_name=:new_db, table_name=:new_table\n"
+ "WHERE database_name=:old_db AND table_name=:old_table;\n"
+ "END;\n";
+
+ return dict_stats_exec_sql(pinfo, sql, trx);
}
-/*********************************************************************//**
-Renames a table in InnoDB persistent stats storage.
-This function creates its own transaction and commits it.
+/** Rename an index in InnoDB persistent statistics.
+@param db database name
+@param table table name
+@param old_name old table name
+@param new_name new table name
+@param trx transaction
@return DB_SUCCESS or error code */
-dberr_t
-dict_stats_rename_table(
-/*====================*/
- const char* old_name, /*!< in: old name, e.g. 'db/table' */
- const char* new_name, /*!< in: new name, e.g. 'db/table' */
- char* errstr, /*!< out: error string if != DB_SUCCESS
- is returned */
- size_t errstr_sz) /*!< in: errstr size */
+dberr_t dict_stats_rename_index(const char *db, const char *table,
+ const char *old_name, const char *new_name,
+ trx_t *trx)
{
- char old_db_utf8[MAX_DB_UTF8_LEN];
- char new_db_utf8[MAX_DB_UTF8_LEN];
- char old_table_utf8[MAX_TABLE_UTF8_LEN];
- char new_table_utf8[MAX_TABLE_UTF8_LEN];
- dberr_t ret;
-
- /* skip innodb_table_stats and innodb_index_stats themselves */
- if (strcmp(old_name, TABLE_STATS_NAME) == 0
- || strcmp(old_name, INDEX_STATS_NAME) == 0
- || strcmp(new_name, TABLE_STATS_NAME) == 0
- || strcmp(new_name, INDEX_STATS_NAME) == 0) {
-
- return(DB_SUCCESS);
- }
-
- dict_fs2utf8(old_name, old_db_utf8, sizeof(old_db_utf8),
- old_table_utf8, sizeof(old_table_utf8));
-
- dict_fs2utf8(new_name, new_db_utf8, sizeof(new_db_utf8),
- new_table_utf8, sizeof(new_table_utf8));
-
- dict_sys_lock();
-
- ulint n_attempts = 0;
- do {
- n_attempts++;
-
- ret = dict_stats_rename_table_in_table_stats(
- old_db_utf8, old_table_utf8,
- new_db_utf8, new_table_utf8);
-
- if (ret == DB_DUPLICATE_KEY) {
- dict_stats_delete_from_table_stats(
- new_db_utf8, new_table_utf8);
- }
-
- if (ret == DB_STATS_DO_NOT_EXIST) {
- ret = DB_SUCCESS;
- }
-
- if (ret != DB_SUCCESS) {
- dict_sys_unlock();
- os_thread_sleep(200000 /* 0.2 sec */);
- dict_sys_lock();
- }
- } while ((ret == DB_DEADLOCK
- || ret == DB_DUPLICATE_KEY
- || ret == DB_LOCK_WAIT_TIMEOUT)
- && n_attempts < 5);
-
- if (ret != DB_SUCCESS) {
- snprintf(errstr, errstr_sz,
- "Unable to rename statistics from"
- " %s.%s to %s.%s in %s: %s."
- " They can be renamed later using"
-
- " UPDATE %s SET"
- " database_name = '%s',"
- " table_name = '%s'"
- " WHERE"
- " database_name = '%s' AND"
- " table_name = '%s';",
-
- old_db_utf8, old_table_utf8,
- new_db_utf8, new_table_utf8,
- TABLE_STATS_NAME_PRINT,
- ut_strerr(ret),
-
- TABLE_STATS_NAME_PRINT,
- new_db_utf8, new_table_utf8,
- old_db_utf8, old_table_utf8);
- dict_sys_unlock();
- return(ret);
- }
- /* else */
-
- n_attempts = 0;
- do {
- n_attempts++;
-
- ret = dict_stats_rename_table_in_index_stats(
- old_db_utf8, old_table_utf8,
- new_db_utf8, new_table_utf8);
-
- if (ret == DB_DUPLICATE_KEY) {
- dict_stats_delete_from_index_stats(
- new_db_utf8, new_table_utf8);
- }
-
- if (ret == DB_STATS_DO_NOT_EXIST) {
- ret = DB_SUCCESS;
- }
-
- if (ret != DB_SUCCESS) {
- dict_sys_unlock();
- os_thread_sleep(200000 /* 0.2 sec */);
- dict_sys_lock();
- }
- } while ((ret == DB_DEADLOCK
- || ret == DB_DUPLICATE_KEY
- || ret == DB_LOCK_WAIT_TIMEOUT)
- && n_attempts < 5);
-
- dict_sys_unlock();
-
- if (ret != DB_SUCCESS) {
- snprintf(errstr, errstr_sz,
- "Unable to rename statistics from"
- " %s.%s to %s.%s in %s: %s."
- " They can be renamed later using"
-
- " UPDATE %s SET"
- " database_name = '%s',"
- " table_name = '%s'"
- " WHERE"
- " database_name = '%s' AND"
- " table_name = '%s';",
-
- old_db_utf8, old_table_utf8,
- new_db_utf8, new_table_utf8,
- INDEX_STATS_NAME_PRINT,
- ut_strerr(ret),
-
- INDEX_STATS_NAME_PRINT,
- new_db_utf8, new_table_utf8,
- old_db_utf8, old_table_utf8);
- }
-
- return(ret);
+ if (!dict_stats_persistent_storage_check(true))
+ return DB_STATS_DO_NOT_EXIST;
+ pars_info_t *pinfo= pars_info_create();
+
+ pars_info_add_str_literal(pinfo, "db", db);
+ pars_info_add_str_literal(pinfo, "table", table);
+ pars_info_add_str_literal(pinfo, "old", old_name);
+ pars_info_add_str_literal(pinfo, "new", new_name);
+
+ static const char sql[]=
+ "PROCEDURE RENAME_INDEX_IN_STATS() IS\n"
+ "BEGIN\n"
+ "UPDATE \"" INDEX_STATS_NAME "\" SET index_name=:new\n"
+ "WHERE database_name=:db AND table_name=:table AND index_name=:old;\n"
+ "END;\n";
+
+ return dict_stats_exec_sql(pinfo, sql, trx);
}
-/*********************************************************************//**
-Renames an index in InnoDB persistent stats storage.
-This function creates its own transaction and commits it.
-@return DB_SUCCESS or error code. DB_STATS_DO_NOT_EXIST will be returned
-if the persistent stats do not exist. */
-dberr_t
-dict_stats_rename_index(
-/*====================*/
- const dict_table_t* table, /*!< in: table whose index
- is renamed */
- const char* old_index_name, /*!< in: old index name */
- const char* new_index_name) /*!< in: new index name */
+/** Delete all persistent statistics for a database.
+@param db database name
+@param trx transaction
+@return DB_SUCCESS or error code */
+dberr_t dict_stats_delete(const char *db, trx_t *trx)
{
- dict_sys_lock();
-
- if (!dict_stats_persistent_storage_check(true)) {
- dict_sys_unlock();
- return(DB_STATS_DO_NOT_EXIST);
- }
-
- char dbname_utf8[MAX_DB_UTF8_LEN];
- char tablename_utf8[MAX_TABLE_UTF8_LEN];
-
- dict_fs2utf8(table->name.m_name, dbname_utf8, sizeof(dbname_utf8),
- tablename_utf8, sizeof(tablename_utf8));
-
- pars_info_t* pinfo;
-
- pinfo = pars_info_create();
-
- pars_info_add_str_literal(pinfo, "dbname_utf8", dbname_utf8);
- pars_info_add_str_literal(pinfo, "tablename_utf8", tablename_utf8);
- pars_info_add_str_literal(pinfo, "new_index_name", new_index_name);
- pars_info_add_str_literal(pinfo, "old_index_name", old_index_name);
-
- dberr_t ret;
-
- ret = dict_stats_exec_sql(
- pinfo,
- "PROCEDURE RENAME_INDEX_IN_INDEX_STATS () IS\n"
- "BEGIN\n"
- "UPDATE \"" INDEX_STATS_NAME "\" SET\n"
- "index_name = :new_index_name\n"
- "WHERE\n"
- "database_name = :dbname_utf8 AND\n"
- "table_name = :tablename_utf8 AND\n"
- "index_name = :old_index_name;\n"
- "END;\n", NULL);
-
- dict_sys_unlock();
-
- return(ret);
+ static const char sql[] =
+ "PROCEDURE DROP_DATABASE_STATS () IS\n"
+ "BEGIN\n"
+ "DELETE FROM \"" TABLE_STATS_NAME "\" WHERE database_name=:db;\n"
+ "DELETE FROM \"" INDEX_STATS_NAME "\" WHERE database_name=:db;\n"
+ "END;\n";
+
+ pars_info_t *pinfo= pars_info_create();
+ pars_info_add_str_literal(pinfo, "db", db);
+ return dict_stats_exec_sql(pinfo, sql, trx);
}
/* tests @{ */
#ifdef UNIV_ENABLE_UNIT_TEST_DICT_STATS
-
-/* The following unit tests test some of the functions in this file
-individually, such testing cannot be performed by the mysql-test framework
-via SQL. */
-
-/* test_dict_table_schema_check() @{ */
-void
-test_dict_table_schema_check()
-{
- /*
- CREATE TABLE tcheck (
- c01 VARCHAR(123),
- c02 INT,
- c03 INT NOT NULL,
- c04 INT UNSIGNED,
- c05 BIGINT,
- c06 BIGINT UNSIGNED NOT NULL,
- c07 TIMESTAMP
- ) ENGINE=INNODB;
- */
- /* definition for the table 'test/tcheck' */
- dict_col_meta_t columns[] = {
- {"c01", DATA_VARCHAR, 0, 123},
- {"c02", DATA_INT, 0, 4},
- {"c03", DATA_INT, DATA_NOT_NULL, 4},
- {"c04", DATA_INT, DATA_UNSIGNED, 4},
- {"c05", DATA_INT, 0, 8},
- {"c06", DATA_INT, DATA_NOT_NULL | DATA_UNSIGNED, 8},
- {"c07", DATA_INT, 0, 4},
- {"c_extra", DATA_INT, 0, 4}
- };
- dict_table_schema_t schema = {
- "test/tcheck",
- 0 /* will be set individually for each test below */,
- columns
- };
- char errstr[512];
-
- snprintf(errstr, sizeof(errstr), "Table not found");
-
- /* prevent any data dictionary modifications while we are checking
- the tables' structure */
-
- mutex_enter(&dict_sys.mutex);
-
- /* check that a valid table is reported as valid */
- schema.n_cols = 7;
- if (dict_table_schema_check(&schema, errstr, sizeof(errstr))
- == DB_SUCCESS) {
- printf("OK: test.tcheck ok\n");
- } else {
- printf("ERROR: %s\n", errstr);
- printf("ERROR: test.tcheck not present or corrupted\n");
- goto test_dict_table_schema_check_end;
- }
-
- /* check columns with wrong length */
- schema.columns[1].len = 8;
- if (dict_table_schema_check(&schema, errstr, sizeof(errstr))
- != DB_SUCCESS) {
- printf("OK: test.tcheck.c02 has different length and is"
- " reported as corrupted\n");
- } else {
- printf("OK: test.tcheck.c02 has different length but is"
- " reported as ok\n");
- goto test_dict_table_schema_check_end;
- }
- schema.columns[1].len = 4;
-
- /* request that c02 is NOT NULL while actually it does not have
- this flag set */
- schema.columns[1].prtype_mask |= DATA_NOT_NULL;
- if (dict_table_schema_check(&schema, errstr, sizeof(errstr))
- != DB_SUCCESS) {
- printf("OK: test.tcheck.c02 does not have NOT NULL while"
- " it should and is reported as corrupted\n");
- } else {
- printf("ERROR: test.tcheck.c02 does not have NOT NULL while"
- " it should and is not reported as corrupted\n");
- goto test_dict_table_schema_check_end;
- }
- schema.columns[1].prtype_mask &= ~DATA_NOT_NULL;
-
- /* check a table that contains some extra columns */
- schema.n_cols = 6;
- if (dict_table_schema_check(&schema, errstr, sizeof(errstr))
- == DB_SUCCESS) {
- printf("ERROR: test.tcheck has more columns but is not"
- " reported as corrupted\n");
- goto test_dict_table_schema_check_end;
- } else {
- printf("OK: test.tcheck has more columns and is"
- " reported as corrupted\n");
- }
-
- /* check a table that has some columns missing */
- schema.n_cols = 8;
- if (dict_table_schema_check(&schema, errstr, sizeof(errstr))
- != DB_SUCCESS) {
- printf("OK: test.tcheck has missing columns and is"
- " reported as corrupted\n");
- } else {
- printf("ERROR: test.tcheck has missing columns but is"
- " reported as ok\n");
- goto test_dict_table_schema_check_end;
- }
-
- /* check non-existent table */
- schema.table_name = "test/tcheck_nonexistent";
- if (dict_table_schema_check(&schema, errstr, sizeof(errstr))
- != DB_SUCCESS) {
- printf("OK: test.tcheck_nonexistent is not present\n");
- } else {
- printf("ERROR: test.tcheck_nonexistent is present!?\n");
- goto test_dict_table_schema_check_end;
- }
-
-test_dict_table_schema_check_end:
-
- mutex_exit(&dict_sys.mutex);
-}
-/* @} */
-
/* save/fetch aux macros @{ */
#define TEST_DATABASE_NAME "foobardb"
#define TEST_TABLE_NAME "test_dict_stats"
diff --git a/storage/innobase/dict/dict0stats_bg.cc b/storage/innobase/dict/dict0stats_bg.cc
index 280c5c1dec8..a66aac226a3 100644
--- a/storage/innobase/dict/dict0stats_bg.cc
+++ b/storage/innobase/dict/dict0stats_bg.cc
@@ -31,6 +31,7 @@ Created Apr 25, 2012 Vasil Dimov
#include "row0mysql.h"
#include "srv0start.h"
#include "fil0fil.h"
+#include "mysqld.h"
#ifdef WITH_WSREP
# include "trx0trx.h"
# include "mysql/service_wsrep.h"
@@ -44,22 +45,23 @@ Created Apr 25, 2012 Vasil Dimov
#define MIN_RECALC_INTERVAL 10 /* seconds */
static void dict_stats_schedule(int ms);
-/** This mutex protects the "recalc_pool" variable. */
-static ib_mutex_t recalc_pool_mutex;
+/** Protects recalc_pool */
+static mysql_mutex_t recalc_pool_mutex;
-/** Allocator type, used by std::vector */
-typedef ut_allocator<table_id_t>
- recalc_pool_allocator_t;
+/** for signaling recalc::state */
+static pthread_cond_t recalc_pool_cond;
-/** The multitude of tables whose stats are to be automatically
-recalculated - an STL vector */
-typedef std::vector<table_id_t, recalc_pool_allocator_t>
- recalc_pool_t;
+/** Work item of the recalc_pool; protected by recalc_pool_mutex */
+struct recalc
+{
+ /** identifies a table with persistent statistics */
+ table_id_t id;
+ /** state of the entry */
+ enum { IDLE, IN_PROGRESS, IN_PROGRESS_DELETING, DELETING} state;
+};
-/** Iterator type for iterating over the elements of objects of type
-recalc_pool_t. */
-typedef recalc_pool_t::iterator
- recalc_pool_iterator_t;
+/** The multitude of tables whose stats are to be automatically recalculated */
+typedef std::vector<recalc, ut_allocator<recalc>> recalc_pool_t;
/** Pool where we store information on which tables are to be processed
by background statistics gathering. */
@@ -96,35 +98,23 @@ background stats gathering thread. Only the table id is added to the
list, so the table can be closed after being enqueued and it will be
opened when needed. If the table does not exist later (has been DROPped),
then it will be removed from the pool and skipped. */
-static
-void
-dict_stats_recalc_pool_add(
-/*=======================*/
- const dict_table_t* table, /*!< in: table to add */
- bool schedule_dict_stats_task = true /*!< in: schedule dict stats task */
-)
+static void dict_stats_recalc_pool_add(table_id_t id)
{
- ut_ad(!srv_read_only_mode);
-
- mutex_enter(&recalc_pool_mutex);
+ ut_ad(!srv_read_only_mode);
+ ut_ad(id);
+ bool schedule = false;
+ mysql_mutex_lock(&recalc_pool_mutex);
- /* quit if already in the list */
- for (recalc_pool_iterator_t iter = recalc_pool.begin();
- iter != recalc_pool.end();
- ++iter) {
-
- if (*iter == table->id) {
- mutex_exit(&recalc_pool_mutex);
- return;
- }
- }
-
- recalc_pool.push_back(table->id);
- if (recalc_pool.size() == 1 && schedule_dict_stats_task) {
- dict_stats_schedule_now();
- }
- mutex_exit(&recalc_pool_mutex);
+ const auto begin= recalc_pool.begin(), end= recalc_pool.end();
+ if (end == std::find_if(begin, end, [&](const recalc &r){return r.id == id;}))
+ {
+ recalc_pool.emplace_back(recalc{id, recalc::IDLE});
+ schedule = true;
+ }
+ mysql_mutex_unlock(&recalc_pool_mutex);
+ if (schedule)
+ dict_stats_schedule_now();
}
#ifdef WITH_WSREP
@@ -140,8 +130,6 @@ schedule new estimates for table and index statistics to be calculated.
void dict_stats_update_if_needed_func(dict_table_t *table)
#endif
{
- ut_ad(!mutex_own(&dict_sys.mutex));
-
if (UNIV_UNLIKELY(!table->stat_initialized)) {
/* The table may have been evicted from dict_sys
and reloaded internally by InnoDB for FOREIGN KEY
@@ -162,6 +150,9 @@ void dict_stats_update_if_needed_func(dict_table_t *table)
ulonglong n_rows = dict_table_get_n_rows(table);
if (dict_stats_is_persistent_enabled(table)) {
+ if (table->name.is_temporary()) {
+ return;
+ }
if (counter > n_rows / 10 /* 10% */
&& dict_stats_auto_recalc_is_enabled(table)) {
@@ -189,7 +180,7 @@ void dict_stats_update_if_needed_func(dict_table_t *table)
}
#endif /* WITH_WSREP */
- dict_stats_recalc_pool_add(table);
+ dict_stats_recalc_pool_add(table->id);
table->stat_modified_counter = 0;
}
return;
@@ -211,83 +202,49 @@ void dict_stats_update_if_needed_func(dict_table_t *table)
}
}
-/*****************************************************************//**
-Get a table from the auto recalc pool. The returned table id is removed
-from the pool.
-@return true if the pool was non-empty and "id" was set, false otherwise */
-static
-bool
-dict_stats_recalc_pool_get(
-/*=======================*/
- table_id_t* id) /*!< out: table id, or unmodified if list is
- empty */
+/** Delete a table from the auto recalc pool, and ensure that
+no statistics are being updated on it. */
+void dict_stats_recalc_pool_del(table_id_t id, bool have_mdl_exclusive)
{
- ut_ad(!srv_read_only_mode);
+ ut_ad(!srv_read_only_mode);
+ ut_ad(id);
- mutex_enter(&recalc_pool_mutex);
-
- if (recalc_pool.empty()) {
- mutex_exit(&recalc_pool_mutex);
- return(false);
- }
+ mysql_mutex_lock(&recalc_pool_mutex);
- *id = recalc_pool.at(0);
-
- recalc_pool.erase(recalc_pool.begin());
-
- mutex_exit(&recalc_pool_mutex);
-
- return(true);
-}
-
-/*****************************************************************//**
-Delete a given table from the auto recalc pool.
-dict_stats_recalc_pool_del() */
-void
-dict_stats_recalc_pool_del(
-/*=======================*/
- const dict_table_t* table) /*!< in: table to remove */
-{
- ut_ad(!srv_read_only_mode);
- ut_ad(mutex_own(&dict_sys.mutex));
-
- mutex_enter(&recalc_pool_mutex);
-
- ut_ad(table->id > 0);
-
- for (recalc_pool_iterator_t iter = recalc_pool.begin();
- iter != recalc_pool.end();
- ++iter) {
-
- if (*iter == table->id) {
- /* erase() invalidates the iterator */
- recalc_pool.erase(iter);
- break;
- }
- }
-
- mutex_exit(&recalc_pool_mutex);
-}
+ auto end= recalc_pool.end();
+ auto i= std::find_if(recalc_pool.begin(), end,
+ [&](const recalc &r){return r.id == id;});
+ if (i != end)
+ {
+ switch (i->state) {
+ case recalc::IN_PROGRESS:
+ if (!have_mdl_exclusive)
+ {
+ i->state= recalc::IN_PROGRESS_DELETING;
+ do
+ {
+ my_cond_wait(&recalc_pool_cond, &recalc_pool_mutex.m_mutex);
+ end= recalc_pool.end();
+ i= std::find_if(recalc_pool.begin(), end,
+ [&](const recalc &r){return r.id == id;});
+ if (i == end)
+ goto done;
+ }
+ while (i->state == recalc::IN_PROGRESS_DELETING);
+ }
+ /* fall through */
+ case recalc::IDLE:
+ recalc_pool.erase(i);
+ break;
+ case recalc::IN_PROGRESS_DELETING:
+ case recalc::DELETING:
+ /* another thread will delete the entry in dict_stats_recalc_pool_del() */
+ break;
+ }
+ }
-/*****************************************************************//**
-Wait until background stats thread has stopped using the specified table.
-The caller must have locked the data dictionary using
-row_mysql_lock_data_dictionary() and this function may unlock it temporarily
-and restore the lock before it exits.
-The background stats thread is guaranteed not to start using the specified
-table after this function returns and before the caller unlocks the data
-dictionary because it sets the BG_STAT_IN_PROGRESS bit in table->stats_bg_flag
-under dict_sys.mutex. */
-void
-dict_stats_wait_bg_to_stop_using_table(
-/*===================================*/
- dict_table_t* table, /*!< in/out: table */
- trx_t* trx) /*!< in/out: transaction to use for
- unlocking/locking the data dict */
-{
- while (!dict_stats_stop_bg(table)) {
- DICT_BG_YIELD(trx);
- }
+done:
+ mysql_mutex_unlock(&recalc_pool_mutex);
}
/*****************************************************************//**
@@ -295,26 +252,11 @@ Initialize global variables needed for the operation of dict_stats_thread()
Must be called before dict_stats_thread() is started. */
void dict_stats_init()
{
- ut_ad(!srv_read_only_mode);
-
- /* The recalc_pool_mutex is acquired from:
- 1) the background stats gathering thread before any other latch
- and released without latching anything else in between (thus
- any level would do here)
- 2) from dict_stats_update_if_needed()
- and released without latching anything else in between. We know
- that dict_sys.mutex (SYNC_DICT) is not acquired when
- dict_stats_update_if_needed() is called and it may be acquired
- inside that function (thus a level <=SYNC_DICT would do).
- 3) from row_drop_table_for_mysql() after dict_sys.mutex (SYNC_DICT)
- and dict_sys.latch (SYNC_DICT_OPERATION) have been locked
- (thus a level <SYNC_DICT && <SYNC_DICT_OPERATION would do)
- So we choose SYNC_STATS_AUTO_RECALC to be about below SYNC_DICT. */
-
- mutex_create(LATCH_ID_RECALC_POOL, &recalc_pool_mutex);
-
- dict_defrag_pool_init();
- stats_initialised = true;
+ ut_ad(!srv_read_only_mode);
+ mysql_mutex_init(recalc_pool_mutex_key, &recalc_pool_mutex, nullptr);
+ pthread_cond_init(&recalc_pool_cond, nullptr);
+ dict_defrag_pool_init();
+ stats_initialised= true;
}
/*****************************************************************//**
@@ -332,82 +274,102 @@ void dict_stats_deinit()
dict_stats_recalc_pool_deinit();
dict_defrag_pool_deinit();
- mutex_free(&recalc_pool_mutex);
+ mysql_mutex_destroy(&recalc_pool_mutex);
+ pthread_cond_destroy(&recalc_pool_cond);
}
/**
Get the first table that has been added for auto recalc and eventually
update its stats.
@return whether the first entry can be processed immediately */
-static bool dict_stats_process_entry_from_recalc_pool()
+static bool dict_stats_process_entry_from_recalc_pool(THD *thd)
{
- table_id_t table_id;
-
- ut_ad(!srv_read_only_mode);
-
-next_table_id:
- /* pop the first table from the auto recalc pool */
- if (!dict_stats_recalc_pool_get(&table_id)) {
- /* no tables for auto recalc */
- return false;
- }
-
- dict_table_t* table;
-
- mutex_enter(&dict_sys.mutex);
-
- table = dict_table_open_on_id(table_id, TRUE, DICT_TABLE_OP_NORMAL);
-
- if (table == NULL) {
- /* table does not exist, must have been DROPped
- after its id was enqueued */
- mutex_exit(&dict_sys.mutex);
- goto next_table_id;
- }
-
- ut_ad(!table->is_temporary());
-
- if (!table->is_accessible()) {
- dict_table_close(table, TRUE, FALSE);
- mutex_exit(&dict_sys.mutex);
- goto next_table_id;
- }
-
- table->stats_bg_flag |= BG_STAT_IN_PROGRESS;
-
- mutex_exit(&dict_sys.mutex);
-
- /* time() could be expensive, the current function
- is called once every time a table has been changed more than 10% and
- on a system with lots of small tables, this could become hot. If we
- find out that this is a problem, then the check below could eventually
- be replaced with something else, though a time interval is the natural
- approach. */
- int ret;
- if (difftime(time(NULL), table->stats_last_recalc)
- < MIN_RECALC_INTERVAL) {
-
- /* Stats were (re)calculated not long ago. To avoid
- too frequent stats updates we put back the table on
- the auto recalc list and do nothing. */
-
- dict_stats_recalc_pool_add(table, false);
- dict_stats_schedule(MIN_RECALC_INTERVAL*1000);
- ret = false;
- } else {
-
- dict_stats_update(table, DICT_STATS_RECALC_PERSISTENT);
- ret = true;
- }
+ ut_ad(!srv_read_only_mode);
+ table_id_t table_id;
+ mysql_mutex_lock(&recalc_pool_mutex);
+next_table_id_with_mutex:
+ for (auto &r : recalc_pool)
+ {
+ if ((table_id= r.id) && r.state == recalc::IDLE)
+ {
+ r.state= recalc::IN_PROGRESS;
+ mysql_mutex_unlock(&recalc_pool_mutex);
+ goto process;
+ }
+ }
+ mysql_mutex_unlock(&recalc_pool_mutex);
+ return false;
+
+process:
+ MDL_ticket *mdl= nullptr;
+ dict_table_t *table= dict_table_open_on_id(table_id, false,
+ DICT_TABLE_OP_NORMAL, thd, &mdl);
+ if (!table)
+ {
+invalid_table_id:
+ mysql_mutex_lock(&recalc_pool_mutex);
+ auto i= std::find_if(recalc_pool.begin(), recalc_pool.end(),
+ [&](const recalc &r){return r.id == table_id;});
+ if (i == recalc_pool.end());
+ else if (UNIV_LIKELY(i->state == recalc::IN_PROGRESS))
+ recalc_pool.erase(i);
+ else
+ {
+ ut_ad(i->state == recalc::IN_PROGRESS_DELETING);
+ i->state= recalc::DELETING;
+ pthread_cond_broadcast(&recalc_pool_cond);
+ }
+ goto next_table_id_with_mutex;
+ }
- mutex_enter(&dict_sys.mutex);
+ ut_ad(!table->is_temporary());
- table->stats_bg_flag = BG_STAT_NONE;
+ if (!mdl || !table->is_accessible())
+ {
+ dict_table_close(table, false, thd, mdl);
+ goto invalid_table_id;
+ }
- dict_table_close(table, TRUE, FALSE);
+ /* time() could be expensive, the current function
+ is called once every time a table has been changed more than 10% and
+ on a system with lots of small tables, this could become hot. If we
+ find out that this is a problem, then the check below could eventually
+ be replaced with something else, though a time interval is the natural
+ approach. */
+ const bool update_now=
+ difftime(time(nullptr), table->stats_last_recalc) >= MIN_RECALC_INTERVAL;
+
+ const dberr_t err= update_now
+ ? dict_stats_update(table, DICT_STATS_RECALC_PERSISTENT)
+ : DB_SUCCESS_LOCKED_REC;
+
+ dict_table_close(table, false, thd, mdl);
+
+ mysql_mutex_lock(&recalc_pool_mutex);
+ auto i= std::find_if(recalc_pool.begin(), recalc_pool.end(),
+ [&](const recalc &r){return r.id == table_id;});
+ if (i == recalc_pool.end())
+ goto done;
+ else if (i->state == recalc::IN_PROGRESS_DELETING)
+ {
+ i->state= recalc::DELETING;
+ pthread_cond_broadcast(&recalc_pool_cond);
+done:
+ mysql_mutex_unlock(&recalc_pool_mutex);
+ }
+ else
+ {
+ ut_ad(i->state == recalc::IN_PROGRESS);
+ recalc_pool.erase(i);
+ const bool reschedule= !update_now && recalc_pool.empty();
+ if (err == DB_SUCCESS_LOCKED_REC)
+ recalc_pool.emplace_back(recalc{table_id, recalc::IDLE});
+ mysql_mutex_unlock(&recalc_pool_mutex);
+ if (reschedule)
+ dict_stats_schedule(MIN_RECALC_INTERVAL * 1000);
+ }
- mutex_exit(&dict_sys.mutex);
- return ret;
+ return update_now;
}
static tpool::timer* dict_stats_timer;
@@ -415,8 +377,12 @@ static std::mutex dict_stats_mutex;
static void dict_stats_func(void*)
{
- while (dict_stats_process_entry_from_recalc_pool()) {}
- dict_defrag_process_entries_from_defrag_pool();
+ THD *thd= innobase_create_background_thd("InnoDB statistics");
+ set_current_thd(thd);
+ while (dict_stats_process_entry_from_recalc_pool(thd)) {}
+ dict_defrag_process_entries_from_defrag_pool(thd);
+ set_current_thd(nullptr);
+ destroy_background_thd(thd);
}
diff --git a/storage/innobase/dict/drop.cc b/storage/innobase/dict/drop.cc
new file mode 100644
index 00000000000..9013841ba5e
--- /dev/null
+++ b/storage/innobase/dict/drop.cc
@@ -0,0 +1,288 @@
+/*****************************************************************************
+
+Copyright (c) 2021, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**
+@file dict/drop.cc
+Data Dictionary Language operations that delete .ibd files */
+
+/* We implement atomic data dictionary operations as follows.
+
+1. A data dictionary transaction is started.
+2. We acquire exclusive lock on all the tables that are to be dropped
+during the execution of the transaction.
+3. We lock the data dictionary cache.
+4. All metadata tables will be updated within the single DDL transaction,
+including deleting or renaming InnoDB persistent statistics.
+4b. If any lock wait would occur while we are holding the dict_sys latches,
+we will instantly report a timeout error and roll back the transaction.
+5. The transaction metadata is marked as committed.
+6. If any files were deleted, we will durably write FILE_DELETE
+to the redo log and start deleting the files.
+6b. Also purge after a commit may perform file deletion. This is also the
+recovery mechanism if the server was killed between step 5 and 6.
+7. We unlock the data dictionary cache.
+8. The file handles of the unlinked files will be closed. This will actually
+reclaim the space in the file system (delete-on-close semantics).
+
+Notes:
+
+(a) Purge will be locked out by MDL. For internal tables related to
+FULLTEXT INDEX, purge will not acquire MDL on the user table name,
+and therefore, when we are dropping any FTS_ tables, we must suspend
+and resume purge to prevent a race condition.
+
+(b) If a transaction needs to both drop and create a table by some
+name, it must rename the table in between. This is used by
+ha_innobase::truncate() and fts_drop_common_tables().
+
+(c) No data is ever destroyed before the transaction is committed,
+so we can trivially roll back the transaction at any time.
+Lock waits during a DDL operation are no longer a fatal error
+that would cause the InnoDB to hang or to intentionally crash.
+(Only ALTER TABLE...DISCARD TABLESPACE may discard data before commit.)
+
+(d) The only changes to the data dictionary cache that are performed
+before transaction commit and must be rolled back explicitly are as follows:
+(d1) fts_optimize_add_table() to undo fts_optimize_remove_table()
+*/
+
+#include "trx0purge.h"
+#include "dict0dict.h"
+#include "dict0stats.h"
+#include "dict0stats_bg.h"
+
+#include "dict0defrag_bg.h"
+#include "btr0defragment.h"
+#include "lock0lock.h"
+
+#include "que0que.h"
+#include "pars0pars.h"
+
+/** Try to drop the foreign key constraints for a persistent table.
+@param name name of persistent table
+@return error code */
+dberr_t trx_t::drop_table_foreign(const table_name_t &name)
+{
+ ut_ad(dict_sys.locked());
+ ut_ad(state == TRX_STATE_ACTIVE);
+ ut_ad(dict_operation);
+ ut_ad(dict_operation_lock_mode);
+
+ if (!dict_sys.sys_foreign || dict_sys.sys_foreign->corrupted)
+ return DB_SUCCESS;
+
+ if (!dict_sys.sys_foreign_cols || dict_sys.sys_foreign_cols->corrupted)
+ return DB_SUCCESS;
+
+ pars_info_t *info= pars_info_create();
+ pars_info_add_str_literal(info, "name", name.m_name);
+ return que_eval_sql(info,
+ "PROCEDURE DROP_FOREIGN() IS\n"
+ "fid CHAR;\n"
+
+ "DECLARE CURSOR fk IS\n"
+ "SELECT ID FROM SYS_FOREIGN\n"
+ "WHERE FOR_NAME=:name\n"
+ "AND TO_BINARY(FOR_NAME)=TO_BINARY(:name)\n"
+ "FOR UPDATE;\n"
+
+ "BEGIN\n"
+ "OPEN fk;\n"
+ "WHILE 1=1 LOOP\n"
+ " FETCH fk INTO fid;\n"
+ " IF (SQL % NOTFOUND)THEN RETURN;END IF;\n"
+ " DELETE FROM SYS_FOREIGN_COLS"
+ " WHERE ID=fid;\n"
+ " DELETE FROM SYS_FOREIGN WHERE ID=fid;\n"
+ "END LOOP;\n"
+ "CLOSE fk;\n"
+ "END;\n", this);
+}
+
+/** Try to drop the statistics for a persistent table.
+@param name name of persistent table
+@return error code */
+dberr_t trx_t::drop_table_statistics(const table_name_t &name)
+{
+ ut_ad(dict_sys.locked());
+ ut_ad(dict_operation_lock_mode);
+
+ if (strstr(name.m_name, "/" TEMP_FILE_PREFIX_INNODB) ||
+ !strcmp(name.m_name, TABLE_STATS_NAME) ||
+ !strcmp(name.m_name, INDEX_STATS_NAME))
+ return DB_SUCCESS;
+
+ char db[MAX_DB_UTF8_LEN], table[MAX_TABLE_UTF8_LEN];
+ dict_fs2utf8(name.m_name, db, sizeof db, table, sizeof table);
+
+ dberr_t err= dict_stats_delete_from_table_stats(db, table, this);
+ if (err == DB_SUCCESS || err == DB_STATS_DO_NOT_EXIST)
+ {
+ err= dict_stats_delete_from_index_stats(db, table, this);
+ if (err == DB_STATS_DO_NOT_EXIST)
+ err= DB_SUCCESS;
+ }
+ return err;
+}
+
+/** Try to drop a persistent table.
+@param table persistent table
+@param fk whether to drop FOREIGN KEY metadata
+@return error code */
+dberr_t trx_t::drop_table(const dict_table_t &table)
+{
+ ut_ad(dict_sys.locked());
+ ut_ad(state == TRX_STATE_ACTIVE);
+ ut_ad(dict_operation);
+ ut_ad(dict_operation_lock_mode);
+ ut_ad(!table.is_temporary());
+ /* The table must be exclusively locked by this transaction. */
+ ut_ad(table.get_ref_count() <= 1);
+ ut_ad(table.n_lock_x_or_s == 1);
+ ut_ad(UT_LIST_GET_LEN(table.locks) >= 1);
+#ifdef UNIV_DEBUG
+ bool found_x= false;
+ for (lock_t *lock= UT_LIST_GET_FIRST(table.locks); lock;
+ lock= UT_LIST_GET_NEXT(un_member.tab_lock.locks, lock))
+ {
+ ut_ad(lock->trx == this);
+ switch (lock->type_mode) {
+ case LOCK_TABLE | LOCK_X:
+ found_x= true;
+ break;
+ case LOCK_TABLE | LOCK_IX:
+ case LOCK_TABLE | LOCK_AUTO_INC:
+ break;
+ default:
+ ut_ad("unexpected lock type" == 0);
+ }
+ }
+ ut_ad(found_x);
+#endif
+
+ if (dict_sys.sys_virtual && !dict_sys.sys_virtual->corrupted)
+ {
+ pars_info_t *info= pars_info_create();
+ pars_info_add_ull_literal(info, "id", table.id);
+ if (dberr_t err= que_eval_sql(info,
+ "PROCEDURE DROP_VIRTUAL() IS\n"
+ "BEGIN\n"
+ "DELETE FROM SYS_VIRTUAL"
+ " WHERE TABLE_ID=:id;\n"
+ "END;\n", this))
+ return err;
+ }
+
+ /* Once DELETE FROM SYS_INDEXES is committed, purge may invoke
+ dict_drop_index_tree(). */
+
+ if (!(table.flags2 & (DICT_TF2_FTS_HAS_DOC_ID | DICT_TF2_FTS)));
+ else if (dberr_t err= fts_drop_tables(this, table))
+ {
+ ib::error() << "Unable to remove FTS tables for "
+ << table.name << ": " << err;
+ return err;
+ }
+
+ mod_tables.emplace(const_cast<dict_table_t*>(&table), undo_no).
+ first->second.set_dropped();
+
+ pars_info_t *info= pars_info_create();
+ pars_info_add_ull_literal(info, "id", table.id);
+ return que_eval_sql(info,
+ "PROCEDURE DROP_TABLE() IS\n"
+ "iid CHAR;\n"
+
+ "DECLARE CURSOR idx IS\n"
+ "SELECT ID FROM SYS_INDEXES\n"
+ "WHERE TABLE_ID=:id FOR UPDATE;\n"
+
+ "BEGIN\n"
+
+ "DELETE FROM SYS_TABLES WHERE ID=:id;\n"
+ "DELETE FROM SYS_COLUMNS WHERE TABLE_ID=:id;\n"
+
+ "OPEN idx;\n"
+ "WHILE 1 = 1 LOOP\n"
+ " FETCH idx INTO iid;\n"
+ " IF (SQL % NOTFOUND) THEN EXIT; END IF;\n"
+ " DELETE FROM SYS_INDEXES WHERE CURRENT OF idx;\n"
+ " DELETE FROM SYS_FIELDS WHERE INDEX_ID=iid;\n"
+ "END LOOP;\n"
+ "CLOSE idx;\n"
+
+ "END;\n", this);
+}
+
+/** Commit the transaction, possibly after drop_table().
+@param deleted handles of data files that were deleted */
+void trx_t::commit(std::vector<pfs_os_file_t> &deleted)
+{
+ ut_ad(dict_operation);
+ commit_persist();
+ if (dict_operation)
+ {
+ ut_ad(dict_sys.locked());
+ lock_sys.wr_lock(SRW_LOCK_CALL);
+ mutex_lock();
+ lock_release_on_drop(this);
+ ut_ad(UT_LIST_GET_LEN(lock.trx_locks) == 0);
+ ut_ad(ib_vector_is_empty(autoinc_locks));
+ mem_heap_empty(lock.lock_heap);
+ lock.table_locks.clear();
+ /* commit_persist() already reset this. */
+ ut_ad(!lock.was_chosen_as_deadlock_victim);
+ lock.n_rec_locks= 0;
+ while (dict_table_t *table= UT_LIST_GET_FIRST(lock.evicted_tables))
+ {
+ UT_LIST_REMOVE(lock.evicted_tables, table);
+ dict_mem_table_free(table);
+ }
+ dict_operation= false;
+ id= 0;
+ mutex_unlock();
+
+ for (const auto &p : mod_tables)
+ {
+ if (p.second.is_dropped())
+ {
+ dict_table_t *table= p.first;
+ dict_stats_recalc_pool_del(table->id, true);
+ dict_stats_defrag_pool_del(table, nullptr);
+ if (btr_defragment_active)
+ btr_defragment_remove_table(table);
+ const fil_space_t *space= table->space;
+ ut_ad(!p.second.is_aux_table() || purge_sys.must_wait_FTS());
+ dict_sys.remove(table);
+ if (const auto id= space ? space->id : 0)
+ {
+ pfs_os_file_t d= fil_delete_tablespace(id);
+ if (d != OS_FILE_CLOSED)
+ deleted.emplace_back(d);
+ }
+ }
+ }
+
+ lock_sys.wr_unlock();
+
+ mysql_mutex_lock(&lock_sys.wait_mutex);
+ lock_sys.deadlock_check();
+ mysql_mutex_unlock(&lock_sys.wait_mutex);
+ }
+ commit_cleanup();
+}
diff --git a/storage/innobase/eval/eval0eval.cc b/storage/innobase/eval/eval0eval.cc
index 193a5814a78..73ab113cff5 100644
--- a/storage/innobase/eval/eval0eval.cc
+++ b/storage/innobase/eval/eval0eval.cc
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 1997, 2016, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2019, MariaDB Corporation.
+Copyright (c) 2019, 2021, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -378,12 +378,23 @@ eval_substr(
str1 = static_cast<byte*>(dfield_get_data(que_node_get_val(arg1)));
+ const ulint str1_len = dfield_get_len(que_node_get_val(arg1));
+
len1 = (ulint) eval_node_get_int_val(arg2);
len2 = (ulint) eval_node_get_int_val(arg3);
dfield = que_node_get_val(func_node);
- dfield_set_data(dfield, str1 + len1, len2);
+ if (len1 > str1_len) {
+ len2 = 0;
+ } else {
+ str1 += len1;
+ if (len2 > str1_len - len1) {
+ len2 = str1_len - len1;
+ }
+ }
+
+ dfield_set_data(dfield, str1, len2);
}
/*****************************************************************//**
diff --git a/storage/innobase/fil/fil0crypt.cc b/storage/innobase/fil/fil0crypt.cc
index 3bc0f3e1f1f..fe75b7d58fa 100644
--- a/storage/innobase/fil/fil0crypt.cc
+++ b/storage/innobase/fil/fil0crypt.cc
@@ -44,45 +44,57 @@ Modified Jan Lindström jan.lindstrom@mariadb.com
static bool fil_crypt_threads_inited = false;
/** Is encryption enabled/disabled */
-UNIV_INTERN ulong srv_encrypt_tables = 0;
+ulong srv_encrypt_tables;
/** No of key rotation threads requested */
-UNIV_INTERN uint srv_n_fil_crypt_threads = 0;
+uint srv_n_fil_crypt_threads;
/** No of key rotation threads started */
-UNIV_INTERN uint srv_n_fil_crypt_threads_started = 0;
+uint srv_n_fil_crypt_threads_started;
/** At this age or older a space/page will be rotated */
-UNIV_INTERN uint srv_fil_crypt_rotate_key_age;
+uint srv_fil_crypt_rotate_key_age;
/** Whether the encryption plugin does key rotation */
-static bool srv_encrypt_rotate;
+Atomic_relaxed<bool> srv_encrypt_rotate;
-/** Event to signal FROM the key rotation threads. */
-static os_event_t fil_crypt_event;
+/** Condition variable for srv_n_fil_crypt_threads_started */
+static pthread_cond_t fil_crypt_cond;
-/** Event to signal TO the key rotation threads. */
-UNIV_INTERN os_event_t fil_crypt_threads_event;
+/** Condition variable to signal the key rotation threads */
+static pthread_cond_t fil_crypt_threads_cond;
-/** Event for waking up threads throttle. */
-static os_event_t fil_crypt_throttle_sleep_event;
+/** Condition variable for interrupting sleeptime_ms sleep at the end
+of fil_crypt_rotate_page() */
+static pthread_cond_t fil_crypt_throttle_sleep_cond;
-/** Mutex for key rotation threads. */
-UNIV_INTERN ib_mutex_t fil_crypt_threads_mutex;
+/** Mutex for key rotation threads. Acquired before fil_system.mutex! */
+static mysql_mutex_t fil_crypt_threads_mutex;
/** Variable ensuring only 1 thread at time does initial conversion */
-static bool fil_crypt_start_converting = false;
+static bool fil_crypt_start_converting;
/** Variables for throttling */
-UNIV_INTERN uint srv_n_fil_crypt_iops = 100; // 10ms per iop
-static uint srv_alloc_time = 3; // allocate iops for 3s at a time
-static uint n_fil_crypt_iops_allocated = 0;
+uint srv_n_fil_crypt_iops; // 10ms per iop
+static constexpr uint srv_alloc_time = 3; // allocate iops for 3s at a time
+static uint n_fil_crypt_iops_allocated;
#define DEBUG_KEYROTATION_THROTTLING 0
/** Statistics variables */
static fil_crypt_stat_t crypt_stat;
-static ib_mutex_t crypt_stat_mutex;
+static mysql_mutex_t crypt_stat_mutex;
+
+/** Wake up the encryption threads */
+void fil_crypt_threads_signal(bool broadcast)
+{
+ mysql_mutex_lock(&fil_crypt_threads_mutex);
+ if (broadcast)
+ pthread_cond_broadcast(&fil_crypt_threads_cond);
+ else
+ pthread_cond_signal(&fil_crypt_threads_cond);
+ mysql_mutex_unlock(&fil_crypt_threads_mutex);
+}
/***********************************************************************
Check if a key needs rotation given a key_state
@@ -101,24 +113,18 @@ fil_crypt_needs_rotation(
/*********************************************************************
Init space crypt */
-UNIV_INTERN
-void
-fil_space_crypt_init()
+void fil_space_crypt_init()
{
- fil_crypt_throttle_sleep_event = os_event_create(0);
-
- mutex_create(LATCH_ID_FIL_CRYPT_STAT_MUTEX, &crypt_stat_mutex);
- memset(&crypt_stat, 0, sizeof(crypt_stat));
+ pthread_cond_init(&fil_crypt_throttle_sleep_cond, nullptr);
+ mysql_mutex_init(0, &crypt_stat_mutex, nullptr);
}
/*********************************************************************
Cleanup space crypt */
-UNIV_INTERN
-void
-fil_space_crypt_cleanup()
+void fil_space_crypt_cleanup()
{
- os_event_destroy(fil_crypt_throttle_sleep_event);
- mutex_free(&crypt_stat_mutex);
+ pthread_cond_destroy(&fil_crypt_throttle_sleep_cond);
+ mysql_mutex_destroy(&crypt_stat_mutex);
}
/**
@@ -165,11 +171,8 @@ fil_crypt_get_latest_key_version(
crypt_data->min_key_version,
key_version,
srv_fil_crypt_rotate_key_age)) {
- /* Below event seen as NULL-pointer at startup
- when new database was created and we create a
- checkpoint. Only seen when debugging. */
if (fil_crypt_threads_inited) {
- os_event_set(fil_crypt_threads_event);
+ fil_crypt_threads_signal();
}
}
}
@@ -189,9 +192,9 @@ crypt_data_scheme_locker(
static_cast<fil_space_crypt_t*>(scheme);
if (exit) {
- mutex_exit(&crypt_data->mutex);
+ mysql_mutex_unlock(&crypt_data->mutex);
} else {
- mutex_enter(&crypt_data->mutex);
+ mysql_mutex_lock(&crypt_data->mutex);
}
}
@@ -234,7 +237,6 @@ Create a fil_space_crypt_t object
@param[in] key_id Encryption key id
@return crypt object */
-UNIV_INTERN
fil_space_crypt_t*
fil_space_create_crypt_data(
fil_encryption_t encrypt_mode,
@@ -247,13 +249,13 @@ fil_space_create_crypt_data(
Merge fil_space_crypt_t object
@param[in,out] dst Destination cryp data
@param[in] src Source crypt data */
-UNIV_INTERN
+static
void
fil_space_merge_crypt_data(
fil_space_crypt_t* dst,
const fil_space_crypt_t* src)
{
- mutex_enter(&dst->mutex);
+ mysql_mutex_lock(&dst->mutex);
/* validate that they are mergeable */
ut_a(src->type == CRYPT_SCHEME_UNENCRYPTED ||
@@ -267,7 +269,7 @@ fil_space_merge_crypt_data(
dst->min_key_version = src->min_key_version;
dst->keyserver_requests += src->keyserver_requests;
- mutex_exit(&dst->mutex);
+ mysql_mutex_unlock(&dst->mutex);
}
/** Initialize encryption parameters from a tablespace header page.
@@ -321,18 +323,15 @@ fil_space_crypt_t* fil_space_read_crypt_data(ulint zip_size, const byte* page)
/******************************************************************
Free a crypt data object
@param[in,out] crypt_data crypt data to be freed */
-UNIV_INTERN
-void
-fil_space_destroy_crypt_data(
- fil_space_crypt_t **crypt_data)
+void fil_space_destroy_crypt_data(fil_space_crypt_t **crypt_data)
{
if (crypt_data != NULL && (*crypt_data) != NULL) {
fil_space_crypt_t* c;
if (UNIV_LIKELY(fil_crypt_threads_inited)) {
- mutex_enter(&fil_crypt_threads_mutex);
+ mysql_mutex_lock(&fil_crypt_threads_mutex);
c = *crypt_data;
*crypt_data = NULL;
- mutex_exit(&fil_crypt_threads_mutex);
+ mysql_mutex_unlock(&fil_crypt_threads_mutex);
} else {
ut_ad(srv_read_only_mode || !srv_was_started);
c = *crypt_data;
@@ -360,7 +359,7 @@ void fil_crypt_parse(fil_space_t* space, const byte* data)
static_cast<fil_encryption_t>
(data[10 + MY_AES_BLOCK_SIZE]));
memcpy(crypt_data->iv, data + 2, MY_AES_BLOCK_SIZE);
- mutex_enter(&fil_system.mutex);
+ mysql_mutex_lock(&fil_system.mutex);
if (space->crypt_data) {
fil_space_merge_crypt_data(space->crypt_data,
crypt_data);
@@ -369,37 +368,10 @@ void fil_crypt_parse(fil_space_t* space, const byte* data)
} else {
space->crypt_data = crypt_data;
}
- mutex_exit(&fil_system.mutex);
+ mysql_mutex_unlock(&fil_system.mutex);
}
}
-/** Fill crypt data information to the give page.
-It should be called during ibd file creation.
-@param[in] flags tablespace flags
-@param[in,out] page first page of the tablespace */
-void
-fil_space_crypt_t::fill_page0(
- ulint flags,
- byte* page)
-{
- const uint len = sizeof(iv);
- const ulint offset = FSP_HEADER_OFFSET
- + fsp_header_get_encryption_offset(
- fil_space_t::zip_size(flags));
-
- memcpy(page + offset, CRYPT_MAGIC, MAGIC_SZ);
- mach_write_to_1(page + offset + MAGIC_SZ, type);
- mach_write_to_1(page + offset + MAGIC_SZ + 1, len);
- memcpy(page + offset + MAGIC_SZ + 2, &iv, len);
-
- mach_write_to_4(page + offset + MAGIC_SZ + 2 + len,
- min_key_version);
- mach_write_to_4(page + offset + MAGIC_SZ + 2 + len + 4,
- key_id);
- mach_write_to_1(page + offset + MAGIC_SZ + 2 + len + 8,
- encryption);
-}
-
/** Write encryption metadata to the first page.
@param[in,out] block first page of the tablespace
@param[in,out] mtr mini-transaction */
@@ -407,7 +379,7 @@ void fil_space_crypt_t::write_page0(buf_block_t* block, mtr_t* mtr)
{
const ulint offset = FSP_HEADER_OFFSET
+ fsp_header_get_encryption_offset(block->zip_size());
- byte* b = block->frame + offset;
+ byte* b = block->page.frame + offset;
mtr->memcpy<mtr_t::MAYBE_NOP>(*block, b, CRYPT_MAGIC, MAGIC_SZ);
@@ -474,7 +446,6 @@ static byte* fil_encrypt_buf_for_non_full_checksum(
const byte* src = src_frame + header_len;
byte* dst = dst_frame + header_len;
uint32 dstlen = 0;
- ib_uint32_t checksum = 0;
if (page_compressed) {
srclen = mach_read_from_2(src_frame + FIL_PAGE_DATA);
@@ -501,11 +472,12 @@ static byte* fil_encrypt_buf_for_non_full_checksum(
size - (header_len + srclen));
}
- checksum = fil_crypt_calculate_checksum(zip_size, dst_frame);
-
/* store the post-encryption checksum after the key-version */
mach_write_to_4(dst_frame + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION + 4,
- checksum);
+ zip_size
+ ? page_zip_calc_checksum(dst_frame, zip_size,
+ SRV_CHECKSUM_ALGORITHM_CRC32)
+ : buf_calc_page_crc32(dst_frame));
ut_ad(fil_space_verify_crypt_checksum(dst_frame, zip_size));
@@ -668,10 +640,7 @@ static dberr_t fil_space_decrypt_full_crc32(
lsn_t lsn = mach_read_from_8(src_frame + FIL_PAGE_LSN);
uint offset = mach_read_from_4(src_frame + FIL_PAGE_OFFSET);
- ut_a(key_version != ENCRYPTION_KEY_NOT_ENCRYPTED);
-
- ut_ad(crypt_data);
- ut_ad(crypt_data->is_encrypted());
+ ut_ad(key_version != ENCRYPTION_KEY_NOT_ENCRYPTED);
memcpy(tmp_frame, src_frame, FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION);
@@ -693,15 +662,7 @@ static dberr_t fil_space_decrypt_full_crc32(
(uint) space, offset, lsn);
if (rc != MY_AES_OK || dstlen != srclen) {
- if (rc == -1) {
- return DB_DECRYPTION_FAILED;
- }
-
- ib::fatal() << "Unable to decrypt data-block "
- << " src: " << src << "srclen: "
- << srclen << " buf: " << dst << "buflen: "
- << dstlen << " return-code: " << rc
- << " Can't continue!";
+ return DB_DECRYPTION_FAILED;
}
/* Copy only checksum part in the trailer */
@@ -735,8 +696,7 @@ static dberr_t fil_space_decrypt_for_non_full_checksum(
src_frame + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
ib_uint64_t lsn = mach_read_from_8(src_frame + FIL_PAGE_LSN);
- ut_a(key_version != ENCRYPTION_KEY_NOT_ENCRYPTED);
- ut_a(crypt_data != NULL && crypt_data->is_encrypted());
+ ut_ad(key_version != ENCRYPTION_KEY_NOT_ENCRYPTED);
/* read space & lsn */
uint header_len = FIL_PAGE_DATA;
@@ -763,18 +723,7 @@ static dberr_t fil_space_decrypt_for_non_full_checksum(
space, offset, lsn);
if (! ((rc == MY_AES_OK) && ((ulint) dstlen == srclen))) {
-
- if (rc == -1) {
- return DB_DECRYPTION_FAILED;
- }
-
- ib::fatal() << "Unable to decrypt data-block "
- << " src: " << static_cast<const void*>(src)
- << "srclen: "
- << srclen << " buf: "
- << static_cast<const void*>(dst) << "buflen: "
- << dstlen << " return-code: " << rc
- << " Can't continue!";
+ return DB_DECRYPTION_FAILED;
}
/* For compressed tables we do not store the FIL header because
@@ -800,9 +749,8 @@ static dberr_t fil_space_decrypt_for_non_full_checksum(
@param[in] physical_size page size
@param[in] fsp_flags Tablespace flags
@param[in,out] src_frame Page to decrypt
-@param[out] err DB_SUCCESS or DB_DECRYPTION_FAILED
-@return DB_SUCCESS or error */
-UNIV_INTERN
+@retval DB_SUCCESS on success
+@retval DB_DECRYPTION_FAILED on error */
dberr_t
fil_space_decrypt(
ulint space_id,
@@ -812,6 +760,10 @@ fil_space_decrypt(
ulint fsp_flags,
byte* src_frame)
{
+ if (!crypt_data || !crypt_data->is_encrypted()) {
+ return DB_DECRYPTION_FAILED;
+ }
+
if (fil_space_t::full_crc32(fsp_flags)) {
return fil_space_decrypt_full_crc32(
space_id, crypt_data, tmp_frame, src_frame);
@@ -828,8 +780,8 @@ Decrypt a page.
@param[in] tmp_frame Temporary buffer used for decrypting
@param[in,out] src_frame Page to decrypt
@return decrypted page, or original not encrypted page if decryption is
-not needed.*/
-UNIV_INTERN
+not needed.
+@retval nullptr on failure */
byte*
fil_space_decrypt(
const fil_space_t* space,
@@ -838,7 +790,6 @@ fil_space_decrypt(
{
const ulint physical_size = space->physical_size();
- ut_ad(space->crypt_data != NULL && space->crypt_data->is_encrypted());
ut_ad(space->referenced());
if (DB_SUCCESS != fil_space_decrypt(space->id, space->crypt_data,
@@ -849,25 +800,7 @@ fil_space_decrypt(
/* Copy the decrypted page back to page buffer, not
really any other options. */
- memcpy(src_frame, tmp_frame, physical_size);
-
- return src_frame;
-}
-
-/**
-Calculate post encryption checksum
-@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
-@param[in] dst_frame Block where checksum is calculated
-@return page checksum
-not needed. */
-uint32_t
-fil_crypt_calculate_checksum(ulint zip_size, const byte* dst_frame)
-{
- /* For encrypted tables we use only crc32 and strict_crc32 */
- return zip_size
- ? page_zip_calc_checksum(dst_frame, zip_size,
- SRV_CHECKSUM_ALGORITHM_CRC32)
- : buf_calc_page_crc32(dst_frame);
+ return static_cast<byte*>(memcpy(src_frame, tmp_frame, physical_size));
}
/***********************************************************************/
@@ -959,43 +892,32 @@ fil_crypt_needs_rotation(
/** Read page 0 and possible crypt data from there.
@param[in,out] space Tablespace */
-static inline
-void
-fil_crypt_read_crypt_data(fil_space_t* space)
+static inline void fil_crypt_read_crypt_data(fil_space_t *space)
{
- if (space->crypt_data || space->size || !space->get_size()) {
- /* The encryption metadata has already been read, or
- the tablespace is not encrypted and the file has been
- opened already, or the file cannot be accessed,
- likely due to a concurrent DROP
- (possibly as part of TRUNCATE or ALTER TABLE).
- FIXME: The file can become unaccessible any time
- after this check! We should really remove this
- function and instead make crypt_data an integral
- part of fil_space_t. */
- return;
- }
-
- const ulint zip_size = space->zip_size();
- mtr_t mtr;
- mtr.start();
- if (buf_block_t* block = buf_page_get_gen(page_id_t(space->id, 0),
- zip_size, RW_S_LATCH,
- nullptr,
- BUF_GET_POSSIBLY_FREED,
- __FILE__, __LINE__, &mtr)) {
- if (block->page.status == buf_page_t::FREED) {
- goto func_exit;
- }
- mutex_enter(&fil_system.mutex);
- if (!space->crypt_data && !space->is_stopping()) {
- space->crypt_data = fil_space_read_crypt_data(
- zip_size, block->frame);
- }
- mutex_exit(&fil_system.mutex);
- }
-func_exit:
- mtr.commit();
+ if (space->crypt_data || space->size || !space->get_size())
+ /* The encryption metadata has already been read, or the
+ tablespace is not encrypted and the file has been opened already,
+ or the file cannot be accessed, likely due to a concurrent DROP
+ (possibly as part of TRUNCATE or ALTER TABLE).
+
+ FIXME: The file can become unaccessible any time after this check!
+ We should really remove this function and instead make crypt_data
+ an integral part of fil_space_t. */
+ return;
+
+ const ulint zip_size= space->zip_size();
+ mtr_t mtr;
+ mtr.start();
+ if (buf_block_t* b= buf_page_get_gen(page_id_t{space->id, 0}, zip_size,
+ RW_S_LATCH, nullptr,
+ BUF_GET_POSSIBLY_FREED, &mtr))
+ {
+ mysql_mutex_lock(&fil_system.mutex);
+ if (!space->crypt_data && !space->is_stopping())
+ space->crypt_data= fil_space_read_crypt_data(zip_size, b->page.frame);
+ mysql_mutex_unlock(&fil_system.mutex);
+ }
+ mtr.commit();
}
/** Start encrypting a space
@@ -1003,21 +925,22 @@ func_exit:
@return true if a recheck of tablespace is needed by encryption thread. */
static bool fil_crypt_start_encrypting_space(fil_space_t* space)
{
- mutex_enter(&fil_crypt_threads_mutex);
+ mysql_mutex_lock(&fil_crypt_threads_mutex);
fil_space_crypt_t *crypt_data = space->crypt_data;
/* If space is not encrypted and encryption is not enabled, then
do not continue encrypting the space. */
if (!crypt_data && !srv_encrypt_tables) {
- mutex_exit(&fil_crypt_threads_mutex);
+func_exit:
+ mysql_mutex_unlock(&fil_crypt_threads_mutex);
return false;
}
const bool recheck = fil_crypt_start_converting;
if (recheck || crypt_data || space->is_stopping()) {
- mutex_exit(&fil_crypt_threads_mutex);
+ mysql_mutex_unlock(&fil_crypt_threads_mutex);
return recheck;
}
@@ -1030,39 +953,32 @@ static bool fil_crypt_start_encrypting_space(fil_space_t* space)
crypt_data = fil_space_create_crypt_data(
FIL_ENCRYPTION_DEFAULT, FIL_DEFAULT_ENCRYPTION_KEY);
- if (crypt_data == NULL) {
- mutex_exit(&fil_crypt_threads_mutex);
- return false;
+ if (!crypt_data) {
+ goto func_exit;
}
fil_crypt_start_converting = true;
- mutex_exit(&fil_crypt_threads_mutex);
+ mysql_mutex_unlock(&fil_crypt_threads_mutex);
mtr_t mtr;
mtr.start();
/* 2 - get page 0 */
- dberr_t err = DB_SUCCESS;
if (buf_block_t* block = buf_page_get_gen(
page_id_t(space->id, 0), space->zip_size(),
- RW_X_LATCH, NULL, BUF_GET_POSSIBLY_FREED,
- __FILE__, __LINE__, &mtr, &err)) {
- if (block->page.status == buf_page_t::FREED) {
- goto abort;
- }
-
+ RW_X_LATCH, NULL, BUF_GET_POSSIBLY_FREED, &mtr)) {
crypt_data->type = CRYPT_SCHEME_1;
crypt_data->min_key_version = 0; // all pages are unencrypted
crypt_data->rotate_state.start_time = time(0);
crypt_data->rotate_state.starting = true;
crypt_data->rotate_state.active_threads = 1;
- mutex_enter(&fil_system.mutex);
+ mysql_mutex_lock(&fil_system.mutex);
const bool stopping = space->is_stopping();
if (!stopping) {
space->crypt_data = crypt_data;
}
- mutex_exit(&fil_system.mutex);
+ mysql_mutex_unlock(&fil_system.mutex);
if (stopping) {
goto abort;
@@ -1078,25 +994,25 @@ static bool fil_crypt_start_encrypting_space(fil_space_t* space)
while (buf_flush_list_space(space));
/* 5 - publish crypt data */
- mutex_enter(&fil_crypt_threads_mutex);
- mutex_enter(&crypt_data->mutex);
+ mysql_mutex_lock(&fil_crypt_threads_mutex);
+ mysql_mutex_lock(&crypt_data->mutex);
crypt_data->type = CRYPT_SCHEME_1;
ut_a(crypt_data->rotate_state.active_threads == 1);
crypt_data->rotate_state.active_threads = 0;
crypt_data->rotate_state.starting = false;
fil_crypt_start_converting = false;
- mutex_exit(&crypt_data->mutex);
- mutex_exit(&fil_crypt_threads_mutex);
+ mysql_mutex_unlock(&fil_crypt_threads_mutex);
+ mysql_mutex_unlock(&crypt_data->mutex);
return false;
}
abort:
mtr.commit();
- mutex_enter(&fil_crypt_threads_mutex);
+ mysql_mutex_lock(&fil_crypt_threads_mutex);
fil_crypt_start_converting = false;
- mutex_exit(&fil_crypt_threads_mutex);
+ mysql_mutex_unlock(&fil_crypt_threads_mutex);
crypt_data->~fil_space_crypt_t();
ut_free(crypt_data);
@@ -1105,30 +1021,27 @@ abort:
/** State of a rotation thread */
struct rotate_thread_t {
- explicit rotate_thread_t(uint no) {
- memset(this, 0, sizeof(* this));
- thread_no = no;
- first = true;
- estimated_max_iops = 20;
- }
-
- uint thread_no;
- bool first; /*!< is position before first space */
- fil_space_t* space; /*!< current space or NULL */
- uint32_t offset; /*!< current page number */
- ulint batch; /*!< #pages to rotate */
- uint min_key_version_found;/*!< min key version found but not rotated */
- lsn_t end_lsn; /*!< max lsn when rotating this space */
-
- uint estimated_max_iops; /*!< estimation of max iops */
- uint allocated_iops; /*!< allocated iops */
- ulint cnt_waited; /*!< #times waited during this slot */
- uintmax_t sum_waited_us; /*!< wait time during this slot */
+ explicit rotate_thread_t(uint no) : thread_no(no) {}
+
+ uint thread_no;
+ bool first = true; /*!< is position before first space */
+ space_list_t::iterator space
+ = fil_system.space_list.end();/*!< current space or .end() */
+ uint32_t offset = 0; /*!< current page number */
+ ulint batch = 0; /*!< #pages to rotate */
+ uint min_key_version_found = 0; /*!< min key version found but not rotated */
+ lsn_t end_lsn = 0; /*!< max lsn when rotating this space */
+
+ uint estimated_max_iops = 20;/*!< estimation of max iops */
+ uint allocated_iops = 0; /*!< allocated iops */
+ ulint cnt_waited = 0; /*!< #times waited during this slot */
+ uintmax_t sum_waited_us = 0; /*!< wait time during this slot */
fil_crypt_stat_t crypt_stat; // statistics
/** @return whether this thread should terminate */
bool should_shutdown() const {
+ mysql_mutex_assert_owner(&fil_crypt_threads_mutex);
switch (srv_shutdown_state) {
case SRV_SHUTDOWN_NONE:
return thread_no >= srv_n_fil_crypt_threads;
@@ -1156,19 +1069,19 @@ static bool fil_crypt_must_remove(const fil_space_t &space)
{
ut_ad(space.purpose == FIL_TYPE_TABLESPACE);
fil_space_crypt_t *crypt_data = space.crypt_data;
- ut_ad(mutex_own(&fil_system.mutex));
+ mysql_mutex_assert_owner(&fil_system.mutex);
const ulong encrypt_tables= srv_encrypt_tables;
if (!crypt_data)
return !encrypt_tables;
if (!crypt_data->is_key_found())
return true;
- mutex_enter(&crypt_data->mutex);
+ mysql_mutex_lock(&crypt_data->mutex);
const bool remove= (space.is_stopping() || crypt_data->not_encrypted()) &&
(!crypt_data->rotate_state.flushing &&
!encrypt_tables == !!crypt_data->min_key_version &&
!crypt_data->rotate_state.active_threads);
- mutex_exit(&crypt_data->mutex);
+ mysql_mutex_unlock(&crypt_data->mutex);
return remove;
}
@@ -1185,14 +1098,12 @@ fil_crypt_space_needs_rotation(
key_state_t* key_state,
bool* recheck)
{
- fil_space_t* space = state->space;
+ mysql_mutex_assert_not_owner(&fil_crypt_threads_mutex);
- /* Make sure that tablespace is normal tablespace */
- if (space->purpose != FIL_TYPE_TABLESPACE) {
- return false;
- }
+ fil_space_t* space = &*state->space;
ut_ad(space->referenced());
+ ut_ad(space->purpose == FIL_TYPE_TABLESPACE);
fil_space_crypt_t *crypt_data = space->crypt_data;
@@ -1218,7 +1129,8 @@ fil_crypt_space_needs_rotation(
}
bool need_key_rotation = false;
- mutex_enter(&crypt_data->mutex);
+
+ mysql_mutex_lock(&crypt_data->mutex);
do {
/* prevent threads from starting to rotate space */
@@ -1254,7 +1166,7 @@ fil_crypt_space_needs_rotation(
key_state->rotate_key_age);
} while (0);
- mutex_exit(&crypt_data->mutex);
+ mysql_mutex_unlock(&crypt_data->mutex);
return need_key_rotation;
}
@@ -1265,7 +1177,7 @@ static void
fil_crypt_update_total_stat(
rotate_thread_t *state)
{
- mutex_enter(&crypt_stat_mutex);
+ mysql_mutex_lock(&crypt_stat_mutex);
crypt_stat.pages_read_from_cache +=
state->crypt_stat.pages_read_from_cache;
crypt_stat.pages_read_from_disk +=
@@ -1276,10 +1188,13 @@ fil_crypt_update_total_stat(
crypt_stat.estimated_iops -= state->crypt_stat.estimated_iops;
// add new estimate
crypt_stat.estimated_iops += state->estimated_max_iops;
- mutex_exit(&crypt_stat_mutex);
+ mysql_mutex_unlock(&crypt_stat_mutex);
// make new estimate "current" estimate
- memset(&state->crypt_stat, 0, sizeof(state->crypt_stat));
+ state->crypt_stat.pages_read_from_cache = 0;
+ state->crypt_stat.pages_read_from_disk = 0;
+ state->crypt_stat.pages_modified = 0;
+ state->crypt_stat.pages_flushed = 0;
// record our old (current) estimate
state->crypt_stat.estimated_iops = state->estimated_max_iops;
}
@@ -1289,11 +1204,9 @@ Allocate iops to thread from global setting,
used before starting to rotate a space.
@param[in,out] state Rotation state
@return true if allocation succeeded, false if failed */
-static
-bool
-fil_crypt_alloc_iops(
- rotate_thread_t *state)
+static bool fil_crypt_alloc_iops(rotate_thread_t *state)
{
+ mysql_mutex_assert_owner(&fil_crypt_threads_mutex);
ut_ad(state->allocated_iops == 0);
/* We have not yet selected the space to rotate, thus
@@ -1301,11 +1214,11 @@ fil_crypt_alloc_iops(
its status yet. */
uint max_iops = state->estimated_max_iops;
- mutex_enter(&fil_crypt_threads_mutex);
if (n_fil_crypt_iops_allocated >= srv_n_fil_crypt_iops) {
- /* this can happen when user decreases srv_fil_crypt_iops */
- mutex_exit(&fil_crypt_threads_mutex);
+wait:
+ my_cond_wait(&fil_crypt_threads_cond,
+ &fil_crypt_threads_mutex.m_mutex);
return false;
}
@@ -1315,22 +1228,21 @@ fil_crypt_alloc_iops(
alloc = max_iops;
}
+ if (!alloc) {
+ goto wait;
+ }
+
n_fil_crypt_iops_allocated += alloc;
- mutex_exit(&fil_crypt_threads_mutex);
state->allocated_iops = alloc;
-
- return alloc > 0;
+ return true;
}
-/***********************************************************************
-Reallocate iops to thread,
-used when inside a space
-@param[in,out] state Rotation state */
-static
-void
-fil_crypt_realloc_iops(
- rotate_thread_t *state)
+/**
+Reallocate iops to thread when processing a tablespace
+@param[in,out] state Rotation state
+@return whether the thread should continue running */
+static bool fil_crypt_realloc_iops(rotate_thread_t *state)
{
ut_a(state->allocated_iops > 0);
@@ -1350,7 +1262,8 @@ fil_crypt_realloc_iops(
state->estimated_max_iops,
1000000 / avg_wait_time_us));
- state->estimated_max_iops = uint(1000000 / avg_wait_time_us);
+ state->estimated_max_iops = std::max(
+ 1U, uint(1000000 / avg_wait_time_us));
state->cnt_waited = 0;
state->sum_waited_us = 0;
} else {
@@ -1362,60 +1275,42 @@ fil_crypt_realloc_iops(
/ (state->batch ? state->batch : 1)));
}
- if (state->estimated_max_iops <= state->allocated_iops) {
- /* return extra iops */
- uint extra = state->allocated_iops - state->estimated_max_iops;
+ ut_ad(state->estimated_max_iops);
- if (extra > 0) {
- mutex_enter(&fil_crypt_threads_mutex);
- if (n_fil_crypt_iops_allocated < extra) {
- /* unknown bug!
- * crash in debug
- * keep n_fil_crypt_iops_allocated unchanged
- * in release */
- ut_ad(0);
- extra = 0;
- }
- n_fil_crypt_iops_allocated -= extra;
- state->allocated_iops -= extra;
-
- if (state->allocated_iops == 0) {
- /* no matter how slow io system seems to be
- * never decrease allocated_iops to 0... */
- state->allocated_iops ++;
- n_fil_crypt_iops_allocated ++;
- }
-
- os_event_set(fil_crypt_threads_event);
- mutex_exit(&fil_crypt_threads_mutex);
- }
- } else {
- /* see if there are more to get */
- mutex_enter(&fil_crypt_threads_mutex);
- if (n_fil_crypt_iops_allocated < srv_n_fil_crypt_iops) {
- /* there are extra iops free */
- uint extra = srv_n_fil_crypt_iops -
- n_fil_crypt_iops_allocated;
- if (state->allocated_iops + extra >
- state->estimated_max_iops) {
- /* but don't alloc more than our max */
- extra = state->estimated_max_iops -
- state->allocated_iops;
- }
- n_fil_crypt_iops_allocated += extra;
- state->allocated_iops += extra;
+ mysql_mutex_lock(&fil_crypt_threads_mutex);
- DBUG_PRINT("ib_crypt",
- ("thr_no: %u increased iops from %u to %u.",
- state->thread_no,
- state->allocated_iops - extra,
- state->allocated_iops));
+ if (state->should_shutdown()) {
+ mysql_mutex_unlock(&fil_crypt_threads_mutex);
+ return false;
+ }
+ if (state->allocated_iops > state->estimated_max_iops) {
+ /* release iops */
+ uint extra = state->allocated_iops - state->estimated_max_iops;
+ state->allocated_iops = state->estimated_max_iops;
+ ut_ad(n_fil_crypt_iops_allocated >= extra);
+ n_fil_crypt_iops_allocated -= extra;
+ pthread_cond_broadcast(&fil_crypt_threads_cond);
+ } else if (srv_n_fil_crypt_iops > n_fil_crypt_iops_allocated) {
+ /* there are extra iops free */
+ uint add = srv_n_fil_crypt_iops - n_fil_crypt_iops_allocated;
+ if (state->allocated_iops + add > state->estimated_max_iops) {
+ /* but don't alloc more than our max */
+ add= state->estimated_max_iops - state->allocated_iops;
}
- mutex_exit(&fil_crypt_threads_mutex);
+ n_fil_crypt_iops_allocated += add;
+ state->allocated_iops += add;
+
+ DBUG_PRINT("ib_crypt",
+ ("thr_no: %u increased iops from %u to %u.",
+ state->thread_no,
+ state->allocated_iops - add,
+ state->allocated_iops));
}
fil_crypt_update_total_stat(state);
+ mysql_mutex_unlock(&fil_crypt_threads_mutex);
+ return true;
}
/** Release excess allocated iops
@@ -1423,34 +1318,25 @@ fil_crypt_realloc_iops(
@param wake whether to wake up other threads */
static void fil_crypt_return_iops(rotate_thread_t *state, bool wake= true)
{
- if (state->allocated_iops > 0) {
- uint iops = state->allocated_iops;
- mutex_enter(&fil_crypt_threads_mutex);
- if (n_fil_crypt_iops_allocated < iops) {
- /* unknown bug!
- * crash in debug
- * keep n_fil_crypt_iops_allocated unchanged
- * in release */
- ut_ad(0);
- iops = 0;
- }
+ mysql_mutex_assert_owner(&fil_crypt_threads_mutex);
- n_fil_crypt_iops_allocated -= iops;
- state->allocated_iops = 0;
- if (wake) {
- os_event_set(fil_crypt_threads_event);
- }
- mutex_exit(&fil_crypt_threads_mutex);
- }
+ if (uint iops= state->allocated_iops)
+ {
+ ut_ad(n_fil_crypt_iops_allocated >= iops);
+ n_fil_crypt_iops_allocated-= iops;
+ state->allocated_iops= 0;
+ if (wake)
+ pthread_cond_broadcast(&fil_crypt_threads_cond);
+ }
- fil_crypt_update_total_stat(state);
+ fil_crypt_update_total_stat(state);
}
/** Acquire a tablespace reference.
@return whether a tablespace reference was successfully acquired */
inline bool fil_space_t::acquire_if_not_stopped()
{
- ut_ad(mutex_own(&fil_system.mutex));
+ mysql_mutex_assert_owner(&fil_system.mutex);
const uint32_t n= acquire_low();
if (UNIV_LIKELY(!(n & (STOPPING | CLOSING))))
return true;
@@ -1476,14 +1362,12 @@ inline fil_space_t *fil_system_t::default_encrypt_next(fil_space_t *space,
bool recheck,
bool encrypt)
{
- ut_ad(mutex_own(&mutex));
+ mysql_mutex_assert_owner(&mutex);
- sized_ilist<fil_space_t, rotation_list_tag_t>::iterator it=
- space && space->is_in_default_encrypt
- ? space
+ auto it= space && space->is_in_default_encrypt
+ ? sized_ilist<fil_space_t, rotation_list_tag_t>::iterator(space)
: default_encrypt_tables.begin();
- const sized_ilist<fil_space_t, rotation_list_tag_t>::iterator end=
- default_encrypt_tables.end();
+ const auto end= default_encrypt_tables.end();
if (space)
{
@@ -1541,26 +1425,33 @@ encryption parameters were changed
@param encrypt expected state of innodb_encrypt_tables
@return the next tablespace
@retval fil_system.temp_space if there is no work to do
-@retval nullptr upon reaching the end of the iteration */
-inline fil_space_t *fil_space_t::next(fil_space_t *space, bool recheck,
- bool encrypt)
+@retval end() upon reaching the end of the iteration */
+space_list_t::iterator fil_space_t::next(space_list_t::iterator space,
+ bool recheck, bool encrypt)
{
- mutex_enter(&fil_system.mutex);
+ mysql_mutex_lock(&fil_system.mutex);
if (fil_crypt_must_default_encrypt())
- space= fil_system.default_encrypt_next(space, recheck, encrypt);
+ {
+ fil_space_t *next_space=
+ fil_system.default_encrypt_next(space == fil_system.space_list.end()
+ ? nullptr : &*space, recheck, encrypt);
+ space= next_space
+ ? space_list_t::iterator(next_space)
+ : fil_system.space_list.end();
+ }
else
{
- if (!space)
- space= UT_LIST_GET_FIRST(fil_system.space_list);
+ if (space == fil_system.space_list.end())
+ space= fil_system.space_list.begin();
else
{
/* Move on to the next fil_space_t */
space->release();
- space= UT_LIST_GET_NEXT(space_list, space);
+ ++space;
}
- for (; space; space= UT_LIST_GET_NEXT(space_list, space))
+ for (; space != fil_system.space_list.end(); ++space)
{
if (space->purpose != FIL_TYPE_TABLESPACE)
continue;
@@ -1572,7 +1463,7 @@ inline fil_space_t *fil_space_t::next(fil_space_t *space, bool recheck,
}
}
- mutex_exit(&fil_system.mutex);
+ mysql_mutex_unlock(&fil_system.mutex);
return space;
}
@@ -1580,62 +1471,56 @@ inline fil_space_t *fil_space_t::next(fil_space_t *space, bool recheck,
@param[in,out] key_state Key state
@param[in,out] state Rotation state
@param[in,out] recheck recheck of the tablespace is needed or
- still encryption thread does write page 0 */
+ still encryption thread does write page 0
+@return whether the thread should keep running */
static bool fil_crypt_find_space_to_rotate(
key_state_t* key_state,
rotate_thread_t* state,
bool* recheck)
{
/* we need iops to start rotating */
- while (!state->should_shutdown() && !fil_crypt_alloc_iops(state)) {
- if (state->space && state->space->is_stopping()) {
- state->space->release();
- state->space = NULL;
- }
-
- os_event_reset(fil_crypt_threads_event);
- os_event_wait_time(fil_crypt_threads_event, 100000);
- }
-
- if (state->should_shutdown()) {
- if (state->space) {
- state->space->release();
- state->space = NULL;
+ do {
+ if (state->should_shutdown()) {
+ if (state->space != fil_system.space_list.end()) {
+ state->space->release();
+ state->space = fil_system.space_list.end();
+ }
+ return false;
}
- return false;
- }
+ } while (!fil_crypt_alloc_iops(state));
if (state->first) {
state->first = false;
- if (state->space) {
+ if (state->space != fil_system.space_list.end()) {
state->space->release();
}
- state->space = NULL;
+ state->space = fil_system.space_list.end();
}
- bool wake;
- for (;;) {
- state->space = fil_space_t::next(state->space, *recheck,
- key_state->key_version != 0);
- wake = state->should_shutdown();
+ state->space = fil_space_t::next(state->space, *recheck,
+ key_state->key_version != 0);
- if (state->space == fil_system.temp_space) {
+ bool wake = true;
+ while (state->space != fil_system.space_list.end()) {
+ if (state->space
+ == space_list_t::iterator(fil_system.temp_space)) {
+ wake = false;
goto done;
- } else if (wake) {
- break;
- } else {
- wake = true;
}
- if (!state->space) {
+ if (state->should_shutdown()) {
+ state->space->release();
+done:
+ state->space = fil_system.space_list.end();
break;
}
+ mysql_mutex_unlock(&fil_crypt_threads_mutex);
/* If there is no crypt data and we have not yet read
page 0 for this tablespace, we need to read it before
we can continue. */
if (!state->space->crypt_data) {
- fil_crypt_read_crypt_data(state->space);
+ fil_crypt_read_crypt_data(&*state->space);
}
if (fil_crypt_space_needs_rotation(state, key_state, recheck)) {
@@ -1643,21 +1528,18 @@ static bool fil_crypt_find_space_to_rotate(
/* init state->min_key_version_found before
* starting on a space */
state->min_key_version_found = key_state->key_version;
+ mysql_mutex_lock(&fil_crypt_threads_mutex);
return true;
}
- }
- if (state->space) {
- state->space->release();
-done:
- state->space = NULL;
+ state->space = fil_space_t::next(state->space, *recheck,
+ key_state->key_version != 0);
+ mysql_mutex_lock(&fil_crypt_threads_mutex);
}
/* no work to do; release our allocation of I/O capacity */
fil_crypt_return_iops(state, wake);
-
- return false;
-
+ return true;
}
/***********************************************************************
@@ -1673,7 +1555,7 @@ fil_crypt_start_rotate_space(
fil_space_crypt_t *crypt_data = state->space->crypt_data;
ut_ad(crypt_data);
- mutex_enter(&crypt_data->mutex);
+ mysql_mutex_lock(&crypt_data->mutex);
ut_ad(key_state->key_id == crypt_data->key_id);
if (crypt_data->rotate_state.active_threads == 0) {
@@ -1706,7 +1588,7 @@ fil_crypt_start_rotate_space(
state->min_key_version_found =
crypt_data->rotate_state.min_key_version_found;
- mutex_exit(&crypt_data->mutex);
+ mysql_mutex_unlock(&crypt_data->mutex);
}
/***********************************************************************
@@ -1721,18 +1603,19 @@ fil_crypt_find_page_to_rotate(
rotate_thread_t* state)
{
ulint batch = srv_alloc_time * state->allocated_iops;
- fil_space_t* space = state->space;
- ut_ad(!space || space->referenced());
+ ut_ad(state->space == fil_system.space_list.end()
+ || state->space->referenced());
/* If space is marked to be dropped stop rotation. */
- if (!space || space->is_stopping()) {
+ if (state->space == fil_system.space_list.end()
+ || state->space->is_stopping()) {
return false;
}
- fil_space_crypt_t *crypt_data = space->crypt_data;
+ fil_space_crypt_t *crypt_data = state->space->crypt_data;
- mutex_enter(&crypt_data->mutex);
+ mysql_mutex_lock(&crypt_data->mutex);
ut_ad(key_state->key_id == crypt_data->key_id);
bool found = crypt_data->rotate_state.max_offset >=
@@ -1751,34 +1634,26 @@ fil_crypt_find_page_to_rotate(
}
crypt_data->rotate_state.next_offset += uint32_t(batch);
- mutex_exit(&crypt_data->mutex);
+ mysql_mutex_unlock(&crypt_data->mutex);
return found;
}
-#define fil_crypt_get_page_throttle(state,offset,mtr,sleeptime_ms) \
- fil_crypt_get_page_throttle_func(state, offset, mtr, \
- sleeptime_ms, __FILE__, __LINE__)
-
/***********************************************************************
Get a page and compute sleep time
@param[in,out] state Rotation state
@param[in] offset Page offset
@param[in,out] mtr Minitransaction
@param[out] sleeptime_ms Sleep time
-@param[in] file File where called
-@param[in] line Line where called
@return page or NULL*/
static
buf_block_t*
-fil_crypt_get_page_throttle_func(
+fil_crypt_get_page_throttle(
rotate_thread_t* state,
uint32_t offset,
mtr_t* mtr,
- ulint* sleeptime_ms,
- const char* file,
- unsigned line)
+ ulint* sleeptime_ms)
{
- fil_space_t* space = state->space;
+ fil_space_t* space = &*state->space;
const ulint zip_size = space->zip_size();
const page_id_t page_id(space->id, offset);
ut_ad(space->referenced());
@@ -1789,11 +1664,9 @@ fil_crypt_get_page_throttle_func(
return NULL;
}
- dberr_t err = DB_SUCCESS;
buf_block_t* block = buf_page_get_gen(page_id, zip_size, RW_X_LATCH,
NULL,
- BUF_PEEK_IF_IN_POOL, file, line,
- mtr, &err);
+ BUF_PEEK_IF_IN_POOL, mtr);
if (block != NULL) {
/* page was in buffer pool */
state->crypt_stat.pages_read_from_cache++;
@@ -1804,7 +1677,9 @@ fil_crypt_get_page_throttle_func(
return NULL;
}
- if (fseg_page_is_free(space, state->offset)) {
+ if (offset % (zip_size ? zip_size : srv_page_size)
+ && DB_SUCCESS_LOCKED_REC
+ != fseg_page_is_allocated(space, offset)) {
/* page is already freed */
return NULL;
}
@@ -1814,8 +1689,7 @@ fil_crypt_get_page_throttle_func(
const ulonglong start = my_interval_timer();
block = buf_page_get_gen(page_id, zip_size,
RW_X_LATCH,
- NULL, BUF_GET_POSSIBLY_FREED,
- file, line, mtr, &err);
+ NULL, BUF_GET_POSSIBLY_FREED, mtr);
const ulonglong end = my_interval_timer();
state->cnt_waited++;
@@ -1851,7 +1725,7 @@ fil_crypt_rotate_page(
const key_state_t* key_state,
rotate_thread_t* state)
{
- fil_space_t*space = state->space;
+ fil_space_t *space = &*state->space;
ulint space_id = space->id;
uint32_t offset = state->offset;
ulint sleeptime_ms = 0;
@@ -1883,10 +1757,7 @@ fil_crypt_rotate_page(
const lsn_t block_lsn = mach_read_from_8(FIL_PAGE_LSN + frame);
uint kv = buf_page_get_key_version(frame, space->flags);
- if (block->page.status == buf_page_t::FREED) {
- /* Do not modify freed pages to avoid an assertion
- failure on recovery.*/
- } else if (block->page.oldest_modification() > 1) {
+ if (block->page.oldest_modification() > 1) {
/* Do not unnecessarily touch pages that are
already dirty. */
} else if (space->is_stopping()) {
@@ -1898,11 +1769,11 @@ fil_crypt_rotate_page(
allocated. Because key rotation is accessing
pages in a pattern that is unlike the normal
B-tree and undo log access pattern, we cannot
- invoke fseg_page_is_free() here, because that
+ invoke fseg_page_is_allocated() here, because that
could result in a deadlock. If we invoked
- fseg_page_is_free() and released the
+ fseg_page_is_allocated() and released the
tablespace latch before acquiring block->lock,
- then the fseg_page_is_free() information
+ then the fseg_page_is_allocated() information
could be stale already. */
/* If the data file was originally created
@@ -1960,15 +1831,17 @@ fil_crypt_rotate_page(
} else {
/* If block read failed mtr memo and log should be empty. */
ut_ad(!mtr.has_modifications());
- ut_ad(!mtr.is_dirty());
ut_ad(mtr.is_empty());
mtr.commit();
}
if (sleeptime_ms) {
- os_event_reset(fil_crypt_throttle_sleep_event);
- os_event_wait_time(fil_crypt_throttle_sleep_event,
- 1000 * sleeptime_ms);
+ mysql_mutex_lock(&fil_crypt_threads_mutex);
+ timespec abstime;
+ set_timespec_nsec(abstime, 1000000ULL * sleeptime_ms);
+ my_cond_timedwait(&fil_crypt_throttle_sleep_cond,
+ &fil_crypt_threads_mutex.m_mutex, &abstime);
+ mysql_mutex_unlock(&fil_crypt_threads_mutex);
}
}
@@ -2021,7 +1894,7 @@ void
fil_crypt_flush_space(
rotate_thread_t* state)
{
- fil_space_t* space = state->space;
+ fil_space_t* space = &*state->space;
fil_space_crypt_t *crypt_data = space->crypt_data;
ut_ad(space->referenced());
@@ -2058,12 +1931,9 @@ fil_crypt_flush_space(
if (buf_block_t* block = buf_page_get_gen(
page_id_t(space->id, 0), space->zip_size(),
- RW_X_LATCH, NULL, BUF_GET_POSSIBLY_FREED,
- __FILE__, __LINE__, &mtr)) {
- if (block->page.status != buf_page_t::FREED) {
- mtr.set_named_space(space);
- crypt_data->write_page0(block, &mtr);
- }
+ RW_X_LATCH, NULL, BUF_GET_POSSIBLY_FREED, &mtr)) {
+ mtr.set_named_space(space);
+ crypt_data->write_page0(block, &mtr);
}
mtr.commit();
@@ -2079,10 +1949,10 @@ static void fil_crypt_complete_rotate_space(rotate_thread_t* state)
ut_ad(crypt_data);
ut_ad(state->space->referenced());
+ mysql_mutex_lock(&crypt_data->mutex);
+
/* Space might already be dropped */
if (!state->space->is_stopping()) {
- mutex_enter(&crypt_data->mutex);
-
/**
* Update crypt data state with state from thread
*/
@@ -2119,137 +1989,110 @@ static void fil_crypt_complete_rotate_space(rotate_thread_t* state)
crypt_data->rotate_state.flushing = true;
crypt_data->min_key_version =
crypt_data->rotate_state.min_key_version_found;
- mutex_exit(&crypt_data->mutex);
+ mysql_mutex_unlock(&crypt_data->mutex);
fil_crypt_flush_space(state);
- mutex_enter(&crypt_data->mutex);
+ mysql_mutex_lock(&crypt_data->mutex);
crypt_data->rotate_state.flushing = false;
- mutex_exit(&crypt_data->mutex);
- } else {
- mutex_exit(&crypt_data->mutex);
}
} else {
- mutex_enter(&crypt_data->mutex);
ut_a(crypt_data->rotate_state.active_threads > 0);
crypt_data->rotate_state.active_threads--;
- mutex_exit(&crypt_data->mutex);
}
+
+ mysql_mutex_unlock(&crypt_data->mutex);
}
-/*********************************************************************//**
-A thread which monitors global key state and rotates tablespaces accordingly
-@return a dummy parameter */
-extern "C" UNIV_INTERN
-os_thread_ret_t
-DECLARE_THREAD(fil_crypt_thread)(void*)
+/** A thread which monitors global key state and rotates tablespaces
+accordingly */
+static void fil_crypt_thread()
{
- mutex_enter(&fil_crypt_threads_mutex);
- uint thread_no = srv_n_fil_crypt_threads_started;
- srv_n_fil_crypt_threads_started++;
- os_event_set(fil_crypt_event); /* signal that we started */
- mutex_exit(&fil_crypt_threads_mutex);
-
- /* state of this thread */
- rotate_thread_t thr(thread_no);
+ mysql_mutex_lock(&fil_crypt_threads_mutex);
+ rotate_thread_t thr(srv_n_fil_crypt_threads_started++);
+ pthread_cond_signal(&fil_crypt_cond); /* signal that we started */
- /* if we find a space that is starting, skip over it and recheck it later */
- bool recheck = false;
-
- while (!thr.should_shutdown()) {
-
- key_state_t new_state;
-
- while (!thr.should_shutdown()) {
+ if (!thr.should_shutdown()) {
+ /* if we find a tablespace that is starting, skip over it
+ and recheck it later */
+ bool recheck = false;
+wait_for_work:
+ if (!recheck && !thr.should_shutdown()) {
/* wait for key state changes
* i.e either new key version of change or
* new rotate_key_age */
- os_event_reset(fil_crypt_threads_event);
-
- if (os_event_wait_time(fil_crypt_threads_event, 1000000) == 0) {
- break;
- }
-
- if (recheck) {
- /* check recheck here, after sleep, so
- * that we don't busy loop while when one thread is starting
- * a space*/
- break;
- }
+ my_cond_wait(&fil_crypt_threads_cond,
+ &fil_crypt_threads_mutex.m_mutex);
}
recheck = false;
thr.first = true; // restart from first tablespace
+ key_state_t new_state;
+
/* iterate all spaces searching for those needing rotation */
- while (!thr.should_shutdown() &&
- fil_crypt_find_space_to_rotate(&new_state, &thr, &recheck)) {
+ while (fil_crypt_find_space_to_rotate(&new_state, &thr,
+ &recheck)) {
+ if (thr.space == fil_system.space_list.end()) {
+ goto wait_for_work;
+ }
/* we found a space to rotate */
+ mysql_mutex_unlock(&fil_crypt_threads_mutex);
fil_crypt_start_rotate_space(&new_state, &thr);
/* iterate all pages (cooperativly with other threads) */
- while (!thr.should_shutdown() &&
- fil_crypt_find_page_to_rotate(&new_state, &thr)) {
-
- if (!thr.space->is_stopping()) {
- /* rotate a (set) of pages */
- fil_crypt_rotate_pages(&new_state, &thr);
- }
+ while (fil_crypt_find_page_to_rotate(&new_state, &thr)) {
/* If space is marked as stopping, release
space and stop rotation. */
if (thr.space->is_stopping()) {
fil_crypt_complete_rotate_space(&thr);
thr.space->release();
- thr.space = NULL;
+ thr.space = fil_system.space_list.end();
break;
}
+ fil_crypt_rotate_pages(&new_state, &thr);
/* realloc iops */
- fil_crypt_realloc_iops(&thr);
+ if (!fil_crypt_realloc_iops(&thr)) {
+ break;
+ }
}
/* complete rotation */
- if (thr.space) {
+ if (thr.space != fil_system.space_list.end()) {
fil_crypt_complete_rotate_space(&thr);
}
/* force key state refresh */
new_state.key_id = 0;
- /* return iops */
+ mysql_mutex_lock(&fil_crypt_threads_mutex);
+ /* release iops */
fil_crypt_return_iops(&thr);
}
- }
- /* return iops if shutting down */
- fil_crypt_return_iops(&thr);
-
- /* release current space if shutting down */
- if (thr.space) {
- thr.space->release();
- thr.space = NULL;
+ if (thr.space != fil_system.space_list.end()) {
+ thr.space->release();
+ thr.space = fil_system.space_list.end();
+ }
}
- mutex_enter(&fil_crypt_threads_mutex);
+ fil_crypt_return_iops(&thr);
srv_n_fil_crypt_threads_started--;
- os_event_set(fil_crypt_event); /* signal that we stopped */
- mutex_exit(&fil_crypt_threads_mutex);
+ pthread_cond_signal(&fil_crypt_cond); /* signal that we stopped */
+ mysql_mutex_unlock(&fil_crypt_threads_mutex);
- /* We count the number of threads in os_thread_exit(). A created
- thread should always use that to exit and not use return() to exit. */
-
- return os_thread_exit();
+#ifdef UNIV_PFS_THREAD
+ pfs_delete_thread();
+#endif
}
/*********************************************************************
Adjust thread count for key rotation
@param[in] new_cnt Number of threads to be used */
-UNIV_INTERN
-void
-fil_crypt_set_thread_cnt(
- const uint new_cnt)
+void fil_crypt_set_thread_cnt(const uint new_cnt)
{
if (!fil_crypt_threads_inited) {
if (srv_shutdown_state != SRV_SHUTDOWN_NONE)
@@ -2257,111 +2100,104 @@ fil_crypt_set_thread_cnt(
fil_crypt_threads_init();
}
- mutex_enter(&fil_crypt_threads_mutex);
+ mysql_mutex_lock(&fil_crypt_threads_mutex);
if (new_cnt > srv_n_fil_crypt_threads) {
uint add = new_cnt - srv_n_fil_crypt_threads;
srv_n_fil_crypt_threads = new_cnt;
for (uint i = 0; i < add; i++) {
+ std::thread thd(fil_crypt_thread);
ib::info() << "Creating #"
<< i+1 << " encryption thread id "
- << os_thread_create(fil_crypt_thread)
+ << thd.get_id()
<< " total threads " << new_cnt << ".";
+ thd.detach();
}
} else if (new_cnt < srv_n_fil_crypt_threads) {
srv_n_fil_crypt_threads = new_cnt;
- os_event_set(fil_crypt_threads_event);
}
- mutex_exit(&fil_crypt_threads_mutex);
+ pthread_cond_broadcast(&fil_crypt_threads_cond);
- while(srv_n_fil_crypt_threads_started != srv_n_fil_crypt_threads) {
- os_event_reset(fil_crypt_event);
- os_event_wait_time(fil_crypt_event, 100000);
+ while (srv_n_fil_crypt_threads_started != srv_n_fil_crypt_threads) {
+ my_cond_wait(&fil_crypt_cond,
+ &fil_crypt_threads_mutex.m_mutex);
}
- /* Send a message to encryption threads that there could be
- something to do. */
- if (srv_n_fil_crypt_threads) {
- os_event_set(fil_crypt_threads_event);
- }
+ pthread_cond_broadcast(&fil_crypt_threads_cond);
+ mysql_mutex_unlock(&fil_crypt_threads_mutex);
}
/** Initialize the tablespace default_encrypt_tables
if innodb_encryption_rotate_key_age=0. */
static void fil_crypt_default_encrypt_tables_fill()
{
- ut_ad(mutex_own(&fil_system.mutex));
-
- for (fil_space_t* space = UT_LIST_GET_FIRST(fil_system.space_list);
- space != NULL;
- space = UT_LIST_GET_NEXT(space_list, space)) {
- if (space->purpose != FIL_TYPE_TABLESPACE
- || space->is_in_default_encrypt
- || UT_LIST_GET_LEN(space->chain) == 0
- || !space->acquire_if_not_stopped()) {
+ mysql_mutex_assert_owner(&fil_system.mutex);
+
+ for (fil_space_t& space : fil_system.space_list) {
+ if (space.purpose != FIL_TYPE_TABLESPACE
+ || space.is_in_default_encrypt
+ || UT_LIST_GET_LEN(space.chain) == 0
+ || !space.acquire_if_not_stopped()) {
continue;
}
/* Ensure that crypt_data has been initialized. */
- ut_ad(space->size);
+ ut_ad(space.size);
/* Skip ENCRYPTION!=DEFAULT tablespaces. */
- if (space->crypt_data
- && !space->crypt_data->is_default_encryption()) {
+ if (space.crypt_data
+ && !space.crypt_data->is_default_encryption()) {
goto next;
}
if (srv_encrypt_tables) {
/* Skip encrypted tablespaces if
innodb_encrypt_tables!=OFF */
- if (space->crypt_data
- && space->crypt_data->min_key_version) {
+ if (space.crypt_data
+ && space.crypt_data->min_key_version) {
goto next;
}
} else {
/* Skip unencrypted tablespaces if
innodb_encrypt_tables=OFF */
- if (!space->crypt_data
- || !space->crypt_data->min_key_version) {
+ if (!space.crypt_data
+ || !space.crypt_data->min_key_version) {
goto next;
}
}
- fil_system.default_encrypt_tables.push_back(*space);
- space->is_in_default_encrypt = true;
+ fil_system.default_encrypt_tables.push_back(space);
+ space.is_in_default_encrypt = true;
next:
- space->release();
+ space.release();
}
}
/*********************************************************************
Adjust max key age
@param[in] val New max key age */
-UNIV_INTERN
-void
-fil_crypt_set_rotate_key_age(
- uint val)
+void fil_crypt_set_rotate_key_age(uint val)
{
- mutex_enter(&fil_system.mutex);
- srv_fil_crypt_rotate_key_age = val;
- if (val == 0) {
- fil_crypt_default_encrypt_tables_fill();
- }
- mutex_exit(&fil_system.mutex);
- os_event_set(fil_crypt_threads_event);
+ mysql_mutex_lock(&fil_crypt_threads_mutex);
+ mysql_mutex_lock(&fil_system.mutex);
+ srv_fil_crypt_rotate_key_age= val;
+ if (val == 0)
+ fil_crypt_default_encrypt_tables_fill();
+ mysql_mutex_unlock(&fil_system.mutex);
+ pthread_cond_broadcast(&fil_crypt_threads_cond);
+ mysql_mutex_unlock(&fil_crypt_threads_mutex);
}
/*********************************************************************
Adjust rotation iops
@param[in] val New max rotation iops */
-UNIV_INTERN
-void
-fil_crypt_set_rotation_iops(
- uint val)
+void fil_crypt_set_rotation_iops(uint val)
{
- srv_n_fil_crypt_iops = val;
- os_event_set(fil_crypt_threads_event);
+ mysql_mutex_lock(&fil_crypt_threads_mutex);
+ srv_n_fil_crypt_iops= val;
+ pthread_cond_broadcast(&fil_crypt_threads_cond);
+ mysql_mutex_unlock(&fil_crypt_threads_mutex);
}
/*********************************************************************
@@ -2369,35 +2205,31 @@ Adjust encrypt tables
@param[in] val New setting for innodb-encrypt-tables */
void fil_crypt_set_encrypt_tables(ulong val)
{
- if (!fil_crypt_threads_inited) {
- return;
- }
+ if (!fil_crypt_threads_inited)
+ return;
- mutex_enter(&fil_system.mutex);
+ mysql_mutex_lock(&fil_crypt_threads_mutex);
- srv_encrypt_tables = val;
+ mysql_mutex_lock(&fil_system.mutex);
+ srv_encrypt_tables= val;
- if (fil_crypt_must_default_encrypt()) {
- fil_crypt_default_encrypt_tables_fill();
- }
+ if (fil_crypt_must_default_encrypt())
+ fil_crypt_default_encrypt_tables_fill();
- mutex_exit(&fil_system.mutex);
+ mysql_mutex_unlock(&fil_system.mutex);
- os_event_set(fil_crypt_threads_event);
+ pthread_cond_broadcast(&fil_crypt_threads_cond);
+ mysql_mutex_unlock(&fil_crypt_threads_mutex);
}
/*********************************************************************
Init threads for key rotation */
-UNIV_INTERN
-void
-fil_crypt_threads_init()
+void fil_crypt_threads_init()
{
if (!fil_crypt_threads_inited) {
- fil_crypt_event = os_event_create(0);
- fil_crypt_threads_event = os_event_create(0);
- mutex_create(LATCH_ID_FIL_CRYPT_THREADS_MUTEX,
- &fil_crypt_threads_mutex);
-
+ pthread_cond_init(&fil_crypt_cond, nullptr);
+ pthread_cond_init(&fil_crypt_threads_cond, nullptr);
+ mysql_mutex_init(0, &fil_crypt_threads_mutex, nullptr);
uint cnt = srv_n_fil_crypt_threads;
srv_n_fil_crypt_threads = 0;
fil_crypt_threads_inited = true;
@@ -2407,27 +2239,22 @@ fil_crypt_threads_init()
/*********************************************************************
Clean up key rotation threads resources */
-UNIV_INTERN
-void
-fil_crypt_threads_cleanup()
+void fil_crypt_threads_cleanup()
{
if (!fil_crypt_threads_inited) {
return;
}
ut_a(!srv_n_fil_crypt_threads_started);
- os_event_destroy(fil_crypt_event);
- os_event_destroy(fil_crypt_threads_event);
- mutex_free(&fil_crypt_threads_mutex);
+ pthread_cond_destroy(&fil_crypt_cond);
+ pthread_cond_destroy(&fil_crypt_threads_cond);
+ mysql_mutex_destroy(&fil_crypt_threads_mutex);
fil_crypt_threads_inited = false;
}
/*********************************************************************
Wait for crypt threads to stop accessing space
@param[in] space Tablespace */
-UNIV_INTERN
-void
-fil_space_crypt_close_tablespace(
- const fil_space_t* space)
+void fil_space_crypt_close_tablespace(const fil_space_t *space)
{
fil_space_crypt_t* crypt_data = space->crypt_data;
@@ -2436,55 +2263,47 @@ fil_space_crypt_close_tablespace(
return;
}
- mutex_enter(&fil_crypt_threads_mutex);
-
time_t start = time(0);
time_t last = start;
- mutex_enter(&crypt_data->mutex);
- mutex_exit(&fil_crypt_threads_mutex);
-
- ulint cnt = crypt_data->rotate_state.active_threads;
- bool flushing = crypt_data->rotate_state.flushing;
+ mysql_mutex_lock(&crypt_data->mutex);
- while (cnt > 0 || flushing) {
- mutex_exit(&crypt_data->mutex);
- /* release dict mutex so that scrub threads can release their
- * table references */
- dict_mutex_exit_for_mysql();
+ while (crypt_data->rotate_state.active_threads
+ || crypt_data->rotate_state.flushing) {
+ mysql_mutex_unlock(&crypt_data->mutex);
/* wakeup throttle (all) sleepers */
- os_event_set(fil_crypt_throttle_sleep_event);
- os_event_set(fil_crypt_threads_event);
+ mysql_mutex_lock(&fil_crypt_threads_mutex);
+ pthread_cond_broadcast(&fil_crypt_throttle_sleep_cond);
+ pthread_cond_broadcast(&fil_crypt_threads_cond);
+ mysql_mutex_unlock(&fil_crypt_threads_mutex);
- os_thread_sleep(20000);
- dict_mutex_enter_for_mysql();
- mutex_enter(&crypt_data->mutex);
- cnt = crypt_data->rotate_state.active_threads;
- flushing = crypt_data->rotate_state.flushing;
+ std::this_thread::sleep_for(std::chrono::milliseconds(20));
time_t now = time(0);
- if (now >= last + 30) {
+ if (UNIV_UNLIKELY(now >= last + 30)) {
ib::warn() << "Waited "
<< now - start
<< " seconds to drop space: "
- << space->name << " ("
+ << space->chain.start->name << " ("
<< space->id << ") active threads "
- << cnt << "flushing="
- << flushing << ".";
+ << crypt_data->rotate_state.active_threads
+ << "flushing="
+ << crypt_data->rotate_state.flushing << ".";
last = now;
}
+
+ mysql_mutex_lock(&crypt_data->mutex);
}
- mutex_exit(&crypt_data->mutex);
+ mysql_mutex_unlock(&crypt_data->mutex);
}
/*********************************************************************
Get crypt status for a space (used by information_schema)
@param[in] space Tablespace
@param[out] status Crypt status */
-UNIV_INTERN
void
fil_space_crypt_get_status(
const fil_space_t* space,
@@ -2505,7 +2324,7 @@ fil_space_crypt_get_status(
if (fil_space_crypt_t* crypt_data = space->crypt_data) {
status->space = space->id;
- mutex_enter(&crypt_data->mutex);
+ mysql_mutex_lock(&crypt_data->mutex);
status->scheme = crypt_data->type;
status->keyserver_requests = crypt_data->keyserver_requests;
status->min_key_version = crypt_data->min_key_version;
@@ -2522,7 +2341,7 @@ fil_space_crypt_get_status(
crypt_data->rotate_state.max_offset;
}
- mutex_exit(&crypt_data->mutex);
+ mysql_mutex_unlock(&crypt_data->mutex);
if (srv_encrypt_tables || crypt_data->min_key_version) {
status->current_key_version =
@@ -2534,14 +2353,11 @@ fil_space_crypt_get_status(
/*********************************************************************
Return crypt statistics
@param[out] stat Crypt statistics */
-UNIV_INTERN
-void
-fil_crypt_total_stat(
- fil_crypt_stat_t *stat)
+void fil_crypt_total_stat(fil_crypt_stat_t *stat)
{
- mutex_enter(&crypt_stat_mutex);
+ mysql_mutex_lock(&crypt_stat_mutex);
*stat = crypt_stat;
- mutex_exit(&crypt_stat_mutex);
+ mysql_mutex_unlock(&crypt_stat_mutex);
}
#endif /* UNIV_INNOCHECKSUM */
@@ -2566,7 +2382,7 @@ bool fil_space_verify_crypt_checksum(const byte* page, ulint zip_size)
/* Compressed and encrypted pages do not have checksum. Assume not
corrupted. Page verification happens after decompression in
- buf_page_read_complete() using buf_page_is_corrupted(). */
+ buf_page_t::read_complete() using buf_page_is_corrupted(). */
if (fil_page_get_type(page) == FIL_PAGE_PAGE_COMPRESSED_ENCRYPTED) {
return true;
}
@@ -2578,46 +2394,31 @@ bool fil_space_verify_crypt_checksum(const byte* page, ulint zip_size)
/* If stored checksum matches one of the calculated checksums
page is not corrupted. */
- switch (srv_checksum_algorithm_t(srv_checksum_algorithm)) {
+#ifndef UNIV_INNOCHECKSUM
+ switch (srv_checksum_algorithm) {
case SRV_CHECKSUM_ALGORITHM_STRICT_FULL_CRC32:
case SRV_CHECKSUM_ALGORITHM_STRICT_CRC32:
+#endif /* !UNIV_INNOCHECKSUM */
if (zip_size) {
return checksum == page_zip_calc_checksum(
- page, zip_size, SRV_CHECKSUM_ALGORITHM_CRC32);
+ page, zip_size, false);
}
return checksum == buf_calc_page_crc32(page);
- case SRV_CHECKSUM_ALGORITHM_STRICT_NONE:
- /* Starting with MariaDB 10.1.25, 10.2.7, 10.3.1,
- due to MDEV-12114, fil_crypt_calculate_checksum()
- is only using CRC32 for the encrypted pages.
- Due to this, we must treat "strict_none" as "none". */
- case SRV_CHECKSUM_ALGORITHM_NONE:
- return true;
- case SRV_CHECKSUM_ALGORITHM_STRICT_INNODB:
- /* Starting with MariaDB 10.1.25, 10.2.7, 10.3.1,
- due to MDEV-12114, fil_crypt_calculate_checksum()
- is only using CRC32 for the encrypted pages.
- Due to this, we must treat "strict_innodb" as "innodb". */
- case SRV_CHECKSUM_ALGORITHM_INNODB:
- case SRV_CHECKSUM_ALGORITHM_CRC32:
- case SRV_CHECKSUM_ALGORITHM_FULL_CRC32:
+#ifndef UNIV_INNOCHECKSUM
+ default:
if (checksum == BUF_NO_CHECKSUM_MAGIC) {
return true;
}
if (zip_size) {
return checksum == page_zip_calc_checksum(
- page, zip_size,
- SRV_CHECKSUM_ALGORITHM_CRC32)
+ page, zip_size, false)
|| checksum == page_zip_calc_checksum(
- page, zip_size,
- SRV_CHECKSUM_ALGORITHM_INNODB);
+ page, zip_size, true);
}
return checksum == buf_calc_page_crc32(page)
|| checksum == buf_calc_page_new_checksum(page);
}
-
- ut_ad("unhandled innodb_checksum_algorithm" == 0);
- return false;
+#endif /* !UNIV_INNOCHECKSUM */
}
diff --git a/storage/innobase/fil/fil0fil.cc b/storage/innobase/fil/fil0fil.cc
index fd2404a009a..19ebdc8d67e 100644
--- a/storage/innobase/fil/fil0fil.cc
+++ b/storage/innobase/fil/fil0fil.cc
@@ -46,8 +46,6 @@ Created 10/25/1995 Heikki Tuuri
#include "trx0purge.h"
#include "buf0lru.h"
#include "ibuf0ibuf.h"
-#include "os0event.h"
-#include "sync0sync.h"
#include "buf0flu.h"
#include "log.h"
#ifdef __linux__
@@ -56,6 +54,12 @@ Created 10/25/1995 Heikki Tuuri
# include <dirent.h>
#endif
+ATTRIBUTE_COLD void fil_space_t::set_corrupted() const
+{
+ if (!is_stopping() && !is_corrupted.test_and_set())
+ sql_print_error("InnoDB: File '%s' is corrupted", chain.start->name);
+}
+
/** Determine if the space id is a user tablespace id or not.
@param space_id tablespace identifier
@return true if it is a user tablespace ID */
@@ -70,17 +74,16 @@ inline bool fil_is_user_tablespace_id(ulint space_id)
@return whether a file was closed */
bool fil_space_t::try_to_close(bool print_info)
{
- ut_ad(mutex_own(&fil_system.mutex));
- for (fil_space_t *space= UT_LIST_GET_FIRST(fil_system.space_list); space;
- space= UT_LIST_GET_NEXT(space_list, space))
+ mysql_mutex_assert_owner(&fil_system.mutex);
+ for (fil_space_t &space : fil_system.space_list)
{
- switch (space->purpose) {
+ switch (space.purpose) {
case FIL_TYPE_TEMPORARY:
continue;
case FIL_TYPE_IMPORT:
break;
case FIL_TYPE_TABLESPACE:
- if (!fil_is_user_tablespace_id(space->id))
+ if (!fil_is_user_tablespace_id(space.id))
continue;
}
@@ -88,7 +91,7 @@ bool fil_space_t::try_to_close(bool print_info)
fil_node_open_file_low(), newly opened files are moved to the end
of fil_system.space_list, so that they would be less likely to be
closed here. */
- fil_node_t *node= UT_LIST_GET_FIRST(space->chain);
+ fil_node_t *node= UT_LIST_GET_FIRST(space.chain);
if (!node)
/* fil_ibd_create() did not invoke fil_space_t::add() yet */
continue;
@@ -97,7 +100,13 @@ bool fil_space_t::try_to_close(bool print_info)
if (!node->is_open())
continue;
- if (const auto n= space->set_closing())
+ /* Other thread is trying to do fil_delete_tablespace()
+ concurrently for the same tablespace. So ignore this
+ tablespace and try to close the other one */
+ const auto n= space.set_closing();
+ if (n & STOPPING)
+ continue;
+ if (n & (PENDING | NEEDS_FSYNC))
{
if (!print_info)
continue;
@@ -128,22 +137,6 @@ bool fil_space_t::try_to_close(bool print_info)
return false;
}
-/** Rename a single-table tablespace.
-The tablespace must exist in the memory cache.
-@param[in] id tablespace identifier
-@param[in] old_path old file name
-@param[in] new_name new table name in the
-databasename/tablename format
-@param[in] new_path_in new file name,
-or NULL if it is located in the normal data directory
-@return true if success */
-static bool
-fil_rename_tablespace(
- ulint id,
- const char* old_path,
- const char* new_name,
- const char* new_path_in);
-
/*
IMPLEMENTATION OF THE TABLESPACE MEMORY CACHE
=============================================
@@ -192,7 +185,7 @@ to a hash table. Each tablespace and log file is given an unique 32-bit
identifier. */
/** Reference to the server data directory. Usually it is the
-current working directory ".", but in the MySQL Embedded Server Library
+current working directory ".", but in the MariaDB Embedded Server Library
it is an absolute path. */
const char* fil_path_to_mysql_datadir;
@@ -207,7 +200,7 @@ initialized. */
fil_system_t fil_system;
/** At this age or older a space/page will be rotated */
-UNIV_INTERN extern uint srv_fil_crypt_rotate_key_age;
+extern uint srv_fil_crypt_rotate_key_age;
#ifdef UNIV_DEBUG
/** Try fil_validate() every this many times */
@@ -242,12 +235,10 @@ fil_space_get_by_id(
fil_space_t* space;
ut_ad(fil_system.is_initialised());
- ut_ad(mutex_own(&fil_system.mutex));
+ mysql_mutex_assert_owner(&fil_system.mutex);
HASH_SEARCH(hash, &fil_system.spaces, id,
- fil_space_t*, space,
- ut_ad(space->magic_n == FIL_SPACE_MAGIC_N),
- space->id == id);
+ fil_space_t*, space,, space->id == id);
return(space);
}
@@ -264,9 +255,9 @@ fil_space_t*
fil_space_get(
ulint id)
{
- mutex_enter(&fil_system.mutex);
+ mysql_mutex_lock(&fil_system.mutex);
fil_space_t* space = fil_space_get_by_id(id);
- mutex_exit(&fil_system.mutex);
+ mysql_mutex_unlock(&fil_system.mutex);
return(space);
}
@@ -337,8 +328,6 @@ fil_node_t* fil_space_t::add(const char* name, pfs_os_file_t handle,
node->size = size;
- node->magic_n = FIL_NODE_MAGIC_N;
-
node->init_size = size;
node->max_size = max_pages;
@@ -346,18 +335,18 @@ fil_node_t* fil_space_t::add(const char* name, pfs_os_file_t handle,
node->atomic_write = atomic_write;
- mutex_enter(&fil_system.mutex);
+ mysql_mutex_lock(&fil_system.mutex);
this->size += size;
UT_LIST_ADD_LAST(chain, node);
if (node->is_open()) {
- n_pending.fetch_and(~CLOSING, std::memory_order_relaxed);
+ clear_closing();
if (++fil_system.n_open >= srv_max_n_open_files) {
reacquire();
try_to_close(true);
release();
}
}
- mutex_exit(&fil_system.mutex);
+ mysql_mutex_unlock(&fil_system.mutex);
return node;
}
@@ -369,7 +358,7 @@ static bool fil_node_open_file_low(fil_node_t *node)
{
ut_ad(!node->is_open());
ut_ad(node->space->is_closing());
- ut_ad(mutex_own(&fil_system.mutex));
+ mysql_mutex_assert_owner(&fil_system.mutex);
ulint type;
static_assert(((UNIV_ZIP_SIZE_MIN >> 1) << 3) == 4096, "compatibility");
switch (FSP_FLAGS_GET_ZIP_SSIZE(node->space->flags)) {
@@ -423,7 +412,7 @@ static bool fil_node_open_file_low(fil_node_t *node)
@return whether the file was successfully opened */
static bool fil_node_open_file(fil_node_t *node)
{
- ut_ad(mutex_own(&fil_system.mutex));
+ mysql_mutex_assert_owner(&fil_system.mutex);
ut_ad(!node->is_open());
ut_ad(fil_is_user_tablespace_id(node->space->id) ||
srv_operation == SRV_OPERATION_BACKUP ||
@@ -448,11 +437,11 @@ static bool fil_node_open_file(fil_node_t *node)
}
else
{
- mutex_exit(&fil_system.mutex);
- os_thread_sleep(20000);
+ mysql_mutex_unlock(&fil_system.mutex);
+ std::this_thread::sleep_for(std::chrono::milliseconds(20));
/* Flush tablespaces so that we can close modified files. */
fil_flush_file_spaces();
- mutex_enter(&fil_system.mutex);
+ mysql_mutex_lock(&fil_system.mutex);
if (node->is_open())
return true;
}
@@ -483,7 +472,7 @@ pfs_os_file_t fil_node_t::detach()
void fil_node_t::prepare_to_close_or_detach()
{
- ut_ad(mutex_own(&fil_system.mutex));
+ mysql_mutex_assert_owner(&fil_system.mutex);
ut_ad(space->is_ready_to_close() || srv_operation == SRV_OPERATION_BACKUP ||
srv_operation == SRV_OPERATION_RESTORE_DELTA);
ut_a(is_open());
@@ -498,7 +487,7 @@ void fil_node_t::prepare_to_close_or_detach()
/** Flush any writes cached by the file system. */
void fil_space_t::flush_low()
{
- ut_ad(!mutex_own(&fil_system.mutex));
+ mysql_mutex_assert_not_owner(&fil_system.mutex);
uint32_t n= 1;
while (!n_pending.compare_exchange_strong(n, n | NEEDS_FSYNC,
@@ -527,13 +516,13 @@ void fil_space_t::flush_low()
if (is_in_unflushed_spaces)
{
- mutex_enter(&fil_system.mutex);
+ mysql_mutex_lock(&fil_system.mutex);
if (is_in_unflushed_spaces)
{
is_in_unflushed_spaces= false;
fil_system.unflushed_spaces.remove(*this);
}
- mutex_exit(&fil_system.mutex);
+ mysql_mutex_unlock(&fil_system.mutex);
}
clear_flush();
@@ -554,7 +543,7 @@ fil_space_extend_must_retry(
uint32_t size,
bool* success)
{
- ut_ad(mutex_own(&fil_system.mutex));
+ mysql_mutex_assert_owner(&fil_system.mutex);
ut_ad(UT_LIST_GET_LAST(space->chain) == node);
ut_ad(size >= FIL_IBD_FILE_INITIAL_SIZE);
ut_ad(node->space == space);
@@ -572,8 +561,8 @@ fil_space_extend_must_retry(
for it to finish.
It'd have been better to use event driven mechanism but
the entire module is peppered with polling stuff. */
- mutex_exit(&fil_system.mutex);
- os_thread_sleep(100000);
+ mysql_mutex_unlock(&fil_system.mutex);
+ std::this_thread::sleep_for(std::chrono::milliseconds(100));
return(true);
}
@@ -582,7 +571,7 @@ fil_space_extend_must_retry(
/* At this point it is safe to release fil_system.mutex. No
other thread can rename, delete, close or extend the file because
we have set the node->being_extended flag. */
- mutex_exit(&fil_system.mutex);
+ mysql_mutex_unlock(&fil_system.mutex);
ut_ad(size >= space->size);
@@ -603,7 +592,7 @@ fil_space_extend_must_retry(
os_offset_t(FIL_IBD_FILE_INITIAL_SIZE << srv_page_size_shift));
*success = os_file_set_size(node->name, node->handle, new_size,
- space->is_compressed());
+ node->punch_hole == 1);
os_has_said_disk_full = *success;
if (*success) {
@@ -619,7 +608,7 @@ fil_space_extend_must_retry(
last_page_no = uint32_t(fsize / page_size)
+ file_start_page_no;
}
- mutex_enter(&fil_system.mutex);
+ mysql_mutex_lock(&fil_system.mutex);
ut_a(node->being_extended);
node->being_extended = false;
@@ -639,10 +628,10 @@ fil_space_extend_must_retry(
srv_sys_space.set_last_file_size(pages_in_MiB);
do_flush:
space->reacquire();
- mutex_exit(&fil_system.mutex);
+ mysql_mutex_unlock(&fil_system.mutex);
space->flush_low();
space->release();
- mutex_enter(&fil_system.mutex);
+ mysql_mutex_lock(&fil_system.mutex);
break;
default:
ut_ad(space->purpose == FIL_TYPE_TABLESPACE
@@ -665,7 +654,7 @@ fil_space_extend_must_retry(
ATTRIBUTE_COLD bool fil_space_t::prepare_acquired()
{
ut_ad(referenced());
- ut_ad(mutex_own(&fil_system.mutex));
+ mysql_mutex_assert_owner(&fil_system.mutex);
fil_node_t *node= UT_LIST_GET_LAST(chain);
ut_ad(!id || purpose == FIL_TYPE_TEMPORARY ||
node == UT_LIST_GET_FIRST(chain));
@@ -674,13 +663,14 @@ ATTRIBUTE_COLD bool fil_space_t::prepare_acquired()
if (!is_open)
release();
+ else if (node->deferred);
else if (auto desired_size= recv_size)
{
bool success;
while (fil_space_extend_must_retry(this, node, desired_size, &success))
- mutex_enter(&fil_system.mutex);
+ mysql_mutex_lock(&fil_system.mutex);
- ut_ad(mutex_own(&fil_system.mutex));
+ mysql_mutex_assert_owner(&fil_system.mutex);
/* Crash recovery requires the file extension to succeed. */
ut_a(success);
/* InnoDB data files cannot shrink. */
@@ -707,7 +697,7 @@ ATTRIBUTE_COLD bool fil_space_t::prepare_acquired()
}
else
clear:
- n_pending.fetch_and(~CLOSING, std::memory_order_relaxed);
+ clear_closing();
return is_open;
}
@@ -715,10 +705,10 @@ clear:
/** @return whether the file is usable for io() */
ATTRIBUTE_COLD bool fil_space_t::acquire_and_prepare()
{
- mutex_enter(&fil_system.mutex);
+ mysql_mutex_lock(&fil_system.mutex);
const auto flags= acquire_low() & (STOPPING | CLOSING);
const bool is_open= !flags || (flags == CLOSING && prepare_acquired());
- mutex_exit(&fil_system.mutex);
+ mysql_mutex_unlock(&fil_system.mutex);
return is_open;
}
@@ -731,14 +721,14 @@ bool fil_space_extend(fil_space_t *space, uint32_t size)
ut_ad(!srv_read_only_mode || space->purpose == FIL_TYPE_TEMPORARY);
bool success= false;
const bool acquired= space->acquire();
- mutex_enter(&fil_system.mutex);
+ mysql_mutex_lock(&fil_system.mutex);
if (acquired || space->is_being_truncated)
{
while (fil_space_extend_must_retry(space, UT_LIST_GET_LAST(space->chain),
size, &success))
- mutex_enter(&fil_system.mutex);
+ mysql_mutex_lock(&fil_system.mutex);
}
- mutex_exit(&fil_system.mutex);
+ mysql_mutex_unlock(&fil_system.mutex);
if (acquired)
space->release();
return success;
@@ -747,8 +737,7 @@ bool fil_space_extend(fil_space_t *space, uint32_t size)
/** Prepare to free a file from fil_system. */
inline pfs_os_file_t fil_node_t::close_to_free(bool detach_handle)
{
- ut_ad(mutex_own(&fil_system.mutex));
- ut_a(magic_n == FIL_NODE_MAGIC_N);
+ mysql_mutex_assert_owner(&fil_system.mutex);
ut_a(!being_extended);
if (is_open() &&
@@ -756,10 +745,10 @@ inline pfs_os_file_t fil_node_t::close_to_free(bool detach_handle)
std::memory_order_acquire) &
fil_space_t::PENDING))
{
- mutex_exit(&fil_system.mutex);
+ mysql_mutex_unlock(&fil_system.mutex);
while (space->referenced())
- os_thread_sleep(100);
- mutex_enter(&fil_system.mutex);
+ std::this_thread::sleep_for(std::chrono::microseconds(100));
+ mysql_mutex_lock(&fil_system.mutex);
}
while (is_open())
@@ -787,11 +776,14 @@ inline pfs_os_file_t fil_node_t::close_to_free(bool detach_handle)
return OS_FILE_CLOSED;
}
-/** Detach a tablespace from the cache and close the files. */
-std::vector<pfs_os_file_t> fil_system_t::detach(fil_space_t *space,
- bool detach_handle)
+/** Detach a tablespace from the cache and close the files.
+@param space tablespace
+@param detach_handle whether to detach the handle, instead of closing
+@return detached handle
+@retval OS_FILE_CLOSED if no handle was detached */
+pfs_os_file_t fil_system_t::detach(fil_space_t *space, bool detach_handle)
{
- ut_ad(mutex_own(&fil_system.mutex));
+ mysql_mutex_assert_owner(&fil_system.mutex);
HASH_DELETE(fil_space_t, hash, &spaces, space->id, space);
if (space->is_in_unflushed_spaces)
@@ -806,16 +798,31 @@ std::vector<pfs_os_file_t> fil_system_t::detach(fil_space_t *space,
space->is_in_default_encrypt= false;
default_encrypt_tables.remove(*space);
}
- if (space_list_last_opened == space)
- space_list_last_opened = UT_LIST_GET_PREV(space_list, space);
- UT_LIST_REMOVE(space_list, space);
+
+ {
+ space_list_t::iterator s= space_list_t::iterator(space);
+ if (space_list_last_opened == space)
+ {
+ if (s == space_list.begin())
+ {
+ ut_ad(srv_operation > SRV_OPERATION_EXPORT_RESTORED ||
+ srv_shutdown_state > SRV_SHUTDOWN_NONE);
+ space_list_last_opened= nullptr;
+ }
+ else
+ {
+ space_list_t::iterator prev= s;
+ space_list_last_opened= &*--prev;
+ }
+ }
+ space_list.erase(s);
+ }
+
if (space == sys_space)
sys_space= nullptr;
else if (space == temp_space)
temp_space= nullptr;
- ut_a(space->magic_n == FIL_SPACE_MAGIC_N);
-
for (fil_node_t* node= UT_LIST_GET_FIRST(space->chain); node;
node= UT_LIST_GET_NEXT(chain, node))
if (node->is_open())
@@ -824,19 +831,17 @@ std::vector<pfs_os_file_t> fil_system_t::detach(fil_space_t *space,
n_open--;
}
- std::vector<pfs_os_file_t> handles;
- handles.reserve(UT_LIST_GET_LEN(space->chain));
+ ut_ad(!detach_handle || space->id);
+ ut_ad(!detach_handle || UT_LIST_GET_LEN(space->chain) <= 1);
+
+ pfs_os_file_t handle= OS_FILE_CLOSED;
for (fil_node_t* node= UT_LIST_GET_FIRST(space->chain); node;
node= UT_LIST_GET_NEXT(chain, node))
- {
- auto handle= node->close_to_free(detach_handle);
- if (handle != OS_FILE_CLOSED)
- handles.push_back(handle);
- }
+ handle= node->close_to_free(detach_handle);
ut_ad(!space->referenced());
- return handles;
+ return handle;
}
/** Free a tablespace object on which fil_system_t::detach() was invoked.
@@ -855,7 +860,7 @@ fil_space_free_low(
fil_system_t::detach(), the tablespace cannot be found, so
fil_space_t::get() would return NULL */
while (space->referenced()) {
- os_thread_sleep(100);
+ std::this_thread::sleep_for(std::chrono::microseconds(100));
}
for (fil_node_t* node = UT_LIST_GET_FIRST(space->chain);
@@ -869,7 +874,6 @@ fil_space_free_low(
ut_ad(space->size == 0);
- rw_lock_free(&space->latch);
fil_space_destroy_crypt_data(&space->crypt_data);
space->~fil_space_t();
@@ -889,18 +893,18 @@ fil_space_free(
{
ut_ad(id != TRX_SYS_SPACE);
- mutex_enter(&fil_system.mutex);
+ mysql_mutex_lock(&fil_system.mutex);
fil_space_t* space = fil_space_get_by_id(id);
if (space != NULL) {
fil_system.detach(space);
}
- mutex_exit(&fil_system.mutex);
+ mysql_mutex_unlock(&fil_system.mutex);
if (space != NULL) {
if (x_latched) {
- rw_lock_x_unlock(&space->latch);
+ space->x_unlock();
}
if (!recv_recovery_is_on()) {
@@ -911,7 +915,7 @@ fil_space_free(
if (space->max_lsn != 0) {
ut_d(space->max_lsn = 0);
- UT_LIST_REMOVE(fil_system.named_spaces, space);
+ fil_system.named_spaces.remove(*space);
}
if (!recv_recovery_is_on()) {
@@ -934,7 +938,7 @@ fil_space_free(
@param opened true if space files are opened
@return pointer to created tablespace, to be filled in with add()
@retval nullptr on failure (such as when the same tablespace exists) */
-fil_space_t *fil_space_t::create(const char *name, ulint id, ulint flags,
+fil_space_t *fil_space_t::create(ulint id, ulint flags,
fil_type_t purpose,
fil_space_crypt_t *crypt_data,
fil_encryption_t mode,
@@ -953,49 +957,37 @@ fil_space_t *fil_space_t::create(const char *name, ulint id, ulint flags,
space= new (ut_zalloc_nokey(sizeof(*space))) fil_space_t;
space->id = id;
- space->name = mem_strdup(name);
UT_LIST_INIT(space->chain, &fil_node_t::chain);
space->purpose = purpose;
space->flags = flags;
- space->magic_n = FIL_SPACE_MAGIC_N;
space->crypt_data = crypt_data;
space->n_pending.store(CLOSING, std::memory_order_relaxed);
- DBUG_LOG("tablespace",
- "Created metadata for " << id << " name " << name);
+ DBUG_LOG("tablespace", "Created metadata for " << id);
if (crypt_data) {
DBUG_LOG("crypt",
- "Tablespace " << id << " name " << name
+ "Tablespace " << id
<< " encryption " << crypt_data->encryption
<< " key id " << crypt_data->key_id
<< ":" << fil_crypt_get_mode(crypt_data)
<< " " << fil_crypt_get_type(crypt_data));
}
- rw_lock_create(fil_space_latch_key, &space->latch, SYNC_FSP);
-
- if (space->purpose == FIL_TYPE_TEMPORARY) {
- /* SysTablespace::open_or_create() would pass
- size!=0 to fil_space_t::add(), so first_time_open
- would not hold in fil_node_open_file(), and we
- must assign this manually. We do not care about
- the durability or atomicity of writes to the
- temporary tablespace files. */
- space->atomic_write_supported = true;
- }
+ space->latch.SRW_LOCK_INIT(fil_space_latch_key);
- mutex_enter(&fil_system.mutex);
+ mysql_mutex_lock(&fil_system.mutex);
if (const fil_space_t *old_space = fil_space_get_by_id(id)) {
- ib::error() << "Trying to add tablespace '" << name
- << "' with id " << id
- << " to the tablespace memory cache, but tablespace '"
- << old_space->name << "' already exists in the cache!";
- mutex_exit(&fil_system.mutex);
- rw_lock_free(&space->latch);
+ ib::error() << "Trying to add tablespace with id " << id
+ << " to the cache, but tablespace '"
+ << (old_space->chain.start
+ ? old_space->chain.start->name
+ : "")
+ << "' already exists in the cache!";
+ mysql_mutex_unlock(&fil_system.mutex);
space->~fil_space_t();
ut_free(space);
return(NULL);
@@ -1006,7 +998,7 @@ fil_space_t *fil_space_t::create(const char *name, ulint id, ulint flags,
if (opened)
fil_system.add_opened_last_to_space_list(space);
else
- UT_LIST_ADD_LAST(fil_system.space_list, space);
+ fil_system.space_list.push_back(*space);
switch (id) {
case 0:
@@ -1027,32 +1019,27 @@ fil_space_t *fil_space_t::create(const char *name, ulint id, ulint flags,
}
if (!fil_system.space_id_reuse_warned) {
ib::warn() << "Allocated tablespace ID " << id
- << " for " << name << ", old maximum was "
+ << ", old maximum was "
<< fil_system.max_assigned_id;
}
fil_system.max_assigned_id = id;
}
- const bool rotate =
- (purpose == FIL_TYPE_TABLESPACE
- && (mode == FIL_ENCRYPTION_ON
- || mode == FIL_ENCRYPTION_OFF || srv_encrypt_tables)
- && fil_crypt_must_default_encrypt());
+ const bool rotate = purpose == FIL_TYPE_TABLESPACE
+ && (mode == FIL_ENCRYPTION_ON || mode == FIL_ENCRYPTION_OFF
+ || srv_encrypt_tables)
+ && fil_crypt_must_default_encrypt();
- /* Inform key rotation that there could be something
- to do */
if (rotate) {
- /* Key rotation is not enabled, need to inform background
- encryption threads. */
fil_system.default_encrypt_tables.push_back(*space);
space->is_in_default_encrypt = true;
}
- mutex_exit(&fil_system.mutex);
+ mysql_mutex_unlock(&fil_system.mutex);
if (rotate && srv_n_fil_crypt_threads_started) {
- os_event_set(fil_crypt_threads_event);
+ fil_crypt_threads_signal();
}
return(space);
@@ -1071,7 +1058,7 @@ fil_assign_new_space_id(
ulint id;
bool success;
- mutex_enter(&fil_system.mutex);
+ mysql_mutex_lock(&fil_system.mutex);
id = *space_id;
@@ -1103,7 +1090,7 @@ fil_assign_new_space_id(
*space_id = ULINT_UNDEFINED;
}
- mutex_exit(&fil_system.mutex);
+ mysql_mutex_unlock(&fil_system.mutex);
return(success);
}
@@ -1113,7 +1100,7 @@ fil_assign_new_space_id(
bool fil_space_t::read_page0()
{
ut_ad(fil_system.is_initialised());
- ut_ad(mutex_own(&fil_system.mutex));
+ mysql_mutex_assert_owner(&fil_system.mutex);
if (size)
return true;
@@ -1144,7 +1131,7 @@ static fil_space_t *fil_space_get_space(ulint id)
void fil_space_set_recv_size_and_flags(ulint id, uint32_t size, uint32_t flags)
{
ut_ad(id < SRV_SPACE_ID_UPPER_BOUND);
- mutex_enter(&fil_system.mutex);
+ mysql_mutex_lock(&fil_system.mutex);
if (fil_space_t *space= fil_space_get_space(id))
{
if (size)
@@ -1152,7 +1139,7 @@ void fil_space_set_recv_size_and_flags(ulint id, uint32_t size, uint32_t flags)
if (flags != FSP_FLAGS_FCRC32_MASK_MARKER)
space->flags= flags;
}
- mutex_exit(&fil_system.mutex);
+ mysql_mutex_unlock(&fil_system.mutex);
}
/** Open each file. Never invoked on .ibd files.
@@ -1166,7 +1153,7 @@ bool fil_space_t::open(bool create_new_db)
bool success= true;
bool skip_read= create_new_db;
- mutex_enter(&fil_system.mutex);
+ mysql_mutex_lock(&fil_system.mutex);
for (fil_node_t *node= UT_LIST_GET_FIRST(chain); node;
node= UT_LIST_GET_NEXT(chain, node))
@@ -1202,7 +1189,7 @@ err_exit:
if (!create_new_db)
committed_size= size;
- mutex_exit(&fil_system.mutex);
+ mysql_mutex_unlock(&fil_system.mutex);
return success;
}
@@ -1213,7 +1200,7 @@ void fil_space_t::close()
return;
}
- mutex_enter(&fil_system.mutex);
+ mysql_mutex_lock(&fil_system.mutex);
ut_ad(this == fil_system.temp_space
|| srv_operation == SRV_OPERATION_BACKUP
|| srv_operation == SRV_OPERATION_RESTORE
@@ -1227,7 +1214,7 @@ void fil_space_t::close()
}
}
- mutex_exit(&fil_system.mutex);
+ mysql_mutex_unlock(&fil_system.mutex);
}
void fil_system_t::create(ulint hash_size)
@@ -1245,7 +1232,7 @@ void fil_system_t::create(ulint hash_size)
ut_ad(hash_size > 0);
- mutex_create(LATCH_ID_FIL_SYSTEM, &mutex);
+ mysql_mutex_init(fil_system_mutex_key, &mutex, nullptr);
spaces.create(hash_size);
@@ -1316,7 +1303,7 @@ void fil_system_t::close()
{
ut_ad(this == &fil_system);
ut_a(unflushed_spaces.empty());
- ut_a(!UT_LIST_GET_LEN(space_list));
+ ut_a(space_list.empty());
ut_ad(!sys_space);
ut_ad(!temp_space);
@@ -1324,7 +1311,7 @@ void fil_system_t::close()
{
m_initialised= false;
spaces.free();
- mutex_free(&mutex);
+ mysql_mutex_destroy(&mutex);
fil_space_crypt_cleanup();
}
@@ -1339,9 +1326,9 @@ void fil_system_t::close()
void fil_system_t::add_opened_last_to_space_list(fil_space_t *space)
{
if (UNIV_LIKELY(space_list_last_opened != nullptr))
- UT_LIST_INSERT_AFTER(space_list, space_list_last_opened, space);
+ space_list.insert(++space_list_t::iterator(space_list_last_opened), *space);
else
- UT_LIST_ADD_FIRST(space_list, space);
+ space_list.push_front(*space);
space_list_last_opened= space;
}
@@ -1349,87 +1336,86 @@ void fil_system_t::add_opened_last_to_space_list(fil_space_t *space)
ATTRIBUTE_COLD void fil_system_t::extend_to_recv_size()
{
ut_ad(is_initialised());
- mutex_enter(&mutex);
- for (fil_space_t *space= UT_LIST_GET_FIRST(fil_system.space_list); space;
- space= UT_LIST_GET_NEXT(space_list, space))
+ mysql_mutex_lock(&mutex);
+ for (fil_space_t &space : fil_system.space_list)
{
- const uint32_t size= space->recv_size;
+ const uint32_t size= space.recv_size;
- if (size > space->size)
+ if (size > space.size)
{
- if (space->is_closing())
+ if (space.is_closing())
continue;
- space->reacquire();
+ space.reacquire();
bool success;
- while (fil_space_extend_must_retry(space, UT_LIST_GET_LAST(space->chain),
+ while (fil_space_extend_must_retry(&space, UT_LIST_GET_LAST(space.chain),
size, &success))
- mutex_enter(&mutex);
+ mysql_mutex_lock(&mutex);
/* Crash recovery requires the file extension to succeed. */
ut_a(success);
- space->release();
+ space.release();
}
}
- mutex_exit(&mutex);
+ mysql_mutex_unlock(&mutex);
}
/** Close all tablespace files at shutdown */
void fil_space_t::close_all()
{
- if (!fil_system.is_initialised()) {
- return;
- }
-
- fil_space_t* space;
-
- /* At shutdown, we should not have any files in this list. */
- ut_ad(srv_fast_shutdown == 2
- || !srv_was_started
- || UT_LIST_GET_LEN(fil_system.named_spaces) == 0);
- fil_flush_file_spaces();
-
- mutex_enter(&fil_system.mutex);
+ if (!fil_system.is_initialised())
+ return;
- for (space = UT_LIST_GET_FIRST(fil_system.space_list); space; ) {
- fil_node_t* node;
- fil_space_t* prev_space = space;
+ /* At shutdown, we should not have any files in this list. */
+ ut_ad(srv_fast_shutdown == 2 || !srv_was_started ||
+ fil_system.named_spaces.empty());
+ fil_flush_file_spaces();
- for (node = UT_LIST_GET_FIRST(space->chain);
- node != NULL;
- node = UT_LIST_GET_NEXT(chain, node)) {
+ mysql_mutex_lock(&fil_system.mutex);
- if (!node->is_open()) {
-next:
- continue;
- }
+ while (!fil_system.space_list.empty())
+ {
+ fil_space_t &space= fil_system.space_list.front();
- for (ulint count = 10000; count--; ) {
- if (!space->set_closing()) {
- node->close();
- goto next;
- }
- mutex_exit(&fil_system.mutex);
- os_thread_sleep(100);
- mutex_enter(&fil_system.mutex);
- if (!node->is_open()) {
- goto next;
- }
- }
+ for (fil_node_t *node= UT_LIST_GET_FIRST(space.chain); node != NULL;
+ node= UT_LIST_GET_NEXT(chain, node))
+ {
- ib::error() << "File '" << node->name
- << "' has " << space->referenced()
- << " operations";
- }
+ if (!node->is_open())
+ {
+ next:
+ continue;
+ }
+
+ for (ulint count= 10000; count--;)
+ {
+ const auto n= space.set_closing();
+ if (n & STOPPING)
+ goto next;
+ if (!(n & (PENDING | NEEDS_FSYNC)))
+ {
+ node->close();
+ goto next;
+ }
+ mysql_mutex_unlock(&fil_system.mutex);
+ std::this_thread::sleep_for(std::chrono::microseconds(100));
+ mysql_mutex_lock(&fil_system.mutex);
+ if (!node->is_open())
+ goto next;
+ }
+
+ ib::error() << "File '" << node->name << "' has " << space.referenced()
+ << " operations";
+ }
- space = UT_LIST_GET_NEXT(space_list, space);
- fil_system.detach(prev_space);
- fil_space_free_low(prev_space);
- }
+ fil_system.detach(&space);
+ mysql_mutex_unlock(&fil_system.mutex);
+ fil_space_free_low(&space);
+ mysql_mutex_lock(&fil_system.mutex);
+ }
- mutex_exit(&fil_system.mutex);
+ mysql_mutex_unlock(&fil_system.mutex);
- ut_ad(srv_fast_shutdown == 2
- || !srv_was_started
- || UT_LIST_GET_LEN(fil_system.named_spaces) == 0);
+ ut_ad(srv_fast_shutdown == 2 || !srv_was_started ||
+ fil_system.named_spaces.empty());
}
/*******************************************************************//**
@@ -1440,18 +1426,16 @@ fil_set_max_space_id_if_bigger(
/*===========================*/
ulint max_id) /*!< in: maximum known id */
{
- if (max_id >= SRV_SPACE_ID_UPPER_BOUND) {
- ib::fatal() << "Max tablespace id is too high, " << max_id;
- }
+ ut_a(max_id < SRV_SPACE_ID_UPPER_BOUND);
- mutex_enter(&fil_system.mutex);
+ mysql_mutex_lock(&fil_system.mutex);
if (fil_system.max_assigned_id < max_id) {
fil_system.max_assigned_id = max_id;
}
- mutex_exit(&fil_system.mutex);
+ mysql_mutex_unlock(&fil_system.mutex);
}
/** Write the flushed LSN to the page header of the first page in the
@@ -1488,8 +1472,6 @@ fil_write_flushed_lsn(
fio = fil_system.sys_space->io(IORequestWrite,
0, srv_page_size, buf);
fil_flush_file_spaces();
- } else {
- fil_system.sys_space->release();
}
aligned_free(buf);
@@ -1502,7 +1484,7 @@ fil_write_flushed_lsn(
@retval nullptr if the tablespace is missing or inaccessible */
fil_space_t *fil_space_t::get(ulint id)
{
- mutex_enter(&fil_system.mutex);
+ mysql_mutex_lock(&fil_system.mutex);
fil_space_t *space= fil_space_get_by_id(id);
const uint32_t n= space ? space->acquire_low() : 0;
@@ -1511,7 +1493,7 @@ fil_space_t *fil_space_t::get(ulint id)
else if ((n & CLOSING) && !space->prepare_acquired())
space= nullptr;
- mutex_exit(&fil_system.mutex);
+ mysql_mutex_unlock(&fil_system.mutex);
return space;
}
@@ -1528,11 +1510,11 @@ inline void mtr_t::log_file_op(mfile_type_t type, ulint space_id,
/* fil_name_parse() requires that there be at least one path
separator and that the file path end with ".ibd". */
- ut_ad(strchr(path, OS_PATH_SEPARATOR) != NULL);
+ ut_ad(strchr(path, '/'));
ut_ad(!strcmp(&path[strlen(path) - strlen(DOT_IBD)], DOT_IBD));
- flag_modified();
- if (m_log_mode != MTR_LOG_ALL)
+ m_modifications= true;
+ if (!is_logged())
return;
m_last= nullptr;
@@ -1566,7 +1548,7 @@ inline void mtr_t::log_file_op(mfile_type_t type, ulint space_id,
if (type == FILE_RENAME)
{
- ut_ad(strchr(new_path, OS_PATH_SEPARATOR));
+ ut_ad(strchr(new_path, '/'));
m_log.push(reinterpret_cast<const byte*>(path), uint32_t(len + 1));
m_log.push(reinterpret_cast<const byte*>(new_path), uint32_t(new_len));
}
@@ -1574,40 +1556,6 @@ inline void mtr_t::log_file_op(mfile_type_t type, ulint space_id,
m_log.push(reinterpret_cast<const byte*>(path), uint32_t(len));
}
-/** Write redo log for renaming a file.
-@param[in] space_id tablespace id
-@param[in] old_name tablespace file name
-@param[in] new_name tablespace file name after renaming
-@param[in,out] mtr mini-transaction */
-static
-void
-fil_name_write_rename_low(
- ulint space_id,
- const char* old_name,
- const char* new_name,
- mtr_t* mtr)
-{
- ut_ad(!is_predefined_tablespace(space_id));
- mtr->log_file_op(FILE_RENAME, space_id, old_name, new_name);
-}
-
-/** Write redo log for renaming a file.
-@param[in] space_id tablespace id
-@param[in] old_name tablespace file name
-@param[in] new_name tablespace file name after renaming */
-static void
-fil_name_write_rename(
- ulint space_id,
- const char* old_name,
- const char* new_name)
-{
- mtr_t mtr;
- mtr.start();
- fil_name_write_rename_low(space_id, old_name, new_name, &mtr);
- mtr.commit();
- log_write_up_to(mtr.commit_lsn(), true);
-}
-
/** Write FILE_MODIFY for a file.
@param[in] space_id tablespace id
@param[in] name tablespace file name
@@ -1623,148 +1571,67 @@ fil_name_write(
mtr->log_file_op(FILE_MODIFY, space_id, name);
}
-/** Check for pending operations.
-@param[in] space tablespace
-@param[in] count number of attempts so far
-@return 0 if no operations else count + 1. */
-static ulint fil_check_pending_ops(const fil_space_t* space, ulint count)
-{
- ut_ad(mutex_own(&fil_system.mutex));
-
- if (!space) {
- return 0;
- }
-
- if (auto n_pending_ops = space->referenced()) {
-
- /* Give a warning every 10 second, starting after 1 second */
- if ((count % 500) == 50) {
- ib::warn() << "Trying to delete"
- " tablespace '" << space->name
- << "' but there are " << n_pending_ops
- << " pending operations on it.";
- }
-
- return(count + 1);
- }
-
- return(0);
-}
-
-/*******************************************************************//**
-Check for pending IO.
-@return 0 if no pending else count + 1. */
-static
-ulint
-fil_check_pending_io(
-/*=================*/
- fil_space_t* space, /*!< in/out: Tablespace to check */
- fil_node_t** node, /*!< out: Node in space list */
- ulint count) /*!< in: number of attempts so far */
+fil_space_t *fil_space_t::check_pending_operations(ulint id)
{
- ut_ad(mutex_own(&fil_system.mutex));
-
- /* The following code must change when InnoDB supports
- multiple datafiles per tablespace. */
- ut_ad(UT_LIST_GET_LEN(space->chain) == 1);
-
- *node = UT_LIST_GET_FIRST(space->chain);
-
- if (const uint32_t p = space->referenced()) {
- ut_a(!(*node)->being_extended);
-
- /* Give a warning every 10 second, starting after 1 second */
- if ((count % 500) == 50) {
- ib::info() << "Trying to delete"
- " tablespace '" << space->name
- << "' but there are " << p
- << " pending i/o's on it.";
- }
-
- return(count + 1);
- }
-
- return(0);
-}
-
-/*******************************************************************//**
-Check pending operations on a tablespace.
-@return tablespace */
-static
-fil_space_t*
-fil_check_pending_operations(
-/*=========================*/
- ulint id, /*!< in: space id */
- bool truncate, /*!< in: whether to truncate a file */
- char** path) /*!< out/own: tablespace path */
-{
- ulint count = 0;
-
- ut_a(!is_system_tablespace(id));
- mutex_enter(&fil_system.mutex);
- fil_space_t* sp = fil_space_get_by_id(id);
-
- if (sp) {
- sp->set_stopping(true);
- if (sp->crypt_data) {
- sp->reacquire();
- mutex_exit(&fil_system.mutex);
- fil_space_crypt_close_tablespace(sp);
- mutex_enter(&fil_system.mutex);
- sp->release();
- }
- }
-
- /* Check for pending operations. */
-
- do {
- count = fil_check_pending_ops(sp, count);
-
- mutex_exit(&fil_system.mutex);
-
- if (count) {
- os_thread_sleep(20000); // Wait 0.02 seconds
- } else if (!sp) {
- return nullptr;
- }
-
- mutex_enter(&fil_system.mutex);
-
- sp = fil_space_get_by_id(id);
- } while (count);
-
- /* Check for pending IO. */
-
- for (;;) {
- if (truncate) {
- sp->is_being_truncated = true;
- }
-
- fil_node_t* node;
-
- count = fil_check_pending_io(sp, &node, count);
-
- if (count == 0 && path) {
- *path = mem_strdup(node->name);
- }
-
- mutex_exit(&fil_system.mutex);
+ ut_a(!is_system_tablespace(id));
+ mysql_mutex_lock(&fil_system.mutex);
+ fil_space_t *space= fil_space_get_by_id(id);
- if (count == 0) {
- break;
- }
+ if (!space)
+ {
+ mysql_mutex_unlock(&fil_system.mutex);
+ return nullptr;
+ }
- os_thread_sleep(20000); // Wait 0.02 seconds
- mutex_enter(&fil_system.mutex);
- sp = fil_space_get_by_id(id);
+ if (space->pending() & STOPPING)
+ {
+being_deleted:
+ /* A thread executing DDL and another thread executing purge may
+ be executing fil_delete_tablespace() concurrently for the same
+ tablespace. Wait for the other thread to complete the operation. */
+ for (ulint count= 0;; count++)
+ {
+ space= fil_space_get_by_id(id);
+ ut_ad(!space || space->is_stopping());
+ mysql_mutex_unlock(&fil_system.mutex);
+ if (!space)
+ return nullptr;
+ /* Issue a warning every 10.24 seconds, starting after 2.56 seconds */
+ if ((count & 511) == 128)
+ sql_print_warning("InnoDB: Waiting for tablespace " ULINTPF
+ " to be deleted", id);
+ std::this_thread::sleep_for(std::chrono::milliseconds(20));
+ mysql_mutex_lock(&fil_system.mutex);
+ }
+ }
+ else
+ {
+ if (space->crypt_data)
+ {
+ space->reacquire();
+ mysql_mutex_unlock(&fil_system.mutex);
+ fil_space_crypt_close_tablespace(space);
+ mysql_mutex_lock(&fil_system.mutex);
+ space->release();
+ }
+ if (space->set_stopping_check())
+ goto being_deleted;
+ }
- if (!sp) {
- mutex_exit(&fil_system.mutex);
- break;
- }
- }
+ mysql_mutex_unlock(&fil_system.mutex);
- return sp;
+ for (ulint count= 0;; count++)
+ {
+ const unsigned pending= space->referenced();
+ if (!pending)
+ return space;
+ /* Issue a warning every 10.24 seconds, starting after 2.56 seconds */
+ if ((count & 511) == 128)
+ sql_print_warning("InnoDB: Trying to delete tablespace '%s' "
+ "but there are %u pending operations",
+ space->chain.start->name, id);
+ std::this_thread::sleep_for(std::chrono::milliseconds(20));
+ }
}
/** Close a single-table tablespace on failed IMPORT TABLESPACE.
@@ -1773,13 +1640,12 @@ Free all pages used by the tablespace. */
void fil_close_tablespace(ulint id)
{
ut_ad(!is_system_tablespace(id));
- char* path = nullptr;
- fil_space_t* space = fil_check_pending_operations(id, false, &path);
+ fil_space_t* space = fil_space_t::check_pending_operations(id);
if (!space) {
return;
}
- rw_lock_x_lock(&space->latch);
+ space->x_lock();
/* Invalidate in the buffer pool all pages belonging to the
tablespace. Since we have invoked space->set_stopping(), readahead
@@ -1789,179 +1655,65 @@ void fil_close_tablespace(ulint id)
while (buf_flush_list_space(space));
ut_ad(space->is_stopping());
- /* If the free is successful, the X lock will be released before
- the space memory data structure is freed. */
-
- if (!fil_space_free(id, true)) {
- rw_lock_x_unlock(&space->latch);
- }
-
/* If it is a delete then also delete any generated files, otherwise
when we drop the database the remove directory will fail. */
- if (char* cfg_name = fil_make_filepath(path, NULL, CFG, false)) {
+ if (char* cfg_name = fil_make_filepath(space->chain.start->name,
+ fil_space_t::name_type{},
+ CFG, false)) {
os_file_delete_if_exists(innodb_data_file_key, cfg_name, NULL);
ut_free(cfg_name);
}
- ut_free(path);
-}
-
-/** Delete a tablespace and associated .ibd file.
-@param[in] id tablespace identifier
-@param[in] if_exists whether to ignore missing tablespace
-@param[in,out] detached_handles return detached handles if not nullptr
-@return DB_SUCCESS or error */
-dberr_t fil_delete_tablespace(ulint id, bool if_exists,
- std::vector<pfs_os_file_t>* detached_handles)
-{
- char* path = NULL;
- ut_ad(!is_system_tablespace(id));
- ut_ad(!detached_handles || detached_handles->empty());
-
- dberr_t err;
- fil_space_t *space = fil_check_pending_operations(id, false, &path);
-
- if (!space) {
- err = DB_TABLESPACE_NOT_FOUND;
- if (!if_exists) {
- ib::error() << "Cannot delete tablespace " << id
- << " because it is not found"
- " in the tablespace memory cache.";
- }
-
- goto func_exit;
- }
-
- /* IMPORTANT: Because we have set space::stop_new_ops there
- can't be any new reads or flushes. We are here
- because node::n_pending was zero above. However, it is still
- possible to have pending read and write requests:
-
- A read request can happen because the reader thread has
- gone through the ::stop_new_ops check in buf_page_init_for_read()
- before the flag was set and has not yet incremented ::n_pending
- when we checked it above.
-
- A write request can be issued any time because we don't check
- fil_space_t::is_stopping() when queueing a block for write.
-
- We deal with pending write requests in the following function
- where we'd minimally evict all dirty pages belonging to this
- space from the flush_list. Note that if a block is IO-fixed
- we'll wait for IO to complete.
-
- To deal with potential read requests, we will check the
- is_stopping() in fil_space_t::io(). */
-
- err = DB_SUCCESS;
- buf_flush_remove_pages(id);
-
- /* If it is a delete then also delete any generated files, otherwise
- when we drop the database the remove directory will fail. */
- {
- /* Before deleting the file, write a log record about
- it, so that InnoDB crash recovery will expect the file
- to be gone. */
- mtr_t mtr;
-
- mtr.start();
- mtr.log_file_op(FILE_DELETE, id, path);
- mtr.commit();
- /* Even if we got killed shortly after deleting the
- tablespace file, the record must have already been
- written to the redo log. */
- log_write_up_to(mtr.commit_lsn(), true);
-
- char* cfg_name = fil_make_filepath(path, NULL, CFG, false);
- if (cfg_name != NULL) {
- os_file_delete_if_exists(innodb_data_file_key, cfg_name, NULL);
- ut_free(cfg_name);
- }
- }
-
- /* Delete the link file pointing to the ibd file we are deleting. */
- if (FSP_FLAGS_HAS_DATA_DIR(space->flags)) {
- RemoteDatafile::delete_link_file(space->name);
- }
-
- mutex_enter(&fil_system.mutex);
-
- /* Double check the sanity of pending ops after reacquiring
- the fil_system::mutex. */
- if (const fil_space_t* s = fil_space_get_by_id(id)) {
- ut_a(s == space);
- ut_a(!space->referenced());
- ut_a(UT_LIST_GET_LEN(space->chain) == 1);
- auto handles = fil_system.detach(space,
- detached_handles != nullptr);
- if (detached_handles) {
- *detached_handles = std::move(handles);
- }
- mutex_exit(&fil_system.mutex);
-
- mysql_mutex_lock(&log_sys.mutex);
-
- if (space->max_lsn != 0) {
- ut_d(space->max_lsn = 0);
- UT_LIST_REMOVE(fil_system.named_spaces, space);
- }
-
- mysql_mutex_unlock(&log_sys.mutex);
- fil_space_free_low(space);
-
- if (!os_file_delete(innodb_data_file_key, path)
- && !os_file_delete_if_exists(
- innodb_data_file_key, path, NULL)) {
-
- /* Note: This is because we have removed the
- tablespace instance from the cache. */
+ /* If the free is successful, the wrlock will be released before
+ the space memory data structure is freed. */
- err = DB_IO_ERROR;
- }
- } else {
- mutex_exit(&fil_system.mutex);
- err = DB_TABLESPACE_NOT_FOUND;
+ if (!fil_space_free(id, true)) {
+ space->x_unlock();
}
-
-func_exit:
- ut_free(path);
- ibuf_delete_for_discarded_space(id);
- return(err);
}
-/** Prepare to truncate an undo tablespace.
-@param[in] space_id undo tablespace id
-@return the tablespace
-@retval NULL if tablespace not found */
-fil_space_t *fil_truncate_prepare(ulint space_id)
-{
- return fil_check_pending_operations(space_id, true, nullptr);
+/** Delete a tablespace and associated .ibd file.
+@param id tablespace identifier
+@return detached file handle (to be closed by the caller)
+@return OS_FILE_CLOSED if no file existed */
+pfs_os_file_t fil_delete_tablespace(ulint id)
+{
+ ut_ad(!is_system_tablespace(id));
+ pfs_os_file_t handle= OS_FILE_CLOSED;
+ if (fil_space_t *space= fil_space_t::check_pending_operations(id))
+ {
+ /* Before deleting the file(s), persistently write a log record. */
+ mtr_t mtr;
+ mtr.start();
+ mtr.log_file_op(FILE_DELETE, id, space->chain.start->name);
+ mtr.commit_file(*space, nullptr, &handle);
+ fil_space_free_low(space);
+ }
+
+ ibuf_delete_for_discarded_space(id);
+ return handle;
}
/*******************************************************************//**
Allocates and builds a file name from a path, a table or tablespace name
and a suffix. The string must be freed by caller with ut_free().
@param[in] path NULL or the directory path or the full path and filename.
-@param[in] name NULL if path is full, or Table/Tablespace name
-@param[in] suffix NULL or the file extention to use.
+@param[in] name {} if path is full, or Table/Tablespace name
+@param[in] ext the file extension to use
@param[in] trim_name true if the last name on the path should be trimmed.
@return own: file name */
-char*
-fil_make_filepath(
- const char* path,
- const char* name,
- ib_extention ext,
- bool trim_name)
+char* fil_make_filepath(const char *path, const fil_space_t::name_type &name,
+ ib_extention ext, bool trim_name)
{
/* The path may contain the basename of the file, if so we do not
need the name. If the path is NULL, we can use the default path,
but there needs to be a name. */
- ut_ad(path != NULL || name != NULL);
+ ut_ad(path || name.data());
/* If we are going to strip a name off the path, there better be a
path and a new name to put back on. */
- ut_ad(!trim_name || (path != NULL && name != NULL));
+ ut_ad(!trim_name || (path && name.data()));
if (path == NULL) {
path = fil_path_to_mysql_datadir;
@@ -1969,20 +1721,20 @@ fil_make_filepath(
ulint len = 0; /* current length */
ulint path_len = strlen(path);
- ulint name_len = (name ? strlen(name) : 0);
const char* suffix = dot_ext[ext];
ulint suffix_len = strlen(suffix);
- ulint full_len = path_len + 1 + name_len + suffix_len + 1;
+ ulint full_len = path_len + 1 + name.size() + suffix_len + 1;
char* full_name = static_cast<char*>(ut_malloc_nokey(full_len));
if (full_name == NULL) {
return NULL;
}
- /* If the name is a relative path, do not prepend "./". */
+ /* If the name is a relative or absolute path, do not prepend "./". */
if (path[0] == '.'
- && (path[1] == '\0' || path[1] == OS_PATH_SEPARATOR)
- && name != NULL && name[0] == '.') {
+ && (path[1] == '\0' || path[1] == '/' IF_WIN(|| path[1] == '\\',))
+ && name.size() && (name.data()[0] == '.'
+ || is_absolute_path(name.data()))) {
path = NULL;
path_len = 0;
}
@@ -1993,30 +1745,35 @@ fil_make_filepath(
}
full_name[len] = '\0';
- os_normalize_path(full_name);
if (trim_name) {
/* Find the offset of the last DIR separator and set it to
null in order to strip off the old basename from this path. */
- char* last_dir_sep = strrchr(full_name, OS_PATH_SEPARATOR);
+ char* last_dir_sep = strrchr(full_name, '/');
+#ifdef _WIN32
+ if (char *last = strrchr(full_name, '\\')) {
+ if (last > last_dir_sep) {
+ last_dir_sep = last;
+ }
+ }
+#endif
if (last_dir_sep) {
last_dir_sep[0] = '\0';
len = strlen(full_name);
}
}
- if (name != NULL) {
- if (len && full_name[len - 1] != OS_PATH_SEPARATOR) {
+ if (name.size()) {
+ if (len && full_name[len - 1] != '/') {
/* Add a DIR separator */
- full_name[len] = OS_PATH_SEPARATOR;
+ full_name[len] = '/';
full_name[++len] = '\0';
}
char* ptr = &full_name[len];
- memcpy(ptr, name, name_len);
- len += name_len;
+ memcpy(ptr, name.data(), name.size());
+ len += name.size();
full_name[len] = '\0';
- os_normalize_path(ptr);
}
/* Make sure that the specified suffix is at the end of the filepath
@@ -2044,196 +1801,67 @@ fil_make_filepath(
return(full_name);
}
-/** Test if a tablespace file can be renamed to a new filepath by checking
-if that the old filepath exists and the new filepath does not exist.
-@param[in] old_path old filepath
-@param[in] new_path new filepath
-@param[in] replace_new whether to ignore the existence of new_path
-@return innodb error code */
-static dberr_t
-fil_rename_tablespace_check(
- const char* old_path,
- const char* new_path,
- bool replace_new)
-{
- bool exists = false;
- os_file_type_t ftype;
-
- if (os_file_status(old_path, &exists, &ftype) && !exists) {
- ib::error() << "Cannot rename '" << old_path
- << "' to '" << new_path
- << "' because the source file"
- << " does not exist.";
- return(DB_TABLESPACE_NOT_FOUND);
- }
-
- exists = false;
- if (os_file_status(new_path, &exists, &ftype) && !exists) {
- return DB_SUCCESS;
- }
-
- if (!replace_new) {
- ib::error() << "Cannot rename '" << old_path
- << "' to '" << new_path
- << "' because the target file exists."
- " Remove the target file and try again.";
- return(DB_TABLESPACE_EXISTS);
- }
-
- /* This must be during the ROLLBACK of TRUNCATE TABLE.
- Because InnoDB only allows at most one data dictionary
- transaction at a time, and because this incomplete TRUNCATE
- would have created a new tablespace file, we must remove
- a possibly existing tablespace that is associated with the
- new tablespace file. */
-retry:
- mutex_enter(&fil_system.mutex);
- for (fil_space_t* space = UT_LIST_GET_FIRST(fil_system.space_list);
- space; space = UT_LIST_GET_NEXT(space_list, space)) {
- ulint id = space->id;
- if (id
- && space->purpose == FIL_TYPE_TABLESPACE
- && !strcmp(new_path,
- UT_LIST_GET_FIRST(space->chain)->name)) {
- ib::info() << "TRUNCATE rollback: " << id
- << "," << new_path;
- mutex_exit(&fil_system.mutex);
- dberr_t err = fil_delete_tablespace(id);
- if (err != DB_SUCCESS) {
- return err;
- }
- goto retry;
- }
- }
- mutex_exit(&fil_system.mutex);
- fil_delete_file(new_path);
-
- return(DB_SUCCESS);
-}
-
-dberr_t fil_space_t::rename(const char* name, const char* path, bool log,
- bool replace)
+char *fil_make_filepath(const char* path, const table_name_t name,
+ ib_extention suffix, bool strip_name)
{
- ut_ad(UT_LIST_GET_LEN(chain) == 1);
- ut_ad(!is_system_tablespace(id));
-
- if (log) {
- dberr_t err = fil_rename_tablespace_check(
- chain.start->name, path, replace);
- if (err != DB_SUCCESS) {
- return(err);
- }
- fil_name_write_rename(id, chain.start->name, path);
- }
-
- return fil_rename_tablespace(id, chain.start->name, name, path)
- ? DB_SUCCESS : DB_ERROR;
+ return fil_make_filepath(path, {name.m_name, strlen(name.m_name)},
+ suffix, strip_name);
}
-/** Rename a single-table tablespace.
-The tablespace must exist in the memory cache.
-@param[in] id tablespace identifier
-@param[in] old_path old file name
-@param[in] new_name new table name in the
-databasename/tablename format
-@param[in] new_path_in new file name,
-or NULL if it is located in the normal data directory
-@return true if success */
-static bool
-fil_rename_tablespace(
- ulint id,
- const char* old_path,
- const char* new_name,
- const char* new_path_in)
+dberr_t fil_space_t::rename(const char *path, bool log, bool replace)
{
- fil_space_t* space;
- fil_node_t* node;
- ut_a(id != 0);
-
- ut_ad(strchr(new_name, '/') != NULL);
+ ut_ad(UT_LIST_GET_LEN(chain) == 1);
+ ut_ad(!is_predefined_tablespace(id));
- mutex_enter(&fil_system.mutex);
+ const char *old_path= chain.start->name;
- space = fil_space_get_by_id(id);
+ ut_ad(strchr(old_path, '/'));
+ ut_ad(strchr(path, '/'));
- if (space == NULL) {
- ib::error() << "Cannot find space id " << id
- << " in the tablespace memory cache, though the file '"
- << old_path
- << "' in a rename operation should have that id.";
- mutex_exit(&fil_system.mutex);
- return(false);
- }
+ if (!strcmp(path, old_path))
+ return DB_SUCCESS;
- /* The following code must change when InnoDB supports
- multiple datafiles per tablespace. */
- ut_a(UT_LIST_GET_LEN(space->chain) == 1);
- node = UT_LIST_GET_FIRST(space->chain);
- space->reacquire();
-
- mutex_exit(&fil_system.mutex);
-
- char* new_file_name = new_path_in == NULL
- ? fil_make_filepath(NULL, new_name, IBD, false)
- : mem_strdup(new_path_in);
- char* old_file_name = node->name;
- char* new_space_name = mem_strdup(new_name);
- char* old_space_name = space->name;
-
- ut_ad(strchr(old_file_name, OS_PATH_SEPARATOR) != NULL);
- ut_ad(strchr(new_file_name, OS_PATH_SEPARATOR) != NULL);
-
- if (!recv_recovery_is_on()) {
- mysql_mutex_lock(&log_sys.mutex);
- }
-
- /* log_sys.mutex is above fil_system.mutex in the latching order */
- mysql_mutex_assert_owner(&log_sys.mutex);
- mutex_enter(&fil_system.mutex);
- space->release();
- ut_ad(space->name == old_space_name);
- ut_ad(node->name == old_file_name);
- bool success;
- DBUG_EXECUTE_IF("fil_rename_tablespace_failure_2",
- goto skip_second_rename; );
- success = os_file_rename(innodb_data_file_key,
- old_file_name,
- new_file_name);
- DBUG_EXECUTE_IF("fil_rename_tablespace_failure_2",
-skip_second_rename:
- success = false; );
-
- ut_ad(node->name == old_file_name);
-
- if (success) {
- node->name = new_file_name;
- }
-
- if (!recv_recovery_is_on()) {
- mysql_mutex_unlock(&log_sys.mutex);
- }
+ if (!log)
+ {
+ if (!os_file_rename(innodb_data_file_key, old_path, path))
+ return DB_ERROR;
+ mysql_mutex_lock(&fil_system.mutex);
+ ut_free(chain.start->name);
+ chain.start->name= mem_strdup(path);
+ mysql_mutex_unlock(&fil_system.mutex);
+ return DB_SUCCESS;
+ }
- ut_ad(space->name == old_space_name);
- if (success) {
- space->name = new_space_name;
- } else {
- /* Because nothing was renamed, we must free the new
- names, not the old ones. */
- old_file_name = new_file_name;
- old_space_name = new_space_name;
- }
+ bool exists= false;
+ os_file_type_t ftype;
- mutex_exit(&fil_system.mutex);
+ /* Check upfront if the rename operation might succeed, because we
+ must durably write redo log before actually attempting to execute
+ the rename in the file system. */
+ if (os_file_status(old_path, &exists, &ftype) && !exists)
+ {
+ sql_print_error("InnoDB: Cannot rename '%s' to '%s'"
+ " because the source file does not exist.",
+ old_path, path);
+ return DB_TABLESPACE_NOT_FOUND;
+ }
- ut_free(old_file_name);
- ut_free(old_space_name);
+ exists= false;
+ if (replace);
+ else if (!os_file_status(path, &exists, &ftype) || exists)
+ {
+ sql_print_error("InnoDB: Cannot rename '%s' to '%s'"
+ " because the target file exists.",
+ old_path, path);
+ return DB_TABLESPACE_EXISTS;
+ }
- return(success);
+ mtr_t mtr;
+ mtr.start();
+ mtr.log_file_op(FILE_RENAME, id, old_path, path);
+ return mtr.commit_file(*this, path) ? DB_SUCCESS : DB_ERROR;
}
-/* FIXME: remove this! */
-IF_WIN(, bool os_is_sparse_file_supported(os_file_t fh));
-
/** Create a tablespace file.
@param[in] space_id Tablespace ID
@param[in] name Tablespace name in dbname/tablename format.
@@ -2249,7 +1877,7 @@ must be >= FIL_IBD_FILE_INITIAL_SIZE
fil_space_t*
fil_ibd_create(
ulint space_id,
- const char* name,
+ const table_name_t name,
const char* path,
ulint flags,
uint32_t size,
@@ -2258,8 +1886,8 @@ fil_ibd_create(
dberr_t* err)
{
pfs_os_file_t file;
- byte* page;
bool success;
+ mtr_t mtr;
bool has_data_dir = FSP_FLAGS_HAS_DATA_DIR(flags) != 0;
ut_ad(!is_system_tablespace(space_id));
@@ -2275,6 +1903,11 @@ fil_ibd_create(
return NULL;
}
+ mtr.start();
+ mtr.log_file_op(FILE_CREATE, space_id, path);
+ mtr.commit();
+ log_write_up_to(mtr.commit_lsn(), true);
+
ulint type;
static_assert(((UNIV_ZIP_SIZE_MIN >> 1) << 3) == 4096,
"compatibility");
@@ -2315,129 +1948,70 @@ fil_ibd_create(
}
const bool is_compressed = fil_space_t::is_compressed(flags);
- bool punch_hole = is_compressed;
- fil_space_crypt_t* crypt_data = nullptr;
#ifdef _WIN32
+ const bool is_sparse = is_compressed;
if (is_compressed) {
os_file_set_sparse_win32(file);
}
+#else
+ const bool is_sparse = is_compressed
+ && DB_SUCCESS == os_file_punch_hole(file, 0, 4096)
+ && !my_test_if_thinly_provisioned(file);
#endif
- if (!os_file_set_size(
- path, file,
- os_offset_t(size) << srv_page_size_shift, is_compressed)) {
- *err = DB_OUT_OF_FILE_SPACE;
-err_exit:
- os_file_close(file);
- os_file_delete(innodb_data_file_key, path);
- free(crypt_data);
- return NULL;
- }
-
- /* FIXME: remove this */
- IF_WIN(, punch_hole = punch_hole && os_is_sparse_file_supported(file));
-
- /* We have to write the space id to the file immediately and flush the
- file to disk. This is because in crash recovery we must be aware what
- tablespaces exist and what are their space id's, so that we can apply
- the log records to the right file. It may take quite a while until
- buffer pool flush algorithms write anything to the file and flush it to
- disk. If we would not write here anything, the file would be filled
- with zeros from the call of os_file_set_size(), until a buffer pool
- flush would write to it. */
-
- /* Align the memory for file i/o if we might have O_DIRECT set */
- page = static_cast<byte*>(aligned_malloc(2 * srv_page_size,
- srv_page_size));
-
- memset(page, '\0', srv_page_size);
-
if (fil_space_t::full_crc32(flags)) {
flags |= FSP_FLAGS_FCRC32_PAGE_SSIZE();
} else {
flags |= FSP_FLAGS_PAGE_SSIZE();
}
- fsp_header_init_fields(page, space_id, flags);
- mach_write_to_4(page + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, space_id);
-
/* Create crypt data if the tablespace is either encrypted or user has
requested it to remain unencrypted. */
- crypt_data = (mode != FIL_ENCRYPTION_DEFAULT || srv_encrypt_tables)
+ fil_space_crypt_t* crypt_data = (mode != FIL_ENCRYPTION_DEFAULT
+ || srv_encrypt_tables)
? fil_space_create_crypt_data(mode, key_id)
- : NULL;
+ : nullptr;
- if (crypt_data) {
- /* Write crypt data information in page0 while creating
- ibd file. */
- crypt_data->fill_page0(flags, page);
- }
-
- if (ulint zip_size = fil_space_t::zip_size(flags)) {
- page_zip_des_t page_zip;
- page_zip_set_size(&page_zip, zip_size);
- page_zip.data = page + srv_page_size;
-#ifdef UNIV_DEBUG
- page_zip.m_start = 0;
-#endif /* UNIV_DEBUG */
- page_zip.m_end = 0;
- page_zip.m_nonempty = 0;
- page_zip.n_blobs = 0;
-
- buf_flush_init_for_writing(NULL, page, &page_zip, false);
-
- *err = os_file_write(IORequestWrite, path, file,
- page_zip.data, 0, zip_size);
- } else {
- buf_flush_init_for_writing(NULL, page, NULL,
- fil_space_t::full_crc32(flags));
-
- *err = os_file_write(IORequestWrite, path, file,
- page, 0, srv_page_size);
- }
-
- aligned_free(page);
-
- if (*err != DB_SUCCESS) {
- ib::error()
- << "Could not write the first page to"
- << " tablespace '" << path << "'";
- goto err_exit;
+ if (!os_file_set_size(path, file,
+ os_offset_t(size) << srv_page_size_shift,
+ is_sparse)) {
+ *err = DB_OUT_OF_FILE_SPACE;
+err_exit:
+ os_file_close(file);
+ os_file_delete(innodb_data_file_key, path);
+ free(crypt_data);
+ return nullptr;
}
- if (!os_file_flush(file)) {
- ib::error() << "File flush of tablespace '"
- << path << "' failed";
- *err = DB_ERROR;
- goto err_exit;
- }
+ fil_space_t::name_type space_name;
if (has_data_dir) {
/* Make the ISL file if the IBD file is not
in the default location. */
- *err = RemoteDatafile::create_link_file(name, path);
+ space_name = {name.m_name, strlen(name.m_name)};
+ *err = RemoteDatafile::create_link_file(space_name, path);
if (*err != DB_SUCCESS) {
goto err_exit;
}
}
- if (fil_space_t* space = fil_space_t::create(name, space_id, flags,
+ DBUG_EXECUTE_IF("checkpoint_after_file_create",
+ log_make_checkpoint(););
+
+ if (fil_space_t* space = fil_space_t::create(space_id, flags,
FIL_TYPE_TABLESPACE,
crypt_data, mode, true)) {
- space->punch_hole = punch_hole;
fil_node_t* node = space->add(path, file, size, false, true);
- mtr_t mtr;
+ IF_WIN(node->find_metadata(), node->find_metadata(file, true));
mtr.start();
- mtr.log_file_op(FILE_CREATE, space_id, node->name);
+ mtr.set_named_space(space);
+ ut_a(fsp_header_init(space, size, &mtr) == DB_SUCCESS);
mtr.commit();
-
- node->find_metadata(file);
- *err = DB_SUCCESS;
return space;
}
- if (has_data_dir) {
- RemoteDatafile::delete_link_file(name);
+ if (space_name.data()) {
+ RemoteDatafile::delete_link_file(space_name);
}
*err = DB_ERROR;
@@ -2450,7 +2024,7 @@ to the .err log. This function is used to open a tablespace when we start
mysqld after the dictionary has been booted, and also in IMPORT TABLESPACE.
NOTE that we assume this operation is used either at the database startup
-or under the protection of the dictionary mutex, so that two users cannot
+or under the protection of dict_sys.latch, so that two users cannot
race here. This operation does not leave the file associated with the
tablespace open, but closes it after we have looked at the space id in it.
@@ -2464,12 +2038,11 @@ a remote tablespace is found it will be changed to true.
If the fix_dict boolean is set, then it is safe to use an internal SQL
statement to update the dictionary tables if they are incorrect.
-@param[in] validate true if we should validate the tablespace
-@param[in] fix_dict true if the dictionary is available to be fixed
+@param[in] validate 0=maybe missing, 1=do not validate, 2=validate
@param[in] purpose FIL_TYPE_TABLESPACE or FIL_TYPE_TEMPORARY
@param[in] id tablespace ID
@param[in] flags expected FSP_SPACE_FLAGS
-@param[in] space_name tablespace name of the datafile
+@param[in] name table name
If file-per-table, it is the table name in the databasename/tablename format
@param[in] path_in expected filepath, usually read from dictionary
@param[out] err DB_SUCCESS or error code
@@ -2477,102 +2050,76 @@ If file-per-table, it is the table name in the databasename/tablename format
@retval NULL if the tablespace could not be opened */
fil_space_t*
fil_ibd_open(
- bool validate,
- bool fix_dict,
+ unsigned validate,
fil_type_t purpose,
ulint id,
ulint flags,
- const table_name_t& tablename,
+ fil_space_t::name_type name,
const char* path_in,
dberr_t* err)
{
- mutex_enter(&fil_system.mutex);
- if (fil_space_t* space = fil_space_get_by_id(id)) {
- if (strcmp(space->name, tablename.m_name)) {
- table_name_t space_name;
- space_name.m_name = space->name;
- ib::error()
- << "Trying to open table " << tablename
- << " with id " << id
- << ", conflicting with " << space_name;
- space = NULL;
- if (err) *err = DB_TABLESPACE_EXISTS;
- } else if (err) *err = DB_SUCCESS;
-
- mutex_exit(&fil_system.mutex);
-
- if (space && validate && !srv_read_only_mode) {
+ mysql_mutex_lock(&fil_system.mutex);
+ fil_space_t* space = fil_space_get_by_id(id);
+ mysql_mutex_unlock(&fil_system.mutex);
+ if (space) {
+ if (validate > 1 && !srv_read_only_mode) {
fsp_flags_try_adjust(space,
flags & ~FSP_FLAGS_MEM_MASK);
}
-
return space;
}
- mutex_exit(&fil_system.mutex);
- bool dict_filepath_same_as_default = false;
- bool link_file_found = false;
- bool link_file_is_bad = false;
- Datafile df_default; /* default location */
- Datafile df_dict; /* dictionary location */
- RemoteDatafile df_remote; /* remote location */
- ulint tablespaces_found = 0;
- ulint valid_tablespaces_found = 0;
-
- if (fix_dict) {
- ut_d(dict_sys.assert_locked());
- ut_ad(!srv_read_only_mode);
- ut_ad(srv_log_file_size != 0);
- }
+ dberr_t local_err = DB_SUCCESS;
/* Table flags can be ULINT_UNDEFINED if
dict_tf_to_fsp_flags_failure is set. */
if (flags == ULINT_UNDEFINED) {
corrupted:
- if (err) *err = DB_CORRUPTION;
- return NULL;
+ local_err = DB_CORRUPTION;
+func_exit:
+ if (err) *err = local_err;
+ return space;
}
ut_ad(fil_space_t::is_valid_flags(flags & ~FSP_FLAGS_MEM_MASK, id));
- df_default.init(tablename.m_name, flags);
- df_dict.init(tablename.m_name, flags);
- df_remote.init(tablename.m_name, flags);
+
+ Datafile df_default; /* default location */
+ RemoteDatafile df_remote; /* remote location */
+ ulint tablespaces_found = 0;
+ ulint valid_tablespaces_found = 0;
+
+ df_default.init(flags);
+ df_remote.init(flags);
/* Discover the correct file by looking in three possible locations
while avoiding unecessary effort. */
/* We will always look for an ibd in the default location. */
- df_default.make_filepath(NULL, tablename.m_name, IBD);
+ df_default.make_filepath(nullptr, name, IBD);
/* Look for a filepath embedded in an ISL where the default file
would be. */
- if (df_remote.open_read_only(true) == DB_SUCCESS) {
- ut_ad(df_remote.is_open());
-
- /* Always validate a file opened from an ISL pointer */
- validate = true;
- ++tablespaces_found;
- link_file_found = true;
- } else if (df_remote.filepath() != NULL) {
- /* An ISL file was found but contained a bad filepath in it.
- Better validate anything we do find. */
- validate = true;
- }
+ bool must_validate = df_remote.open_link_file(name);
- /* Attempt to open the tablespace at the dictionary filepath. */
- if (path_in) {
- if (df_default.same_filepath_as(path_in)) {
- dict_filepath_same_as_default = true;
+ if (must_validate) {
+ if (df_remote.open_read_only(true) == DB_SUCCESS) {
+ ut_ad(df_remote.is_open());
+ ++tablespaces_found;
} else {
- /* Dict path is not the default path. Always validate
- remote files. If default is opened, it was moved. */
- validate = true;
- df_dict.set_filepath(path_in);
- if (df_dict.open_read_only(true) == DB_SUCCESS) {
- ut_ad(df_dict.is_open());
- ++tablespaces_found;
- }
+ /* The following call prints an error message */
+ os_file_get_last_error(true);
+ ib::error() << "A link file was found named '"
+ << df_remote.link_filepath()
+ << "' but the linked tablespace '"
+ << df_remote.filepath()
+ << "' could not be opened read-only.";
}
+ } else if (path_in && !df_default.same_filepath_as(path_in)) {
+ /* Dict path is not the default path. Always validate
+ remote files. If default is opened, it was moved. */
+ must_validate = true;
+ } else if (validate > 1) {
+ must_validate = true;
}
const bool operation_not_for_export =
@@ -2582,8 +2129,20 @@ corrupted:
/* Always look for a file at the default location. But don't log
an error if the tablespace is already open in remote or dict. */
ut_a(df_default.filepath());
- const bool strict = operation_not_for_export
- && (tablespaces_found == 0);
+
+ /* Mariabackup will not copy files whose names start with
+ #sql-. We will suppress messages about such files missing on
+ the first server startup. The tables ought to be dropped by
+ drop_garbage_tables_after_restore() a little later. */
+
+ const bool strict = validate && !tablespaces_found
+ && operation_not_for_export
+ && !(srv_operation == SRV_OPERATION_NORMAL
+ && srv_start_after_restore
+ && srv_force_recovery < SRV_FORCE_NO_BACKGROUND
+ && dict_table_t::is_temporary_name(
+ df_default.filepath()));
+
if (df_default.open_read_only(strict) == DB_SUCCESS) {
ut_ad(df_default.is_open());
++tablespaces_found;
@@ -2597,21 +2156,13 @@ corrupted:
df_remote.delete_link_file();
df_remote.close();
}
- if (tablespaces_found > 1 && df_default.same_as(df_dict)) {
- --tablespaces_found;
- df_dict.close();
- }
- if (tablespaces_found > 1 && df_remote.same_as(df_dict)) {
- --tablespaces_found;
- df_dict.close();
- }
/* We have now checked all possible tablespace locations and
have a count of how many unique files we found. If things are
normal, we only found 1. */
/* For encrypted tablespace, we need to check the
encryption in header of first page. */
- if (!validate && tablespaces_found == 1) {
+ if (!must_validate && tablespaces_found == 1) {
goto skip_validate;
}
@@ -2623,47 +2174,47 @@ corrupted:
valid_tablespaces_found +=
(df_default.validate_to_dd(id, flags) == DB_SUCCESS);
- valid_tablespaces_found +=
- (df_dict.validate_to_dd(id, flags) == DB_SUCCESS);
-
/* Make sense of these three possible locations.
First, bail out if no tablespace files were found. */
if (valid_tablespaces_found == 0) {
- os_file_get_last_error(
- operation_not_for_export, !operation_not_for_export);
- if (operation_not_for_export)
- ib::error() << "Could not find a valid tablespace file for `"
- << tablename << "`. " << TROUBLESHOOT_DATADICT_MSG;
+ if (!strict
+ && IF_WIN(GetLastError() == ERROR_FILE_NOT_FOUND
+ || GetLastError() == ERROR_PATH_NOT_FOUND,
+ errno == ENOENT)) {
+ /* Suppress a message about a missing file. */
+ goto corrupted;
+ }
+
+ os_file_get_last_error(operation_not_for_export,
+ !operation_not_for_export);
+ if (!operation_not_for_export) {
+ goto corrupted;
+ }
+ sql_print_error("InnoDB: Could not find a valid tablespace"
+ " file for %.*s. %s",
+ static_cast<int>(name.size()), name.data(),
+ TROUBLESHOOT_DATADICT_MSG);
goto corrupted;
}
- if (!validate) {
+ if (!must_validate) {
goto skip_validate;
}
/* Do not open any tablespaces if more than one tablespace with
the correct space ID and flags were found. */
- if (tablespaces_found > 1) {
- ib::error() << "A tablespace for `" << tablename
- << "` has been found in multiple places;";
-
- if (df_default.is_open()) {
- ib::error() << "Default location: "
- << df_default.filepath()
- << ", Space ID=" << df_default.space_id()
- << ", Flags=" << df_default.flags();
- }
- if (df_remote.is_open()) {
- ib::error() << "Remote location: "
- << df_remote.filepath()
- << ", Space ID=" << df_remote.space_id()
- << ", Flags=" << df_remote.flags();
- }
- if (df_dict.is_open()) {
- ib::error() << "Dictionary location: "
- << df_dict.filepath()
- << ", Space ID=" << df_dict.space_id()
- << ", Flags=" << df_dict.flags();
- }
+ if (df_default.is_open() && df_remote.is_open()) {
+ ib::error()
+ << "A tablespace has been found in multiple places: "
+ << df_default.filepath()
+ << "(Space ID=" << df_default.space_id()
+ << ", Flags=" << df_default.flags()
+ << ") and "
+ << df_remote.filepath()
+ << "(Space ID=" << df_remote.space_id()
+ << ", Flags=" << df_remote.flags()
+ << (valid_tablespaces_found > 1 || srv_force_recovery
+ ? "); will not open"
+ : ")");
/* Force-recovery will allow some tablespaces to be
skipped by REDO if there was more than one file found.
@@ -2673,24 +2224,19 @@ corrupted:
recovery and there is only one good tablespace, ignore
any bad tablespaces. */
if (valid_tablespaces_found > 1 || srv_force_recovery > 0) {
- ib::error() << "Will not open tablespace `"
- << tablename << "`";
-
/* If the file is not open it cannot be valid. */
ut_ad(df_default.is_open() || !df_default.is_valid());
- ut_ad(df_dict.is_open() || !df_dict.is_valid());
ut_ad(df_remote.is_open() || !df_remote.is_valid());
/* Having established that, this is an easy way to
look for corrupted data files. */
if (df_default.is_open() != df_default.is_valid()
- || df_dict.is_open() != df_dict.is_valid()
|| df_remote.is_open() != df_remote.is_valid()) {
goto corrupted;
}
error:
- if (err) *err = DB_ERROR;
- return NULL;
+ local_err = DB_ERROR;
+ goto func_exit;
}
/* There is only one valid tablespace found and we did
@@ -2701,17 +2247,9 @@ error:
tablespaces_found--;
}
- if (df_dict.is_open() && !df_dict.is_valid()) {
- df_dict.close();
- /* Leave dict.filepath so that SYS_DATAFILES
- can be corrected below. */
- tablespaces_found--;
- }
-
if (df_remote.is_open() && !df_remote.is_valid()) {
df_remote.close();
tablespaces_found--;
- link_file_is_bad = true;
}
}
@@ -2719,78 +2257,9 @@ error:
ut_a(tablespaces_found == 1);
ut_a(valid_tablespaces_found == 1);
- /* Only fix the dictionary at startup when there is only one thread.
- Calls to dict_load_table() can be done while holding other latches. */
- if (!fix_dict) {
- goto skip_validate;
- }
-
- /* We may need to update what is stored in SYS_DATAFILES or
- SYS_TABLESPACES or adjust the link file. Since a failure to
- update SYS_TABLESPACES or SYS_DATAFILES does not prevent opening
- and using the tablespace either this time or the next, we do not
- check the return code or fail to open the tablespace. But if it
- fails, dict_update_filepath() will issue a warning to the log. */
- if (df_dict.filepath()) {
- ut_ad(path_in != NULL);
- ut_ad(df_dict.same_filepath_as(path_in));
-
- if (df_remote.is_open()) {
- if (!df_remote.same_filepath_as(path_in)) {
- dict_update_filepath(id, df_remote.filepath());
- }
-
- } else if (df_default.is_open()) {
- ut_ad(!dict_filepath_same_as_default);
- dict_update_filepath(id, df_default.filepath());
- if (link_file_is_bad) {
- RemoteDatafile::delete_link_file(
- tablename.m_name);
- }
-
- } else if (!link_file_found || link_file_is_bad) {
- ut_ad(df_dict.is_open());
- /* Fix the link file if we got our filepath
- from the dictionary but a link file did not
- exist or it did not point to a valid file. */
- RemoteDatafile::delete_link_file(tablename.m_name);
- RemoteDatafile::create_link_file(
- tablename.m_name, df_dict.filepath());
- }
-
- } else if (df_remote.is_open()) {
- if (dict_filepath_same_as_default) {
- dict_update_filepath(id, df_remote.filepath());
-
- } else if (path_in == NULL) {
- /* SYS_DATAFILES record for this space ID
- was not found. */
- dict_replace_tablespace_and_filepath(
- id, tablename.m_name,
- df_remote.filepath(), flags);
- }
-
- } else if (df_default.is_open()) {
- /* We opened the tablespace in the default location.
- SYS_DATAFILES.PATH needs to be updated if it is different
- from this default path or if the SYS_DATAFILES.PATH was not
- supplied and it should have been. Also update the dictionary
- if we found an ISL file (since !df_remote.is_open). Since
- path_in is not suppled for file-per-table, we must assume
- that it matched the ISL. */
- if ((path_in != NULL && !dict_filepath_same_as_default)
- || (path_in == NULL && DICT_TF_HAS_DATA_DIR(flags))
- || df_remote.filepath() != NULL) {
- dict_replace_tablespace_and_filepath(
- id, tablename.m_name, df_default.filepath(),
- flags);
- }
- }
-
skip_validate:
const byte* first_page =
df_default.is_open() ? df_default.get_first_page() :
- df_dict.is_open() ? df_dict.get_first_page() :
df_remote.get_first_page();
fil_space_crypt_t* crypt_data = first_page
@@ -2798,8 +2267,7 @@ skip_validate:
first_page)
: NULL;
- fil_space_t* space = fil_space_t::create(
- tablename.m_name, id, flags, purpose, crypt_data);
+ space = fil_space_t::create(id, flags, purpose, crypt_data);
if (!space) {
goto error;
}
@@ -2809,12 +2277,10 @@ skip_validate:
space->add(
df_remote.is_open() ? df_remote.filepath() :
- df_dict.is_open() ? df_dict.filepath() :
df_default.filepath(), OS_FILE_CLOSED, 0, false, true);
- if (validate && !srv_read_only_mode) {
+ if (must_validate && !srv_read_only_mode) {
df_remote.close();
- df_dict.close();
df_default.close();
if (space->acquire()) {
if (purpose != FIL_TYPE_IMPORT) {
@@ -2825,91 +2291,7 @@ skip_validate:
}
}
- if (err) *err = DB_SUCCESS;
- return space;
-}
-
-/** Looks for a pre-existing fil_space_t with the given tablespace ID
-and, if found, returns the name and filepath in newly allocated buffers
-that the caller must free.
-@param[in] space_id The tablespace ID to search for.
-@param[out] name Name of the tablespace found.
-@param[out] filepath The filepath of the first datafile for the
-tablespace.
-@return true if tablespace is found, false if not. */
-bool
-fil_space_read_name_and_filepath(
- ulint space_id,
- char** name,
- char** filepath)
-{
- bool success = false;
- *name = NULL;
- *filepath = NULL;
-
- mutex_enter(&fil_system.mutex);
-
- fil_space_t* space = fil_space_get_by_id(space_id);
-
- if (space != NULL) {
- *name = mem_strdup(space->name);
-
- fil_node_t* node = UT_LIST_GET_FIRST(space->chain);
- *filepath = mem_strdup(node->name);
-
- success = true;
- }
-
- mutex_exit(&fil_system.mutex);
-
- return(success);
-}
-
-/** Convert a file name to a tablespace name.
-@param[in] filename directory/databasename/tablename.ibd
-@return database/tablename string, to be freed with ut_free() */
-char*
-fil_path_to_space_name(
- const char* filename)
-{
- /* Strip the file name prefix and suffix, leaving
- only databasename/tablename. */
- ulint filename_len = strlen(filename);
- const char* end = filename + filename_len;
-#ifdef HAVE_MEMRCHR
- const char* tablename = 1 + static_cast<const char*>(
- memrchr(filename, OS_PATH_SEPARATOR,
- filename_len));
- const char* dbname = 1 + static_cast<const char*>(
- memrchr(filename, OS_PATH_SEPARATOR,
- tablename - filename - 1));
-#else /* HAVE_MEMRCHR */
- const char* tablename = filename;
- const char* dbname = NULL;
-
- while (const char* t = static_cast<const char*>(
- memchr(tablename, OS_PATH_SEPARATOR,
- ulint(end - tablename)))) {
- dbname = tablename;
- tablename = t + 1;
- }
-#endif /* HAVE_MEMRCHR */
-
- ut_ad(dbname != NULL);
- ut_ad(tablename > dbname);
- ut_ad(tablename < end);
- ut_ad(end - tablename > 4);
- ut_ad(memcmp(end - 4, DOT_IBD, 4) == 0);
-
- char* name = mem_strdupl(dbname, ulint(end - dbname) - 4);
-
- ut_ad(name[tablename - dbname - 1] == OS_PATH_SEPARATOR);
-#if OS_PATH_SEPARATOR != '/'
- /* space->name uses '/', not OS_PATH_SEPARATOR. */
- name[tablename - dbname - 1] = '/';
-#endif
-
- return(name);
+ goto func_exit;
}
/** Discover the correct IBD file to open given a remote or missing
@@ -2942,14 +2324,18 @@ fil_ibd_discover(
ulint sep_found = 0;
const char* db = basename;
for (; db > filename && sep_found < 2; db--) {
- if (db[0] == OS_PATH_SEPARATOR) {
+ switch (db[0]) {
+#ifdef _WIN32
+ case '\\':
+#endif
+ case '/':
sep_found++;
}
}
if (sep_found == 2) {
db += 2;
- df_def_per.init(db, 0);
- df_def_per.make_filepath(NULL, db, IBD);
+ df_def_per.init(0);
+ df_def_per.set_filepath(db);
if (df_def_per.open_read_only(false) == DB_SUCCESS
&& df_def_per.validate_for_recovery() == DB_SUCCESS
&& df_def_per.space_id() == space_id) {
@@ -2963,6 +2349,7 @@ fil_ibd_discover(
switch (srv_operation) {
case SRV_OPERATION_BACKUP:
case SRV_OPERATION_RESTORE_DELTA:
+ case SRV_OPERATION_BACKUP_NO_DEFER:
ut_ad(0);
break;
case SRV_OPERATION_RESTORE_EXPORT:
@@ -2970,8 +2357,13 @@ fil_ibd_discover(
break;
case SRV_OPERATION_NORMAL:
case SRV_OPERATION_EXPORT_RESTORED:
- df_rem_per.set_name(db);
- if (df_rem_per.open_link_file() != DB_SUCCESS) {
+ size_t len= strlen(db);
+ if (len <= 4 || strcmp(db + len - 4, dot_ext[IBD])) {
+ break;
+ }
+ df_rem_per.open_link_file({db, len - 4});
+
+ if (!df_rem_per.filepath()) {
break;
}
@@ -2994,7 +2386,7 @@ fil_ibd_discover(
}
/* Use this file if it has the space_id from the
- MLOG record. */
+ FILE_ record. */
if (df_rem_per.space_id() == space_id) {
df.set_filepath(df_rem_per.filepath());
df.open_read_only(false);
@@ -3041,9 +2433,9 @@ fil_ibd_load(
{
/* If the a space is already in the file system cache with this
space ID, then there is nothing to do. */
- mutex_enter(&fil_system.mutex);
+ mysql_mutex_lock(&fil_system.mutex);
space = fil_space_get_by_id(space_id);
- mutex_exit(&fil_system.mutex);
+ mysql_mutex_unlock(&fil_system.mutex);
if (space) {
/* Compare the filename we are trying to open with the
@@ -3066,9 +2458,20 @@ fil_ibd_load(
if (srv_operation == SRV_OPERATION_RESTORE) {
/* Replace absolute DATA DIRECTORY file paths with
short names relative to the backup directory. */
- if (const char* name = strrchr(filename, OS_PATH_SEPARATOR)) {
+ const char* name = strrchr(filename, '/');
+#ifdef _WIN32
+ if (const char *last = strrchr(filename, '\\')) {
+ if (last > name) {
+ name = last;
+ }
+ }
+#endif
+ if (name) {
while (--name > filename
- && *name != OS_PATH_SEPARATOR);
+#ifdef _WIN32
+ && *name != '\\'
+#endif
+ && *name != '/');
if (name > filename) {
filename = name + 1;
}
@@ -3088,15 +2491,23 @@ fil_ibd_load(
}
os_offset_t size;
+ bool deferred_space = false;
/* Read and validate the first page of the tablespace.
Assign a tablespace name based on the tablespace type. */
switch (file.validate_for_recovery()) {
os_offset_t minimum_size;
case DB_SUCCESS:
+ deferred_space = file.m_defer;
+
+ if (deferred_space) {
+ goto tablespace_check;
+ }
+
if (file.space_id() != space_id) {
return(FIL_LOAD_ID_CHANGED);
}
+tablespace_check:
/* Get and test the file size. */
size = os_file_get_size(file.handle());
@@ -3112,6 +2523,8 @@ fil_ibd_load(
ib::error() << "Could not measure the size of"
" single-table tablespace file '"
<< file.filepath() << "'";
+ } else if (deferred_space) {
+ return FIL_LOAD_DEFER;
} else if (size < minimum_size) {
ib::error() << "The size of tablespace file '"
<< file.filepath() << "' is only " << size
@@ -3154,7 +2567,7 @@ fil_ibd_load(
}
space = fil_space_t::create(
- file.name(), space_id, flags, FIL_TYPE_TABLESPACE, crypt_data);
+ space_id, flags, FIL_TYPE_TABLESPACE, crypt_data);
if (space == NULL) {
return(FIL_LOAD_INVALID);
@@ -3195,7 +2608,7 @@ void fsp_flags_try_adjust(fil_space_t* space, ulint flags)
if (buf_block_t* b = buf_page_get(
page_id_t(space->id, 0), space->zip_size(),
RW_X_LATCH, &mtr)) {
- uint32_t f = fsp_header_get_flags(b->frame);
+ uint32_t f = fsp_header_get_flags(b->page.frame);
if (fil_space_t::full_crc32(f)) {
goto func_exit;
}
@@ -3213,7 +2626,7 @@ void fsp_flags_try_adjust(fil_space_t* space, ulint flags)
mtr.set_named_space(space);
mtr.write<4,mtr_t::FORCED>(*b,
FSP_HEADER_OFFSET + FSP_SPACE_FLAGS
- + b->frame, flags);
+ + b->page.frame, flags);
}
func_exit:
mtr.commit();
@@ -3223,19 +2636,14 @@ func_exit:
memory cache. Note that if we have not done a crash recovery at the database
startup, there may be many tablespaces which are not yet in the memory cache.
@param[in] id Tablespace ID
-@param[in] name Tablespace name used in fil_space_t::create().
@param[in] table_flags table flags
@return the tablespace
@retval NULL if no matching tablespace exists in the memory cache */
-fil_space_t*
-fil_space_for_table_exists_in_mem(
- ulint id,
- const char* name,
- ulint table_flags)
+fil_space_t *fil_space_for_table_exists_in_mem(ulint id, ulint table_flags)
{
const ulint expected_flags = dict_tf_to_fsp_flags(table_flags);
- mutex_enter(&fil_system.mutex);
+ mysql_mutex_lock(&fil_system.mutex);
if (fil_space_t* space = fil_space_get_by_id(id)) {
ulint tf = expected_flags & ~FSP_FLAGS_MEM_MASK;
ulint sf = space->flags & ~FSP_FLAGS_MEM_MASK;
@@ -3245,22 +2653,11 @@ fil_space_for_table_exists_in_mem(
goto func_exit;
}
- if (strcmp(space->name, name)) {
- ib::error() << "Table " << name
- << " in InnoDB data dictionary"
- " has tablespace id " << id
- << ", but the tablespace"
- " with that id has name " << space->name << "."
- " Have you deleted or moved .ibd files?";
- ib::info() << TROUBLESHOOT_DATADICT_MSG;
- goto func_exit;
- }
-
/* Adjust the flags that are in FSP_FLAGS_MEM_MASK.
FSP_SPACE_FLAGS will not be written back here. */
space->flags = (space->flags & ~FSP_FLAGS_MEM_MASK)
| (expected_flags & FSP_FLAGS_MEM_MASK);
- mutex_exit(&fil_system.mutex);
+ mysql_mutex_unlock(&fil_system.mutex);
if (!srv_read_only_mode) {
fsp_flags_try_adjust(space, expected_flags
& ~FSP_FLAGS_MEM_MASK);
@@ -3269,7 +2666,7 @@ fil_space_for_table_exists_in_mem(
}
func_exit:
- mutex_exit(&fil_system.mutex);
+ mysql_mutex_unlock(&fil_system.mutex);
return NULL;
}
@@ -3277,35 +2674,35 @@ func_exit:
/** Report information about an invalid page access. */
ATTRIBUTE_COLD
-static void fil_invalid_page_access_msg(bool fatal, const char *name,
+static void fil_invalid_page_access_msg(const char *name,
os_offset_t offset, ulint len,
bool is_read)
{
- sql_print_error("%s%s %zu bytes at " UINT64PF
+ sql_print_error("%s %zu bytes at " UINT64PF
" outside the bounds of the file: %s",
- fatal ? "[FATAL] InnoDB: " : "InnoDB: ",
- is_read ? "Trying to read" : "Trying to write",
- len, offset, name);
- if (fatal)
+ is_read
+ ? "InnoDB: Trying to read"
+ : "[FATAL] InnoDB: Trying to write", len, offset, name);
+ if (!is_read)
abort();
}
/** Update the data structures on write completion */
inline void fil_node_t::complete_write()
{
- ut_ad(!mutex_own(&fil_system.mutex));
+ mysql_mutex_assert_not_owner(&fil_system.mutex);
if (space->purpose != FIL_TYPE_TEMPORARY &&
srv_file_flush_method != SRV_O_DIRECT_NO_FSYNC &&
space->set_needs_flush())
{
- mutex_enter(&fil_system.mutex);
+ mysql_mutex_lock(&fil_system.mutex);
if (!space->is_in_unflushed_spaces)
{
space->is_in_unflushed_spaces= true;
fil_system.unflushed_spaces.push_front(*space);
}
- mutex_exit(&fil_system.mutex);
+ mysql_mutex_unlock(&fil_system.mutex);
}
}
@@ -3335,15 +2732,22 @@ fil_io_t fil_space_t::io(const IORequest &type, os_offset_t offset, size_t len,
fil_node_t* node= UT_LIST_GET_FIRST(chain);
ut_ad(node);
+ ulint p = static_cast<ulint>(offset >> srv_page_size_shift);
+ dberr_t err;
- if (type.type == IORequest::READ_ASYNC && is_stopping()
- && !is_being_truncated) {
- release();
- return {DB_TABLESPACE_DELETED, nullptr};
+ if (type.type == IORequest::READ_ASYNC && is_stopping()) {
+ err = DB_TABLESPACE_DELETED;
+ node = nullptr;
+ goto release;
}
- ulint p = static_cast<ulint>(offset >> srv_page_size_shift);
- bool fatal;
+ DBUG_EXECUTE_IF("intermittent_recovery_failure",
+ if (type.is_read() && !(~get_rnd_value() & 0x3ff0))
+ goto io_error;);
+
+ DBUG_EXECUTE_IF("intermittent_read_failure",
+ if (srv_was_started && type.is_read() &&
+ !(~get_rnd_value() & 0x3ff0)) goto io_error;);
if (UNIV_LIKELY_NULL(UT_LIST_GET_NEXT(chain, node))) {
ut_ad(this == fil_system.sys_space
@@ -3354,17 +2758,20 @@ fil_io_t fil_space_t::io(const IORequest &type, os_offset_t offset, size_t len,
p -= node->size;
node = UT_LIST_GET_NEXT(chain, node);
if (!node) {
- if (type.type == IORequest::READ_ASYNC) {
- release();
- return {DB_ERROR, nullptr};
- }
-
- fatal = true;
fail:
- fil_invalid_page_access_msg(fatal, node->name,
- offset, len,
- type.is_read());
- return {DB_IO_ERROR, nullptr};
+ if (type.type != IORequest::READ_ASYNC) {
+ fil_invalid_page_access_msg(
+ node->name,
+ offset, len,
+ type.is_read());
+ }
+#ifndef DBUG_OFF
+io_error:
+#endif
+ set_corrupted();
+ err = DB_CORRUPTION;
+ node = nullptr;
+ goto release;
}
}
@@ -3372,52 +2779,38 @@ fail:
}
if (UNIV_UNLIKELY(node->size <= p)) {
- release();
-
- if (type.type == IORequest::READ_ASYNC) {
- /* If we can tolerate the non-existent pages, we
- should return with DB_ERROR and let caller decide
- what to do. */
- return {DB_ERROR, nullptr};
- }
-
- fatal = node->space->purpose != FIL_TYPE_IMPORT;
goto fail;
}
- dberr_t err;
-
if (type.type == IORequest::PUNCH_RANGE) {
err = os_file_punch_hole(node->handle, offset, len);
/* Punch hole is not supported, make space not to
support punch hole */
if (UNIV_UNLIKELY(err == DB_IO_NO_PUNCH_HOLE)) {
- punch_hole = false;
+ node->punch_hole = false;
err = DB_SUCCESS;
}
goto release_sync_write;
} else {
/* Queue the aio request */
- err = os_aio(IORequest(bpage, node, type.type),
+ err = os_aio(IORequest{bpage, type.slot, node, type.type},
buf, offset, len);
}
- /* We an try to recover the page from the double write buffer if
- the decompression fails or the page is corrupt. */
-
- ut_a(type.type == IORequest::DBLWR_RECOVER || err == DB_SUCCESS);
if (!type.is_async()) {
if (type.is_write()) {
release_sync_write:
node->complete_write();
release:
release();
+ goto func_exit;
}
ut_ad(fil_validate_skip());
}
if (err != DB_SUCCESS) {
goto release;
}
+func_exit:
return {err, node};
}
@@ -3454,13 +2847,18 @@ write_completed:
files and never issue asynchronous reads of change buffer pages. */
const page_id_t id(request.bpage->id());
- if (dberr_t err= buf_page_read_complete(request.bpage, *request.node))
+ if (dberr_t err= request.bpage->read_complete(*request.node))
{
if (recv_recovery_is_on() && !srv_force_recovery)
- recv_sys.found_corrupt_fs= true;
-
- ib::error() << "Failed to read page " << id.page_no()
- << " from file '" << request.node->name << "': " << err;
+ {
+ mysql_mutex_lock(&recv_sys.mutex);
+ recv_sys.set_corrupt_fs();
+ mysql_mutex_unlock(&recv_sys.mutex);
+ }
+
+ if (err != DB_FAIL)
+ ib::error() << "Failed to read page " << id.page_no()
+ << " from file '" << request.node->name << "': " << err;
}
}
@@ -3473,28 +2871,28 @@ void fil_flush_file_spaces()
{
if (srv_file_flush_method == SRV_O_DIRECT_NO_FSYNC)
{
- ut_d(mutex_enter(&fil_system.mutex));
+ ut_d(mysql_mutex_lock(&fil_system.mutex));
ut_ad(fil_system.unflushed_spaces.empty());
- ut_d(mutex_exit(&fil_system.mutex));
+ ut_d(mysql_mutex_unlock(&fil_system.mutex));
return;
}
rescan:
- mutex_enter(&fil_system.mutex);
+ mysql_mutex_lock(&fil_system.mutex);
for (fil_space_t &space : fil_system.unflushed_spaces)
{
if (space.needs_flush_not_stopping())
{
space.reacquire();
- mutex_exit(&fil_system.mutex);
+ mysql_mutex_unlock(&fil_system.mutex);
space.flush_low();
space.release();
goto rescan;
}
}
- mutex_exit(&fil_system.mutex);
+ mysql_mutex_unlock(&fil_system.mutex);
}
/** Functor to validate the file node list of a tablespace. */
@@ -3520,7 +2918,7 @@ struct Check {
@return number of open file nodes */
static ulint validate(const fil_space_t* space)
{
- ut_ad(mutex_own(&fil_system.mutex));
+ mysql_mutex_assert_owner(&fil_system.mutex);
Check check;
ut_list_validate(space->chain, check);
ut_a(space->size == check.size);
@@ -3549,17 +2947,15 @@ bool fil_validate()
{
ulint n_open = 0;
- mutex_enter(&fil_system.mutex);
+ mysql_mutex_lock(&fil_system.mutex);
- for (fil_space_t *space = UT_LIST_GET_FIRST(fil_system.space_list);
- space != NULL;
- space = UT_LIST_GET_NEXT(space_list, space)) {
- n_open += Check::validate(space);
+ for (fil_space_t &space : fil_system.space_list) {
+ n_open += Check::validate(&space);
}
ut_a(fil_system.n_open == n_open);
- mutex_exit(&fil_system.mutex);
+ mysql_mutex_unlock(&fil_system.mutex);
return(true);
}
@@ -3581,23 +2977,18 @@ fil_page_set_type(
Delete the tablespace file and any related files like .cfg.
This should not be called for temporary tables.
@param[in] ibd_filepath File path of the IBD tablespace */
-void
-fil_delete_file(
-/*============*/
- const char* ibd_filepath)
+void fil_delete_file(const char *ibd_filepath)
{
- /* Force a delete of any stale .ibd files that are lying around. */
+ ib::info() << "Deleting " << ibd_filepath;
+ os_file_delete_if_exists(innodb_data_file_key, ibd_filepath, nullptr);
- ib::info() << "Deleting " << ibd_filepath;
- os_file_delete_if_exists(innodb_data_file_key, ibd_filepath, NULL);
-
- char* cfg_filepath = fil_make_filepath(
- ibd_filepath, NULL, CFG, false);
- if (cfg_filepath != NULL) {
- os_file_delete_if_exists(
- innodb_data_file_key, cfg_filepath, NULL);
- ut_free(cfg_filepath);
- }
+ if (char *cfg_filepath= fil_make_filepath(ibd_filepath,
+ fil_space_t::name_type{}, CFG,
+ false))
+ {
+ os_file_delete_if_exists(innodb_data_file_key, cfg_filepath, nullptr);
+ ut_free(cfg_filepath);
+ }
}
#ifdef UNIV_DEBUG
@@ -3608,15 +2999,14 @@ void
fil_space_validate_for_mtr_commit(
const fil_space_t* space)
{
- ut_ad(!mutex_own(&fil_system.mutex));
+ mysql_mutex_assert_not_owner(&fil_system.mutex);
ut_ad(space != NULL);
ut_ad(space->purpose == FIL_TYPE_TABLESPACE);
ut_ad(!is_predefined_tablespace(space->id));
/* We are serving mtr_commit(). While there is an active
mini-transaction, we should have !space->stop_new_ops. This is
- guaranteed by meta-data locks or transactional locks, or
- dict_sys.latch (X-lock in DROP, S-lock in purge). */
+ guaranteed by meta-data locks or transactional locks. */
ut_ad(!space->is_stopping()
|| space->is_being_truncated /* fil_truncate_prepare() */
|| space->referenced());
@@ -3649,7 +3039,7 @@ fil_names_dirty(
ut_ad(space->max_lsn == 0);
ut_d(fil_space_validate_for_mtr_commit(space));
- UT_LIST_ADD_LAST(fil_system.named_spaces, space);
+ fil_system.named_spaces.push_back(*space);
space->max_lsn = log_sys.get_lsn();
}
@@ -3663,7 +3053,7 @@ void fil_names_dirty_and_write(fil_space_t* space)
ut_d(fil_space_validate_for_mtr_commit(space));
ut_ad(space->max_lsn == log_sys.get_lsn());
- UT_LIST_ADD_LAST(fil_system.named_spaces, space);
+ fil_system.named_spaces.push_back(*space);
mtr_t mtr;
mtr.start();
fil_names_write(space, &mtr);
@@ -3671,7 +3061,6 @@ void fil_names_dirty_and_write(fil_space_t* space)
DBUG_EXECUTE_IF("fil_names_write_bogus",
{
char bogus_name[] = "./test/bogus file.ibd";
- os_normalize_path(bogus_name);
fil_name_write(
SRV_SPACE_ID_UPPER_BOUND,
bogus_name, &mtr);
@@ -3699,27 +3088,27 @@ fil_names_clear(
mtr.start();
- for (fil_space_t* space = UT_LIST_GET_FIRST(fil_system.named_spaces);
- space != NULL; ) {
+ for (auto it = fil_system.named_spaces.begin();
+ it != fil_system.named_spaces.end(); ) {
if (mtr.get_log_size()
- + strlen(space->chain.start->name)
+ + strlen(it->chain.start->name)
>= RECV_SCAN_SIZE - (3 + 5 + 1)) {
/* Prevent log parse buffer overflow */
mtr.commit_files();
mtr.start();
}
- fil_space_t* next = UT_LIST_GET_NEXT(named_spaces, space);
+ auto next = std::next(it);
- ut_ad(space->max_lsn > 0);
- if (space->max_lsn < lsn) {
+ ut_ad(it->max_lsn > 0);
+ if (it->max_lsn < lsn) {
/* The tablespace was last dirtied before the
checkpoint LSN. Remove it from the list, so
that if the tablespace is not going to be
modified any more, subsequent checkpoints will
avoid calling fil_names_write() on it. */
- space->max_lsn = 0;
- UT_LIST_REMOVE(fil_system.named_spaces, space);
+ it->max_lsn = 0;
+ fil_system.named_spaces.erase(it);
}
/* max_lsn is the last LSN where fil_names_dirty_and_write()
@@ -3727,10 +3116,10 @@ fil_names_clear(
where max_lsn turned nonzero), we could avoid the
fil_names_write() call if min_lsn > lsn. */
- fil_names_write(space, &mtr);
+ fil_names_write(&*it, &mtr);
do_write = true;
- space = next;
+ it = next;
}
if (do_write) {
@@ -3793,9 +3182,7 @@ test_make_filepath()
@param[in] space tablespace
@param[in] offset page number
@return block size */
-UNIV_INTERN
-ulint
-fil_space_get_block_size(const fil_space_t* space, unsigned offset)
+ulint fil_space_get_block_size(const fil_space_t *space, unsigned offset)
{
ulint block_size = 512;
@@ -3818,3 +3205,82 @@ fil_space_get_block_size(const fil_space_t* space, unsigned offset)
return block_size;
}
+
+/** @return the tablespace name (databasename/tablename) */
+fil_space_t::name_type fil_space_t::name() const
+{
+ switch (id) {
+ case 0:
+ return name_type{"innodb_system", 13};
+ case SRV_TMP_SPACE_ID:
+ return name_type{"innodb_temporary", 16};
+ }
+
+ if (!UT_LIST_GET_FIRST(chain) || srv_is_undo_tablespace(id))
+ return name_type{};
+
+ ut_ad(purpose != FIL_TYPE_TEMPORARY);
+ ut_ad(UT_LIST_GET_LEN(chain) == 1);
+
+ const char *path= UT_LIST_GET_FIRST(chain)->name;
+ const char *sep= strchr(path, '/');
+ ut_ad(sep);
+
+ while (const char *next_sep= strchr(sep + 1, '/'))
+ path= sep + 1, sep= next_sep;
+
+#ifdef _WIN32
+ if (const char *last_sep= strchr(path, '\\'))
+ if (last_sep < sep)
+ path= last_sep;
+#endif
+
+ size_t len= strlen(path);
+ ut_ad(len > 4);
+ len-= 4;
+ ut_ad(!strcmp(&path[len], DOT_IBD));
+
+ return name_type{path, len};
+}
+
+#ifdef UNIV_DEBUG
+
+fil_space_t *fil_space_t::next_in_space_list()
+{
+ space_list_t::iterator it(this);
+ auto end= fil_system.space_list.end();
+ if (it == end)
+ return nullptr;
+ ++it;
+ return it == end ? nullptr : &*it;
+}
+
+fil_space_t *fil_space_t::prev_in_space_list()
+{
+ space_list_t::iterator it(this);
+ if (it == fil_system.space_list.begin())
+ return nullptr;
+ --it;
+ return &*it;
+}
+
+fil_space_t *fil_space_t::next_in_unflushed_spaces()
+{
+ sized_ilist<fil_space_t, unflushed_spaces_tag_t>::iterator it(this);
+ auto end= fil_system.unflushed_spaces.end();
+ if (it == end)
+ return nullptr;
+ ++it;
+ return it == end ? nullptr : &*it;
+}
+
+fil_space_t *fil_space_t::prev_in_unflushed_spaces()
+{
+ sized_ilist<fil_space_t, unflushed_spaces_tag_t>::iterator it(this);
+ if (it == fil_system.unflushed_spaces.begin())
+ return nullptr;
+ --it;
+ return &*it;
+}
+
+#endif
diff --git a/storage/innobase/fil/fil0pagecompress.cc b/storage/innobase/fil/fil0pagecompress.cc
index 25b039aa9f1..9961bdf056c 100644
--- a/storage/innobase/fil/fil0pagecompress.cc
+++ b/storage/innobase/fil/fil0pagecompress.cc
@@ -48,7 +48,6 @@ Updated 14/02/2015
#include "row0mysql.h"
#include "buf0lru.h"
#include "ibuf0ibuf.h"
-#include "sync0sync.h"
#include "zlib.h"
#ifdef __linux__
#include <linux/fs.h>
diff --git a/storage/innobase/fsp/fsp0file.cc b/storage/innobase/fsp/fsp0file.cc
index f131e4e90da..c38c506cc08 100644
--- a/storage/innobase/fsp/fsp0file.cc
+++ b/storage/innobase/fsp/fsp0file.cc
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 2013, 2016, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2017, 2020, MariaDB Corporation.
+Copyright (c) 2017, 2022, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -31,29 +31,12 @@ Created 2013-7-26 by Kevin Lewis
#include "srv0start.h"
#include "log.h"
-/** Initialize the name, size and order of this datafile
-@param[in] name tablespace name, will be copied
-@param[in] flags tablespace flags */
-void
-Datafile::init(
- const char* name,
- ulint flags)
-{
- ut_ad(m_name == NULL);
- ut_ad(name != NULL);
-
- m_name = mem_strdup(name);
- m_flags = flags;
-}
-
/** Release the resources. */
void
Datafile::shutdown()
{
close();
- ut_free(m_name);
- m_name = NULL;
free_filepath();
free_first_page();
}
@@ -120,13 +103,12 @@ Datafile::open_read_only(bool strict)
/** Open a data file in read-write mode during start-up so that
doublewrite pages can be restored and then it can be validated.*
-@param[in] read_only_mode if true, then readonly mode checks are enforced.
@return DB_SUCCESS or error code */
-dberr_t
-Datafile::open_read_write(bool read_only_mode)
+inline dberr_t Datafile::open_read_write()
{
bool success = false;
ut_ad(m_handle == OS_FILE_CLOSED);
+ ut_ad(!srv_read_only_mode);
/* This function can be called for file objects that do not need
to be opened, which is the case when the m_filepath is NULL */
@@ -137,7 +119,7 @@ Datafile::open_read_write(bool read_only_mode)
set_open_flags(OS_FILE_OPEN);
m_handle = os_file_create_simple_no_error_handling(
innodb_data_file_key, m_filepath, m_open_flags,
- OS_FILE_READ_WRITE, read_only_mode, &success);
+ OS_FILE_READ_WRITE, false, &success);
if (!success) {
m_last_os_error = os_file_get_last_error(true);
@@ -183,24 +165,17 @@ Datafile::close()
Prepend the dirpath to filename using the extension given.
If dirpath is NULL, prepend the default datadir to filepath.
Store the result in m_filepath.
-@param[in] dirpath directory path
-@param[in] filename filename or filepath
-@param[in] ext filename extension */
-void
-Datafile::make_filepath(
- const char* dirpath,
- const char* filename,
- ib_extention ext)
+@param dirpath directory path
+@param name tablespace (table) name
+@param ext filename extension */
+void Datafile::make_filepath(const char *dirpath, fil_space_t::name_type name,
+ ib_extention ext)
{
- ut_ad(dirpath != NULL || filename != NULL);
-
- free_filepath();
-
- m_filepath = fil_make_filepath(dirpath, filename, ext, false);
-
- ut_ad(m_filepath != NULL);
-
- set_filename();
+ ut_ad(dirpath || name.size());
+ free_filepath();
+ m_filepath= fil_make_filepath(dirpath, name, ext, false);
+ ut_ad(m_filepath);
+ set_filename();
}
/** Set the filepath by duplicating the filepath sent in. This is the
@@ -259,23 +234,6 @@ Datafile::same_as(
#endif /* WIN32 */
}
-/** Allocate and set the datafile or tablespace name in m_name.
-If a name is provided, use it; else extract a file-per-table
-tablespace name from m_filepath. The value of m_name
-will be freed in the destructor.
-@param[in] name tablespace name if known, NULL if not */
-void
-Datafile::set_name(const char* name)
-{
- ut_free(m_name);
-
- if (name != NULL) {
- m_name = mem_strdup(name);
- } else {
- m_name = fil_path_to_space_name(m_filepath);
- }
-}
-
/** Reads a few significant fields from the first page of the first
datafile. The Datafile must already be open.
@param[in] read_only_mode If true, then readonly mode checks are enforced.
@@ -306,28 +264,24 @@ Datafile::read_first_page(bool read_only_mode)
ulint n_read = 0;
- err = os_file_read_no_error_handling(
+ err = os_file_read(
IORequestReadPartial, m_handle, m_first_page, 0,
page_size, &n_read);
- if (err == DB_IO_ERROR && n_read >= UNIV_PAGE_SIZE_MIN) {
-
- page_size >>= 1;
-
- } else if (err == DB_SUCCESS) {
-
- ut_a(n_read == page_size);
-
+ if (err == DB_SUCCESS) {
break;
+ }
+ if (err == DB_IO_ERROR && n_read == 0) {
+ break;
+ }
+ if (err == DB_IO_ERROR && n_read >= UNIV_PAGE_SIZE_MIN) {
+ page_size >>= 1;
} else if (srv_operation == SRV_OPERATION_BACKUP) {
break;
} else {
-
- ib::error()
- << "Cannot read first page of '"
- << m_filepath << "' "
- << err;
+ ib::info() << "Cannot read first page of '"
+ << m_filepath << "': " << err;
break;
}
}
@@ -459,41 +413,53 @@ Datafile::validate_for_recovery()
err = validate_first_page(0);
switch (err) {
- case DB_SUCCESS:
case DB_TABLESPACE_EXISTS:
break;
-
+ case DB_SUCCESS:
+ if (!m_defer || !m_space_id) {
+ break;
+ }
+ /* InnoDB should check whether the deferred
+ tablespace page0 can be recovered from
+ double write buffer. InnoDB should try
+ to recover only if m_space_id exists because
+ dblwr pages can be searched via {space_id, 0}.
+ m_space_id is set in read_first_page(). */
+ /* fall through */
default:
/* Re-open the file in read-write mode Attempt to restore
page 0 from doublewrite and read the space ID from a survey
of the first few pages. */
close();
- err = open_read_write(srv_read_only_mode);
+ err = open_read_write();
if (err != DB_SUCCESS) {
return(err);
}
- err = find_space_id();
- if (err != DB_SUCCESS || m_space_id == 0) {
- ib::error() << "Datafile '" << m_filepath << "' is"
- " corrupted. Cannot determine the space ID from"
- " the first 64 pages.";
- return(err);
+ if (!m_defer) {
+ err = find_space_id();
+ if (err != DB_SUCCESS || m_space_id == 0) {
+ ib::error() << "Datafile '" << m_filepath
+ << "' is corrupted. Cannot determine "
+ "the space ID from the first 64 pages.";
+ return(err);
+ }
+ }
+
+ if (m_space_id == ULINT_UNDEFINED) {
+ return DB_SUCCESS; /* empty file */
}
if (restore_from_doublewrite()) {
- return(DB_CORRUPTION);
+ return m_defer ? err : DB_CORRUPTION;
}
/* Free the previously read first page and then re-validate. */
free_first_page();
+ m_defer = false;
err = validate_first_page(0);
}
- if (err == DB_SUCCESS) {
- set_name(NULL);
- }
-
return(err);
}
@@ -505,11 +471,8 @@ m_is_valid is set true on success, else false.
@retval DB_SUCCESS on if the datafile is valid
@retval DB_CORRUPTION if the datafile is not readable
@retval DB_TABLESPACE_EXISTS if there is a duplicate space_id */
-dberr_t
-Datafile::validate_first_page(lsn_t* flush_lsn)
+dberr_t Datafile::validate_first_page(lsn_t *flush_lsn)
{
- char* prev_name;
- char* prev_filepath;
const char* error_txt = NULL;
m_is_valid = true;
@@ -530,11 +493,18 @@ Datafile::validate_first_page(lsn_t* flush_lsn)
if (error_txt != NULL) {
err_exit:
+ free_first_page();
+
+ if (recv_recovery_is_on()
+ || srv_operation == SRV_OPERATION_BACKUP) {
+ m_defer= true;
+ return DB_SUCCESS;
+ }
+
ib::info() << error_txt << " in datafile: " << m_filepath
<< ", Space ID:" << m_space_id << ", Flags: "
<< m_flags;
m_is_valid = false;
- free_first_page();
return(DB_CORRUPTION);
}
@@ -563,13 +533,18 @@ err_exit:
ulint logical_size = fil_space_t::logical_size(m_flags);
if (srv_page_size != logical_size) {
+ free_first_page();
+ if (recv_recovery_is_on()
+ || srv_operation == SRV_OPERATION_BACKUP) {
+ m_defer= true;
+ return DB_SUCCESS;
+ }
/* Logical size must be innodb_page_size. */
ib::error()
<< "Data file '" << m_filepath << "' uses page size "
<< logical_size << ", but the innodb_page_size"
" start-up parameter is "
<< srv_page_size;
- free_first_page();
return(DB_ERROR);
}
@@ -590,26 +565,38 @@ err_exit:
goto err_exit;
}
- if (fil_space_read_name_and_filepath(
- m_space_id, &prev_name, &prev_filepath)) {
+ mysql_mutex_lock(&fil_system.mutex);
- if (0 == strcmp(m_filepath, prev_filepath)) {
- ut_free(prev_name);
- ut_free(prev_filepath);
- return(DB_SUCCESS);
+ fil_space_t* space = fil_space_get_by_id(m_space_id);
+
+ if (space) {
+ fil_node_t* node = UT_LIST_GET_FIRST(space->chain);
+
+ if (node && !strcmp(m_filepath, node->name)) {
+ok_exit:
+ mysql_mutex_unlock(&fil_system.mutex);
+ return DB_SUCCESS;
+ }
+
+ if (!m_space_id
+ && (recv_recovery_is_on()
+ || srv_operation == SRV_OPERATION_BACKUP)) {
+ m_defer= true;
+ goto ok_exit;
}
/* Make sure the space_id has not already been opened. */
ib::error() << "Attempted to open a previously opened"
- " tablespace. Previous tablespace " << prev_name
- << " at filepath: " << prev_filepath
- << " uses space ID: " << m_space_id
- << ". Cannot open filepath: " << m_filepath
- << " which uses the same space ID.";
+ " tablespace. Previous tablespace: "
+ << (node ? node->name : "(unknown)")
+ << " uses space ID: " << m_space_id
+ << ". Cannot open filepath: " << m_filepath
+ << " which uses the same space ID.";
+ }
- ut_free(prev_name);
- ut_free(prev_filepath);
+ mysql_mutex_unlock(&fil_system.mutex);
+ if (space) {
m_is_valid = false;
free_first_page();
@@ -634,6 +621,10 @@ Datafile::find_space_id()
file_size = os_file_get_size(m_handle);
+ if (!file_size) {
+ return DB_SUCCESS;
+ }
+
if (file_size == (os_offset_t) -1) {
ib::error() << "Could not get file size of datafile '"
<< m_filepath << "'";
@@ -686,7 +677,7 @@ Datafile::find_space_id()
for (ulint j = 0; j < page_count; ++j) {
if (os_file_read(IORequestRead, m_handle, page,
- j * page_size, page_size)) {
+ j * page_size, page_size, nullptr)) {
ib::info()
<< "READ FAIL: page_no:" << j;
continue;
@@ -792,7 +783,6 @@ Datafile::restore_from_doublewrite()
<< "Corrupted page " << page_id
<< " of datafile '" << m_filepath
<< "' could not be found in the doublewrite buffer.";
-
return(true);
}
@@ -823,68 +813,63 @@ Datafile::restore_from_doublewrite()
!= DB_SUCCESS);
}
-/** Create a link filename based on the contents of m_name,
-open that file, and read the contents into m_filepath.
-@retval DB_SUCCESS if remote linked tablespace file is opened and read.
-@retval DB_CANNOT_OPEN_FILE if the link file does not exist. */
-dberr_t
-RemoteDatafile::open_link_file()
+/** Read an InnoDB Symbolic Link (ISL) file by name.
+@param link_filepath filepath of the ISL file
+@return data file name (must be freed by the caller)
+@retval nullptr on error */
+static char *read_link_file(const char *link_filepath)
{
- if (m_link_filepath == NULL) {
- m_link_filepath = fil_make_filepath(NULL, name(), ISL, false);
- }
-
- m_filepath = read_link_file(m_link_filepath);
-
- return(m_filepath == NULL ? DB_CANNOT_OPEN_FILE : DB_SUCCESS);
+ if (FILE* file= fopen(link_filepath, "r+b" STR_O_CLOEXEC))
+ {
+ char *filepath= static_cast<char*>(ut_malloc_nokey(OS_FILE_MAX_PATH));
+
+ os_file_read_string(file, filepath, OS_FILE_MAX_PATH);
+ fclose(file);
+
+ if (size_t len= strlen(filepath))
+ {
+ /* Trim whitespace from end of filepath */
+ len--;
+ while (static_cast<byte>(filepath[len]) <= 0x20)
+ {
+ if (!len)
+ return nullptr;
+ filepath[len--]= 0;
+ }
+ /* Ensure that the last 2 path separators are forward slashes,
+ because elsewhere we are assuming that tablespace file names end
+ in "/databasename/tablename.ibd". */
+ unsigned trailing_slashes= 0;
+ for (; len; len--)
+ {
+ switch (filepath[len]) {
+#ifdef _WIN32
+ case '\\':
+ filepath[len]= '/';
+ /* fall through */
+#endif
+ case '/':
+ if (++trailing_slashes >= 2)
+ return filepath;
+ }
+ }
+ }
+ }
+
+ return nullptr;
}
-/** Opens a handle to the file linked to in an InnoDB Symbolic Link file
-in read-only mode so that it can be validated.
-@param[in] strict whether to issue error messages
-@return DB_SUCCESS if remote linked tablespace file is found and opened. */
-dberr_t
-RemoteDatafile::open_read_only(bool strict)
+/** Create a link filename,
+open that file, and read the contents into m_filepath.
+@param name table name
+@return filepath()
+@retval nullptr if the .isl file does not exist or cannot be read */
+const char *RemoteDatafile::open_link_file(const fil_space_t::name_type name)
{
- if (m_filepath == NULL && open_link_file() == DB_CANNOT_OPEN_FILE) {
- return(DB_ERROR);
- }
-
- dberr_t err = Datafile::open_read_only(strict);
-
- if (err != DB_SUCCESS && strict) {
- /* The following call prints an error message */
- os_file_get_last_error(true);
- ib::error() << "A link file was found named '"
- << m_link_filepath << "' but the linked tablespace '"
- << m_filepath << "' could not be opened read-only.";
- }
-
- return(err);
-}
-
-/** Opens a handle to the file linked to in an InnoDB Symbolic Link file
-in read-write mode so that it can be restored from doublewrite and validated.
-@param[in] read_only_mode If true, then readonly mode checks are enforced.
-@return DB_SUCCESS if remote linked tablespace file is found and opened. */
-dberr_t
-RemoteDatafile::open_read_write(bool read_only_mode)
-{
- if (m_filepath == NULL && open_link_file() == DB_CANNOT_OPEN_FILE) {
- return(DB_ERROR);
- }
-
- dberr_t err = Datafile::open_read_write(read_only_mode);
-
- if (err != DB_SUCCESS) {
- /* The following call prints an error message */
- m_last_os_error = os_file_get_last_error(true);
- ib::error() << "A link file was found named '"
- << m_link_filepath << "' but the linked data file '"
- << m_filepath << "' could not be opened for writing.";
- }
-
- return(err);
+ if (!m_link_filepath)
+ m_link_filepath= fil_make_filepath(nullptr, name, ISL, false);
+ m_filepath= read_link_file(m_link_filepath);
+ return m_filepath;
}
/** Release the resources. */
@@ -899,16 +884,12 @@ RemoteDatafile::shutdown()
}
}
-/** Creates a new InnoDB Symbolic Link (ISL) file. It is always created
-under the 'datadir' of MySQL. The datadir is the directory of a
-running mysqld program. We can refer to it by simply using the path ".".
-@param[in] name tablespace name
-@param[in] filepath remote filepath of tablespace datafile
+/** Create InnoDB Symbolic Link (ISL) file.
+@param name tablespace name
+@param filepath full file name
@return DB_SUCCESS or error code */
-dberr_t
-RemoteDatafile::create_link_file(
- const char* name,
- const char* filepath)
+dberr_t RemoteDatafile::create_link_file(fil_space_t::name_type name,
+ const char *filepath)
{
bool success;
dberr_t err = DB_SUCCESS;
@@ -916,7 +897,6 @@ RemoteDatafile::create_link_file(
char* prev_filepath = NULL;
ut_ad(!srv_read_only_mode);
- ut_ad(0 == strcmp(&filepath[strlen(filepath) - 4], DOT_IBD));
link_filepath = fil_make_filepath(NULL, name, ISL, false);
@@ -929,7 +909,8 @@ RemoteDatafile::create_link_file(
/* Truncate (starting with MySQL 5.6, probably no
longer since MariaDB Server 10.2.19) used to call this
with an existing link file which contains the same filepath. */
- bool same = !strcmp(prev_filepath, filepath);
+ bool same = !strncmp(prev_filepath, name.data(), name.size())
+ && !strcmp(prev_filepath + name.size(), DOT_IBD);
ut_free(prev_filepath);
if (same) {
ut_free(link_filepath);
@@ -977,9 +958,8 @@ RemoteDatafile::create_link_file(
return(err);
}
- ulint rbytes = fwrite(filepath, 1, strlen(filepath), file);
-
- if (rbytes != strlen(filepath)) {
+ const size_t len = strlen(filepath);
+ if (fwrite(filepath, 1, len, file) != len) {
error = os_file_get_last_error(true);
ib::error() <<
"Cannot write link file: "
@@ -1008,50 +988,12 @@ RemoteDatafile::delete_link_file(void)
}
/** Delete an InnoDB Symbolic Link (ISL) file by name.
-@param[in] name tablespace name */
-void
-RemoteDatafile::delete_link_file(
- const char* name)
-{
- char* link_filepath = fil_make_filepath(NULL, name, ISL, false);
-
- if (link_filepath != NULL) {
- os_file_delete_if_exists(
- innodb_data_file_key, link_filepath, NULL);
-
- ut_free(link_filepath);
- }
-}
-
-/** Read an InnoDB Symbolic Link (ISL) file by name.
-It is always created under the datadir of MySQL.
-For file-per-table tablespaces, the isl file is expected to be
-in a 'database' directory and called 'tablename.isl'.
-The caller must free the memory returned if it is not null.
-@param[in] link_filepath filepath of the ISL file
-@return Filepath of the IBD file read from the ISL file */
-char*
-RemoteDatafile::read_link_file(
- const char* link_filepath)
+@param name tablespace name */
+void RemoteDatafile::delete_link_file(fil_space_t::name_type name)
{
- FILE* file = fopen(link_filepath, "r+b" STR_O_CLOEXEC);
- if (file == NULL) {
- return(NULL);
- }
-
- char* filepath = static_cast<char*>(ut_malloc_nokey(OS_FILE_MAX_PATH));
-
- os_file_read_string(file, filepath, OS_FILE_MAX_PATH);
- fclose(file);
-
- if (filepath[0] != '\0') {
- /* Trim whitespace from end of filepath */
- ulint last_ch = strlen(filepath) - 1;
- while (last_ch > 4 && filepath[last_ch] <= 0x20) {
- filepath[last_ch--] = 0x00;
- }
- os_normalize_path(filepath);
- }
-
- return(filepath);
+ if (char *link_filepath= fil_make_filepath(NULL, name, ISL, false))
+ {
+ os_file_delete_if_exists(innodb_data_file_key, link_filepath, nullptr);
+ ut_free(link_filepath);
+ }
}
diff --git a/storage/innobase/fsp/fsp0fsp.cc b/storage/innobase/fsp/fsp0fsp.cc
index 7532ce85b6b..514083d35cc 100644
--- a/storage/innobase/fsp/fsp0fsp.cc
+++ b/storage/innobase/fsp/fsp0fsp.cc
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2017, 2022, MariaDB Corporation.
+Copyright (c) 2017, 2023, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -31,7 +31,6 @@ Created 11/29/1995 Heikki Tuuri
#include "mtr0log.h"
#include "ut0byte.h"
#include "page0page.h"
-#include "fut0fut.h"
#include "srv0srv.h"
#include "srv0start.h"
#include "ibuf0ibuf.h"
@@ -41,30 +40,17 @@ Created 11/29/1995 Heikki Tuuri
#include "log0log.h"
#include "dict0mem.h"
#include "fsp0types.h"
-
-// JAN: MySQL 5.7 Encryption
-// #include <my_aes.h>
+#include "log.h"
typedef uint32_t page_no_t;
-/** Return an extent to the free list of a space.
-@param[in,out] space tablespace
-@param[in] offset page number in the extent
-@param[in,out] mtr mini-transaction */
-MY_ATTRIBUTE((nonnull))
-static
-void
-fsp_free_extent(
- fil_space_t* space,
- page_no_t offset,
- mtr_t* mtr);
-
/** Returns the first extent descriptor for a segment.
We think of the extent lists of the segment catenated in the order
FSEG_FULL -> FSEG_NOT_FULL -> FSEG_FREE.
@param[in] inode segment inode
@param[in] space tablespace
@param[in,out] mtr mini-transaction
+@param[out] err error code
@return the first extent descriptor, or NULL if none */
MY_ATTRIBUTE((nonnull, warn_unused_result))
static
@@ -72,8 +58,10 @@ xdes_t*
fseg_get_first_extent(
fseg_inode_t* inode,
const fil_space_t* space,
- mtr_t* mtr);
+ mtr_t* mtr,
+ dberr_t* err);
+ATTRIBUTE_COLD MY_ATTRIBUTE((nonnull, warn_unused_result))
/** Put new extents to the free list if there are free extents above the free
limit. If an extent happens to contain an extent descriptor page, the extent
is put to the FSP_FREE_FRAG list with the page marked as used.
@@ -83,8 +71,8 @@ then we will not allocate more extents
@param[in,out] space tablespace
@param[in,out] header tablespace header
@param[in,out] mtr mini-transaction */
-static ATTRIBUTE_COLD
-void
+static
+dberr_t
fsp_fill_free_list(
bool init_space,
fil_space_t* space,
@@ -104,7 +92,9 @@ direction they go alphabetically: FSP_DOWN, FSP_UP, FSP_NO_DIR
@param[in,out] mtr mini-transaction
@param[in,out] init_mtr mtr or another mini-transaction in
which the page should be initialized.
-@retval NULL if no page could be allocated */
+@param[out] err error code
+@return the allocated page
+@retval nullptr if no page could be allocated */
static
buf_block_t*
fseg_alloc_free_page_low(
@@ -118,21 +108,38 @@ fseg_alloc_free_page_low(
/*!< whether the space has already been reserved */
#endif /* UNIV_DEBUG */
mtr_t* mtr,
- mtr_t* init_mtr)
- MY_ATTRIBUTE((warn_unused_result));
+ mtr_t* init_mtr,
+ dberr_t* err)
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+MY_ATTRIBUTE((nonnull, warn_unused_result))
/** Get the tablespace header block, SX-latched
@param[in] space tablespace
@param[in,out] mtr mini-transaction
-@return pointer to the space header, page x-locked */
-inline buf_block_t *fsp_get_header(const fil_space_t *space, mtr_t *mtr)
+@param[out] err error code
+@return pointer to the space header, page x-locked
+@retval nullptr if the page cannot be retrieved or is corrupted */
+static buf_block_t *fsp_get_header(const fil_space_t *space, mtr_t *mtr,
+ dberr_t *err)
{
- buf_block_t *block= buf_page_get(page_id_t(space->id, 0), space->zip_size(),
- RW_SX_LATCH, mtr);
- buf_block_dbg_add_level(block, SYNC_FSP_PAGE);
- ut_ad(space->id == mach_read_from_4(FSP_HEADER_OFFSET + FSP_SPACE_ID +
- block->frame));
- return block;
+ const page_id_t id{space->id, 0};
+ buf_block_t *block= mtr->get_already_latched(id, MTR_MEMO_PAGE_SX_FIX);
+ if (block)
+ *err= DB_SUCCESS;
+ else
+ {
+ block= buf_page_get_gen(id, space->zip_size(), RW_SX_LATCH,
+ nullptr, BUF_GET_POSSIBLY_FREED,
+ mtr, err);
+ if (block &&
+ space->id != mach_read_from_4(FSP_HEADER_OFFSET + FSP_SPACE_ID +
+ block->page.frame))
+ {
+ *err= DB_CORRUPTION;
+ block= nullptr;
+ }
+ }
+ return block;
}
/** Set the XDES_FREE_BIT of a page.
@@ -148,7 +155,7 @@ inline void xdes_set_free(const buf_block_t &block, xdes_t *descr,
ut_ad(mtr->memo_contains_flagged(&block, MTR_MEMO_PAGE_SX_FIX |
MTR_MEMO_PAGE_X_FIX));
ut_ad(offset < FSP_EXTENT_SIZE);
- ut_ad(page_align(descr) == block.frame);
+ ut_ad(page_align(descr) == block.page.frame);
compile_time_assert(XDES_BITS_PER_PAGE == 2);
compile_time_assert(XDES_FREE_BIT == 0);
compile_time_assert(XDES_CLEAN_BIT == 1);
@@ -213,14 +220,14 @@ inline bool xdes_is_full(const xdes_t *descr)
@param[in] state the state
@param[in,out] mtr mini-transaction */
inline void xdes_set_state(const buf_block_t &block, xdes_t *descr,
- byte state, mtr_t *mtr)
+ byte state, mtr_t *mtr)
{
ut_ad(descr && mtr);
ut_ad(state >= XDES_FREE);
ut_ad(state <= XDES_FSEG);
ut_ad(mtr->memo_contains_flagged(&block, MTR_MEMO_PAGE_SX_FIX |
MTR_MEMO_PAGE_X_FIX));
- ut_ad(page_align(descr) == block.frame);
+ ut_ad(page_align(descr) == block.page.frame);
ut_ad(mach_read_from_4(descr + XDES_STATE) <= XDES_FSEG);
mtr->write<1>(block, XDES_STATE + 3 + descr, state);
}
@@ -248,7 +255,7 @@ inline void xdes_init(const buf_block_t &block, xdes_t *descr, mtr_t *mtr)
{
ut_ad(mtr->memo_contains_flagged(&block, MTR_MEMO_PAGE_SX_FIX |
MTR_MEMO_PAGE_X_FIX));
- mtr->memset(&block, uint16_t(descr - block.frame) + XDES_BITMAP,
+ mtr->memset(&block, uint16_t(descr - block.page.frame) + XDES_BITMAP,
XDES_SIZE - XDES_BITMAP, 0xff);
xdes_set_state(block, descr, XDES_FREE, mtr);
}
@@ -259,29 +266,34 @@ inline void xdes_init(const buf_block_t &block, xdes_t *descr, mtr_t *mtr)
@param[in] page page number
@param[in,out] descr extent descriptor
@param[in,out] xdes extent descriptor page
-@param[in,out] mtr mini-transaction */
-static MY_ATTRIBUTE((nonnull))
-void
+@param[in,out] mtr mini-transaction
+@return error code */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+dberr_t
fseg_mark_page_used(fseg_inode_t *seg_inode, buf_block_t *iblock,
ulint page, xdes_t *descr, buf_block_t *xdes, mtr_t *mtr)
{
- ut_ad(fil_page_get_type(iblock->frame) == FIL_PAGE_INODE);
+ ut_ad(fil_page_get_type(iblock->page.frame) == FIL_PAGE_INODE);
ut_ad(!((page_offset(seg_inode) - FSEG_ARR_OFFSET) % FSEG_INODE_SIZE));
- ut_ad(mach_read_from_4(seg_inode + FSEG_MAGIC_N) == FSEG_MAGIC_N_VALUE);
+ ut_ad(!memcmp(FSEG_MAGIC_N_BYTES, FSEG_MAGIC_N + seg_inode, 4));
ut_ad(!memcmp(seg_inode + FSEG_ID, descr + XDES_ID, 4));
- const uint16_t xoffset= uint16_t(descr - xdes->frame + XDES_FLST_NODE);
- const uint16_t ioffset= uint16_t(seg_inode - iblock->frame);
+ const uint16_t xoffset= uint16_t(descr - xdes->page.frame + XDES_FLST_NODE);
+ const uint16_t ioffset= uint16_t(seg_inode - iblock->page.frame);
if (!xdes_get_n_used(descr))
{
/* We move the extent from the free list to the NOT_FULL list */
- flst_remove(iblock, uint16_t(FSEG_FREE + ioffset), xdes, xoffset, mtr);
- flst_add_last(iblock, uint16_t(FSEG_NOT_FULL + ioffset),
- xdes, xoffset, mtr);
+ if (dberr_t err= flst_remove(iblock, uint16_t(FSEG_FREE + ioffset),
+ xdes, xoffset, mtr))
+ return err;
+ if (dberr_t err= flst_add_last(iblock, uint16_t(FSEG_NOT_FULL + ioffset),
+ xdes, xoffset, mtr))
+ return err;
}
- ut_ad(xdes_is_free(descr, page % FSP_EXTENT_SIZE));
+ if (UNIV_UNLIKELY(!xdes_is_free(descr, page % FSP_EXTENT_SIZE)))
+ return DB_CORRUPTION;
/* We mark the page as used */
xdes_set_free<false>(*xdes, descr, page % FSP_EXTENT_SIZE, mtr);
@@ -292,19 +304,26 @@ fseg_mark_page_used(fseg_inode_t *seg_inode, buf_block_t *iblock,
if (xdes_is_full(descr))
{
/* We move the extent from the NOT_FULL list to the FULL list */
- flst_remove(iblock, uint16_t(FSEG_NOT_FULL + ioffset), xdes, xoffset, mtr);
- flst_add_last(iblock, uint16_t(FSEG_FULL + ioffset), xdes, xoffset, mtr);
+ if (dberr_t err= flst_remove(iblock, uint16_t(FSEG_NOT_FULL + ioffset),
+ xdes, xoffset, mtr))
+ return err;
+ if (dberr_t err= flst_add_last(iblock, uint16_t(FSEG_FULL + ioffset),
+ xdes, xoffset, mtr))
+ return err;
mtr->write<4>(*iblock, seg_inode + FSEG_NOT_FULL_N_USED,
not_full_n_used - FSP_EXTENT_SIZE);
}
+
+ return DB_SUCCESS;
}
/** Get pointer to a the extent descriptor of a page.
@param[in,out] sp_header tablespace header page, x-latched
@param[in] space tablespace
@param[in] offset page offset
-@param[out] desc_block descriptor block
@param[in,out] mtr mini-transaction
+@param[out] err error code
+@param[out] desc_block descriptor block
@param[in] init_space whether the tablespace is being initialized
@return pointer to the extent descriptor, NULL if the page does not
exist in the space or if the offset exceeds free limit */
@@ -314,18 +333,19 @@ xdes_get_descriptor_with_space_hdr(
buf_block_t* header,
const fil_space_t* space,
page_no_t offset,
- buf_block_t** desc_block,
mtr_t* mtr,
+ dberr_t* err = nullptr,
+ buf_block_t** desc_block = nullptr,
bool init_space = false)
{
- ut_ad(mtr->memo_contains(*space));
+ ut_ad(space->is_owner());
ut_ad(mtr->memo_contains_flagged(header, MTR_MEMO_PAGE_SX_FIX
| MTR_MEMO_PAGE_X_FIX));
/* Read free limit and space size */
uint32_t limit = mach_read_from_4(FSP_HEADER_OFFSET + FSP_FREE_LIMIT
- + header->frame);
+ + header->page.frame);
uint32_t size = mach_read_from_4(FSP_HEADER_OFFSET + FSP_SIZE
- + header->frame);
+ + header->page.frame);
ut_ad(limit == space->free_limit
|| (space->free_limit == 0
&& (init_space
@@ -335,8 +355,8 @@ xdes_get_descriptor_with_space_hdr(
|| srv_is_undo_tablespace(space->id))))));
ut_ad(size == space->size_in_header);
- if ((offset >= size) || (offset >= limit)) {
- return(NULL);
+ if (offset >= size || offset >= limit) {
+ return nullptr;
}
const unsigned zip_size = space->zip_size();
@@ -346,22 +366,23 @@ xdes_get_descriptor_with_space_hdr(
buf_block_t* block = header;
if (descr_page_no) {
- block = buf_page_get(
- page_id_t(space->id, descr_page_no), zip_size,
- RW_SX_LATCH, mtr);
-
- buf_block_dbg_add_level(block, SYNC_FSP_PAGE);
+ block = buf_page_get_gen(page_id_t(space->id, descr_page_no),
+ zip_size, RW_SX_LATCH, nullptr,
+ BUF_GET_POSSIBLY_FREED, mtr, err);
}
- if (desc_block != NULL) {
+ if (desc_block) {
*desc_block = block;
}
- return XDES_ARR_OFFSET + XDES_SIZE
+ return block
+ ? XDES_ARR_OFFSET + XDES_SIZE
* xdes_calc_descriptor_index(zip_size, offset)
- + block->frame;
+ + block->page.frame
+ : nullptr;
}
+MY_ATTRIBUTE((nonnull(1,3), warn_unused_result))
/** Get the extent descriptor of a page.
The page where the extent descriptor resides is x-locked. If the page
offset is equal to the free limit of the space, we will add new
@@ -371,91 +392,49 @@ defined, as they are uninitialized above the free limit.
@param[in] space tablespace
@param[in] offset page offset; if equal to the free limit, we
try to add new extents to the space free list
-@param[out] xdes extent descriptor page
@param[in,out] mtr mini-transaction
+@param[out] err error code
+@param[out] xdes extent descriptor page
@return the extent descriptor */
-static xdes_t* xdes_get_descriptor(const fil_space_t *space, page_no_t offset,
- buf_block_t **xdes, mtr_t *mtr)
+static xdes_t *xdes_get_descriptor(const fil_space_t *space, page_no_t offset,
+ mtr_t *mtr, dberr_t *err= nullptr,
+ buf_block_t **xdes= nullptr)
{
- buf_block_t *block= buf_page_get(page_id_t(space->id, 0), space->zip_size(),
- RW_SX_LATCH, mtr);
- buf_block_dbg_add_level(block, SYNC_FSP_PAGE);
- return xdes_get_descriptor_with_space_hdr(block, space, offset, xdes, mtr);
-}
-
-/** Get the extent descriptor of a page.
-The page where the extent descriptor resides is x-locked. If the page
-offset is equal to the free limit of the space, we will add new
-extents from above the free limit to the space free list, if not free
-limit == space size. This adding is necessary to make the descriptor
-defined, as they are uninitialized above the free limit.
-@param[in] space tablespace
-@param[in] page descriptor page offset
-@param[in] offset page offset
-@param[in,out] mtr mini-transaction
-@return the extent descriptor
-@retval NULL if the descriptor is not available */
-MY_ATTRIBUTE((warn_unused_result))
-static
-const xdes_t*
-xdes_get_descriptor_const(
- const fil_space_t* space,
- page_no_t page,
- page_no_t offset,
- mtr_t* mtr)
-{
- ut_ad(mtr->memo_contains(space->latch, MTR_MEMO_SX_LOCK));
- ut_ad(offset < space->free_limit);
- ut_ad(offset < space->size_in_header);
-
- const ulint zip_size = space->zip_size();
-
- if (buf_block_t* block = buf_page_get_gen(page_id_t(space->id, page),
- zip_size, RW_S_LATCH,
- nullptr,
- BUF_GET_POSSIBLY_FREED,
- __FILE__, __LINE__, mtr)) {
- buf_block_dbg_add_level(block, SYNC_FSP_PAGE);
-
- if (block->page.status == buf_page_t::FREED) {
- return nullptr;
- }
-
- ut_ad(page != 0 || space->free_limit == mach_read_from_4(
- FSP_FREE_LIMIT + FSP_HEADER_OFFSET
- + block->frame));
- ut_ad(page != 0 || space->size_in_header == mach_read_from_4(
- FSP_SIZE + FSP_HEADER_OFFSET
- + block->frame));
-
- return(block->frame + XDES_ARR_OFFSET + XDES_SIZE
- * xdes_calc_descriptor_index(zip_size, offset));
- }
-
- return(NULL);
+ if (buf_block_t *block=
+ buf_page_get_gen(page_id_t(space->id, 0), space->zip_size(), RW_SX_LATCH,
+ nullptr, BUF_GET_POSSIBLY_FREED, mtr, err))
+ return xdes_get_descriptor_with_space_hdr(block, space, offset, mtr,
+ err, xdes);
+ return nullptr;
}
+MY_ATTRIBUTE((nonnull(3), warn_unused_result))
/** Get a pointer to the extent descriptor. The page where the
extent descriptor resides is x-locked.
-@param[in] space tablespace
-@param[in] lst_node file address of the list node
- contained in the descriptor
-@param[out] block extent descriptor block
-@param[in,out] mtr mini-transaction
+@param space tablespace
+@param lst_node file address of the list node contained in the descriptor
+@param mtr mini-transaction
+@param err error code
+@param block extent descriptor block
@return pointer to the extent descriptor */
-MY_ATTRIBUTE((nonnull, warn_unused_result))
-UNIV_INLINE
-xdes_t*
-xdes_lst_get_descriptor(
- const fil_space_t* space,
- fil_addr_t lst_node,
- buf_block_t** block,
- mtr_t* mtr)
+static inline
+xdes_t *xdes_lst_get_descriptor(const fil_space_t &space, fil_addr_t lst_node,
+ mtr_t *mtr, buf_block_t **block= nullptr,
+ dberr_t *err= nullptr)
{
- ut_ad(mtr->memo_contains(*space));
- return fut_get_ptr(space->id, space->zip_size(),
- lst_node, RW_SX_LATCH, mtr, block)
- - XDES_FLST_NODE;
+ ut_ad(mtr->memo_contains(space));
+ ut_ad(lst_node.boffset < space.physical_size());
+ buf_block_t *b;
+ if (!block)
+ block= &b;
+ *block= buf_page_get_gen(page_id_t{space.id, lst_node.page},
+ space.zip_size(), RW_SX_LATCH,
+ nullptr, BUF_GET_POSSIBLY_FREED, mtr, err);
+ if (*block)
+ return (*block)->page.frame + lst_node.boffset - XDES_FLST_NODE;
+
+ space.set_corrupted();
+ return nullptr;
}
/********************************************************************//**
@@ -473,26 +452,27 @@ static uint32_t xdes_get_offset(const xdes_t *descr)
@param[in,out] block buffer pool block */
void fsp_apply_init_file_page(buf_block_t *block)
{
- memset_aligned<UNIV_PAGE_SIZE_MIN>(block->frame, 0, srv_page_size);
+ memset_aligned<UNIV_PAGE_SIZE_MIN>(block->page.frame, 0, srv_page_size);
const page_id_t id(block->page.id());
- mach_write_to_4(block->frame + FIL_PAGE_OFFSET, id.page_no());
+ mach_write_to_4(block->page.frame + FIL_PAGE_OFFSET, id.page_no());
if (log_sys.is_physical())
- memset_aligned<8>(block->frame + FIL_PAGE_PREV, 0xff, 8);
- mach_write_to_4(block->frame + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, id.space());
+ memset_aligned<8>(block->page.frame + FIL_PAGE_PREV, 0xff, 8);
+ mach_write_to_4(block->page.frame + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID,
+ id.space());
if (page_zip_des_t* page_zip= buf_block_get_page_zip(block))
{
memset_aligned<UNIV_ZIP_SIZE_MIN>(page_zip->data, 0,
page_zip_get_size(page_zip));
static_assert(FIL_PAGE_OFFSET == 4, "compatibility");
memcpy_aligned<4>(page_zip->data + FIL_PAGE_OFFSET,
- block->frame + FIL_PAGE_OFFSET, 4);
+ block->page.frame + FIL_PAGE_OFFSET, 4);
if (log_sys.is_physical())
memset_aligned<8>(page_zip->data + FIL_PAGE_PREV, 0xff, 8);
static_assert(FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID % 4 == 2,
"not perfect alignment");
memcpy_aligned<2>(page_zip->data + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID,
- block->frame + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, 4);
+ block->page.frame + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, 4);
}
}
@@ -502,66 +482,39 @@ updating an allocation bitmap page.
@param[in] mtr mini-transaction */
void fil_space_t::modify_check(const mtr_t& mtr) const
{
- switch (mtr.get_log_mode()) {
- case MTR_LOG_NONE:
- /* These modes are only allowed within a non-bitmap page
- when there is a higher-level redo log record written. */
- ut_ad(purpose == FIL_TYPE_TABLESPACE
- || purpose == FIL_TYPE_TEMPORARY);
- break;
- case MTR_LOG_NO_REDO:
- ut_ad(purpose == FIL_TYPE_TEMPORARY
- || purpose == FIL_TYPE_IMPORT);
- return;
- case MTR_LOG_ALL:
- /* We may only write redo log for a persistent
- tablespace. */
- ut_ad(purpose == FIL_TYPE_TABLESPACE);
- ut_ad(mtr.is_named_space(id));
- return;
- }
-
- ut_ad("invalid log mode" == 0);
+ switch (mtr.get_log_mode()) {
+ case MTR_LOG_NONE:
+ /* These modes are only allowed within a non-bitmap page
+ when there is a higher-level redo log record written. */
+ ut_ad(purpose == FIL_TYPE_TABLESPACE || purpose == FIL_TYPE_TEMPORARY);
+ break;
+ case MTR_LOG_NO_REDO:
+ ut_ad(purpose == FIL_TYPE_TEMPORARY || purpose == FIL_TYPE_IMPORT);
+ break;
+ default:
+ /* We may only write redo log for a persistent tablespace. */
+ ut_ad(purpose == FIL_TYPE_TABLESPACE);
+ ut_ad(mtr.is_named_space(id));
+ }
}
#endif
-/**********************************************************************//**
-Writes the space id and flags to a tablespace header. The flags contain
-row type, physical/compressed page size, and logical/uncompressed page
-size of the tablespace. */
-void
-fsp_header_init_fields(
-/*===================*/
- page_t* page, /*!< in/out: first page in the space */
- ulint space_id, /*!< in: space id */
- ulint flags) /*!< in: tablespace flags (FSP_SPACE_FLAGS) */
-{
- flags &= ~FSP_FLAGS_MEM_MASK;
- ut_a(fil_space_t::is_valid_flags(flags, space_id));
-
- mach_write_to_4(FSP_HEADER_OFFSET + FSP_SPACE_ID + page,
- space_id);
- mach_write_to_4(FSP_HEADER_OFFSET + FSP_SPACE_FLAGS + page,
- flags);
-}
-
/** Initialize a tablespace header.
@param[in,out] space tablespace
@param[in] size current size in blocks
-@param[in,out] mtr mini-transaction */
-void fsp_header_init(fil_space_t* space, uint32_t size, mtr_t* mtr)
+@param[in,out] mtr mini-transaction
+@return error code */
+dberr_t fsp_header_init(fil_space_t *space, uint32_t size, mtr_t *mtr)
{
const page_id_t page_id(space->id, 0);
const ulint zip_size = space->zip_size();
buf_block_t *free_block = buf_LRU_get_free_block(false);
- mtr_x_lock_space(space, mtr);
+ mtr->x_lock_space(space);
buf_block_t* block = buf_page_create(space, 0, zip_size, mtr,
free_block);
- buf_block_dbg_add_level(block, SYNC_FSP_PAGE);
-
if (UNIV_UNLIKELY(block != free_block)) {
buf_pool.free_block(free_block);
}
@@ -574,28 +527,28 @@ void fsp_header_init(fil_space_t* space, uint32_t size, mtr_t* mtr)
fsp_init_file_page(space, block, mtr);
- mtr->write<2>(*block, block->frame + FIL_PAGE_TYPE,
+ mtr->write<2>(*block, block->page.frame + FIL_PAGE_TYPE,
FIL_PAGE_TYPE_FSP_HDR);
mtr->write<4,mtr_t::MAYBE_NOP>(*block, FSP_HEADER_OFFSET + FSP_SPACE_ID
- + block->frame, space->id);
+ + block->page.frame, space->id);
ut_ad(0 == mach_read_from_4(FSP_HEADER_OFFSET + FSP_NOT_USED
- + block->frame));
+ + block->page.frame));
/* recv_sys_t::parse() expects to find a WRITE record that
covers all 4 bytes. Therefore, we must specify mtr_t::FORCED
in order to avoid optimizing away any unchanged most
significant bytes of FSP_SIZE. */
mtr->write<4,mtr_t::FORCED>(*block, FSP_HEADER_OFFSET + FSP_SIZE
- + block->frame, size);
+ + block->page.frame, size);
ut_ad(0 == mach_read_from_4(FSP_HEADER_OFFSET + FSP_FREE_LIMIT
- + block->frame));
+ + block->page.frame));
if (auto f = space->flags & ~FSP_FLAGS_MEM_MASK) {
mtr->write<4,mtr_t::FORCED>(*block,
FSP_HEADER_OFFSET + FSP_SPACE_FLAGS
- + block->frame, f);
+ + block->page.frame, f);
}
ut_ad(0 == mach_read_from_4(FSP_HEADER_OFFSET + FSP_FRAG_N_USED
- + block->frame));
+ + block->page.frame));
flst_init(block, FSP_HEADER_OFFSET + FSP_FREE, mtr);
flst_init(block, FSP_HEADER_OFFSET + FSP_FREE_FRAG, mtr);
@@ -603,11 +556,14 @@ void fsp_header_init(fil_space_t* space, uint32_t size, mtr_t* mtr)
flst_init(block, FSP_HEADER_OFFSET + FSP_SEG_INODES_FULL, mtr);
flst_init(block, FSP_HEADER_OFFSET + FSP_SEG_INODES_FREE, mtr);
- mtr->write<8>(*block, FSP_HEADER_OFFSET + FSP_SEG_ID + block->frame,
+ mtr->write<8>(*block, FSP_HEADER_OFFSET + FSP_SEG_ID
+ + block->page.frame,
1U);
- fsp_fill_free_list(!is_system_tablespace(space->id),
- space, block, mtr);
+ if (dberr_t err = fsp_fill_free_list(!is_system_tablespace(space->id),
+ space, block, mtr)) {
+ return err;
+ }
/* Write encryption metadata to page 0 if tablespace is
encrypted or encryption is disabled by table option. */
@@ -616,6 +572,8 @@ void fsp_header_init(fil_space_t* space, uint32_t size, mtr_t* mtr)
space->crypt_data->not_encrypted())) {
space->crypt_data->write_page0(block, mtr);
}
+
+ return DB_SUCCESS;
}
/** Try to extend a single-table tablespace so that a page would fit in the
@@ -636,10 +594,11 @@ fsp_try_extend_data_file_with_pages(
bool success;
ulint size;
- ut_a(!is_system_tablespace(space->id));
+ ut_ad(!is_system_tablespace(space->id));
ut_d(space->modify_check(*mtr));
- size = mach_read_from_4(FSP_HEADER_OFFSET + FSP_SIZE + header->frame);
+ size = mach_read_from_4(FSP_HEADER_OFFSET + FSP_SIZE
+ + header->page.frame);
ut_ad(size == space->size_in_header);
ut_a(page_no >= size);
@@ -651,7 +610,7 @@ fsp_try_extend_data_file_with_pages(
in order to avoid optimizing away any unchanged most
significant bytes of FSP_SIZE. */
mtr->write<4,mtr_t::FORCED>(*header, FSP_HEADER_OFFSET + FSP_SIZE
- + header->frame, space->size);
+ + header->page.frame, space->size);
space->size_in_header = space->size;
return(success);
@@ -721,9 +680,9 @@ fsp_try_extend_data_file(fil_space_t *space, buf_block_t *header, mtr_t *mtr)
to reset the flag to false as dealing with this
error requires server restart. */
if (!srv_sys_space.get_tablespace_full_status()) {
- ib::error() << "The InnoDB system tablespace "
- << OUT_OF_SPACE_MSG
- << " innodb_data_file_path.";
+ sql_print_error("InnoDB: The InnoDB system tablespace "
+ "%s" " innodb_data_file_path.",
+ OUT_OF_SPACE_MSG);
srv_sys_space.set_tablespace_full_status(true);
}
return(0);
@@ -735,16 +694,17 @@ fsp_try_extend_data_file(fil_space_t *space, buf_block_t *header, mtr_t *mtr)
to reset the flag to false as dealing with this
error requires server restart. */
if (!srv_tmp_space.get_tablespace_full_status()) {
- ib::error() << "The InnoDB temporary tablespace "
- << OUT_OF_SPACE_MSG
- << " innodb_temp_data_file_path.";
+ sql_print_error("InnoDB: The InnoDB temporary"
+ " tablespace %s"
+ " innodb_temp_data_file_path.",
+ OUT_OF_SPACE_MSG);
srv_tmp_space.set_tablespace_full_status(true);
}
return(0);
}
uint32_t size = mach_read_from_4(FSP_HEADER_OFFSET + FSP_SIZE
- + header->frame);
+ + header->page.frame);
ut_ad(size == space->size_in_header);
uint32_t size_increase;
@@ -792,7 +752,8 @@ fsp_try_extend_data_file(fil_space_t *space, buf_block_t *header, mtr_t *mtr)
in order to avoid optimizing away any unchanged most
significant bytes of FSP_SIZE. */
mtr->write<4,mtr_t::FORCED>(*header, FSP_HEADER_OFFSET + FSP_SIZE
- + header->frame, space->size_in_header);
+ + header->page.frame,
+ space->size_in_header);
return(size_increase);
}
@@ -807,10 +768,9 @@ Any other pages were written with uninitialized bytes in FIL_PAGE_TYPE.
ATTRIBUTE_COLD
void fil_block_reset_type(const buf_block_t& block, ulint type, mtr_t* mtr)
{
- ib::info()
- << "Resetting invalid page " << block.page.id() << " type "
- << fil_page_get_type(block.frame) << " to " << type << ".";
- mtr->write<2>(block, block.frame + FIL_PAGE_TYPE, type);
+ ib::info() << "Resetting invalid page " << block.page.id() << " type "
+ << fil_page_get_type(block.page.frame) << " to " << type << ".";
+ mtr->write<2>(block, block.page.frame + FIL_PAGE_TYPE, type);
}
/** Put new extents to the free list if there are free extents above the free
@@ -821,228 +781,248 @@ and we are only initializing the first extent and the first bitmap pages;
then we will not allocate more extents
@param[in,out] space tablespace
@param[in,out] header tablespace header
-@param[in,out] mtr mini-transaction */
+@param[in,out] mtr mini-transaction
+@return error code */
static
-void
+dberr_t
fsp_fill_free_list(
bool init_space,
fil_space_t* space,
buf_block_t* header,
mtr_t* mtr)
{
- ut_d(space->modify_check(*mtr));
+ ut_d(space->modify_check(*mtr));
- /* Check if we can fill free list from above the free list limit */
- uint32_t size = mach_read_from_4(FSP_HEADER_OFFSET + FSP_SIZE
- + header->frame);
- uint32_t limit = mach_read_from_4(FSP_HEADER_OFFSET + FSP_FREE_LIMIT
- + header->frame);
+ /* Check if we can fill free list from above the free list limit */
+ uint32_t size=
+ mach_read_from_4(FSP_HEADER_OFFSET + FSP_SIZE + header->page.frame);
+ uint32_t limit=
+ mach_read_from_4(FSP_HEADER_OFFSET + FSP_FREE_LIMIT + header->page.frame);
- ut_ad(size == space->size_in_header);
- ut_ad(limit == space->free_limit);
+ ut_ad(size == space->size_in_header);
+ ut_ad(limit == space->free_limit);
- const ulint zip_size = space->zip_size();
+ const auto zip_size= space->zip_size();
- if (size < limit + FSP_EXTENT_SIZE * FSP_FREE_ADD) {
- bool skip_resize = init_space;
- switch (space->id) {
- case TRX_SYS_SPACE:
- skip_resize = !srv_sys_space.can_auto_extend_last_file();
- break;
- case SRV_TMP_SPACE_ID:
- skip_resize = !srv_tmp_space.can_auto_extend_last_file();
- break;
- }
+ if (size < limit + FSP_EXTENT_SIZE * FSP_FREE_ADD)
+ {
+ bool skip_resize= init_space;
+ switch (space->id) {
+ case TRX_SYS_SPACE:
+ skip_resize= !srv_sys_space.can_auto_extend_last_file();
+ break;
+ case SRV_TMP_SPACE_ID:
+ skip_resize= !srv_tmp_space.can_auto_extend_last_file();
+ break;
+ }
- if (!skip_resize) {
- fsp_try_extend_data_file(space, header, mtr);
- size = space->size_in_header;
- }
- }
+ if (!skip_resize)
+ {
+ fsp_try_extend_data_file(space, header, mtr);
+ size= space->size_in_header;
+ }
+ }
- uint32_t count = 0;
-
- for (uint32_t i = limit, extent_size = FSP_EXTENT_SIZE,
- physical_size = space->physical_size();
- (init_space && i < 1)
- || (i + extent_size <= size && count < FSP_FREE_ADD);
- i += extent_size) {
- const bool init_xdes = !ut_2pow_remainder(i, physical_size);
-
- space->free_limit = i + extent_size;
- mtr->write<4>(*header, FSP_HEADER_OFFSET + FSP_FREE_LIMIT
- + header->frame, i + extent_size);
-
- if (init_xdes) {
-
- buf_block_t* block;
-
- /* We are going to initialize a new descriptor page
- and a new ibuf bitmap page: the prior contents of the
- pages should be ignored. */
-
- if (i > 0) {
- buf_block_t *f= buf_LRU_get_free_block(false);
- block= buf_page_create(
- space, static_cast<uint32_t>(i),
- zip_size, mtr, f);
- buf_block_dbg_add_level(block, SYNC_FSP_PAGE);
- if (UNIV_UNLIKELY(block != f)) {
- buf_pool.free_block(f);
- }
- fsp_init_file_page(space, block, mtr);
- mtr->write<2>(*block,
- FIL_PAGE_TYPE + block->frame,
- FIL_PAGE_TYPE_XDES);
- }
+ uint32_t count= 0;
+ for (uint32_t i= limit, extent_size= FSP_EXTENT_SIZE,
+ physical_size= space->physical_size();
+ (init_space && i < 1) ||
+ (i + extent_size <= size && count < FSP_FREE_ADD);
+ i += extent_size)
+ {
+ const bool init_xdes= !ut_2pow_remainder(i, physical_size);
+ space->free_limit= i + extent_size;
+ mtr->write<4>(*header, FSP_HEADER_OFFSET + FSP_FREE_LIMIT +
+ header->page.frame, i + extent_size);
- if (space->purpose != FIL_TYPE_TEMPORARY) {
- buf_block_t *f= buf_LRU_get_free_block(false);
- block = buf_page_create(
- space,
- static_cast<uint32_t>(
- i + FSP_IBUF_BITMAP_OFFSET),
- zip_size, mtr, f);
- buf_block_dbg_add_level(block, SYNC_FSP_PAGE);
- if (UNIV_UNLIKELY(block != f)) {
- buf_pool.free_block(f);
- }
- fsp_init_file_page(space, block, mtr);
- mtr->write<2>(*block,
- block->frame + FIL_PAGE_TYPE,
- FIL_PAGE_IBUF_BITMAP);
- }
- }
+ if (init_xdes)
+ {
+ /* We are going to initialize a new descriptor page
+ and a new ibuf bitmap page: the prior contents of the
+ pages should be ignored. */
+
+ if (i)
+ {
+ buf_block_t *f= buf_LRU_get_free_block(false);
+ buf_block_t *block= buf_page_create(space, static_cast<uint32_t>(i),
+ zip_size, mtr, f);
+ if (UNIV_UNLIKELY(block != f))
+ buf_pool.free_block(f);
+ fsp_init_file_page(space, block, mtr);
+ mtr->write<2>(*block, FIL_PAGE_TYPE + block->page.frame,
+ FIL_PAGE_TYPE_XDES);
+ }
+
+ if (space->purpose != FIL_TYPE_TEMPORARY)
+ {
+ buf_block_t *f= buf_LRU_get_free_block(false);
+ buf_block_t *block=
+ buf_page_create(space,
+ static_cast<uint32_t>(i + FSP_IBUF_BITMAP_OFFSET),
+ zip_size, mtr, f);
+ if (UNIV_UNLIKELY(block != f))
+ buf_pool.free_block(f);
+ fsp_init_file_page(space, block, mtr);
+ mtr->write<2>(*block, FIL_PAGE_TYPE + block->page.frame,
+ FIL_PAGE_IBUF_BITMAP);
+ }
+ }
- buf_block_t* xdes;
- xdes_t* descr = xdes_get_descriptor_with_space_hdr(
- header, space, i, &xdes, mtr, init_space);
- if (xdes != header && !space->full_crc32()) {
- fil_block_check_type(*xdes, FIL_PAGE_TYPE_XDES, mtr);
- }
- xdes_init(*xdes, descr, mtr);
- const uint16_t xoffset= static_cast<uint16_t>(
- descr - xdes->frame + XDES_FLST_NODE);
-
- if (UNIV_UNLIKELY(init_xdes)) {
-
- /* The first page in the extent is a descriptor page
- and the second is an ibuf bitmap page: mark them
- used */
-
- xdes_set_free<false>(*xdes, descr, 0, mtr);
- xdes_set_free<false>(*xdes, descr,
- FSP_IBUF_BITMAP_OFFSET, mtr);
- xdes_set_state(*xdes, descr, XDES_FREE_FRAG, mtr);
-
- flst_add_last(header,
- FSP_HEADER_OFFSET + FSP_FREE_FRAG,
- xdes, xoffset, mtr);
- byte* n_used = FSP_HEADER_OFFSET + FSP_FRAG_N_USED
- + header->frame;
- mtr->write<4>(*header, n_used,
- 2U + mach_read_from_4(n_used));
- } else {
- flst_add_last(header, FSP_HEADER_OFFSET + FSP_FREE,
- xdes, xoffset, mtr);
- count++;
- }
- }
+ buf_block_t *xdes= nullptr;
+ xdes_t *descr;
+ {
+ dberr_t err= DB_SUCCESS;
+ descr= xdes_get_descriptor_with_space_hdr(header, space, i, mtr,
+ &err, &xdes, init_space);
+ if (!descr)
+ return err;
+ }
+
+ if (xdes != header && !space->full_crc32())
+ fil_block_check_type(*xdes, FIL_PAGE_TYPE_XDES, mtr);
+ xdes_init(*xdes, descr, mtr);
+ const uint16_t xoffset=
+ static_cast<uint16_t>(descr - xdes->page.frame + XDES_FLST_NODE);
+ if (UNIV_UNLIKELY(init_xdes))
+ {
+ /* The first page in the extent is a descriptor page and the
+ second is an ibuf bitmap page: mark them used */
+ xdes_set_free<false>(*xdes, descr, 0, mtr);
+ xdes_set_free<false>(*xdes, descr, FSP_IBUF_BITMAP_OFFSET, mtr);
+ xdes_set_state(*xdes, descr, XDES_FREE_FRAG, mtr);
+ if (dberr_t err= flst_add_last(header, FSP_HEADER_OFFSET + FSP_FREE_FRAG,
+ xdes, xoffset, mtr))
+ return err;
+ byte *n_used= FSP_HEADER_OFFSET + FSP_FRAG_N_USED + header->page.frame;
+ mtr->write<4>(*header, n_used, 2U + mach_read_from_4(n_used));
+ }
+ else
+ {
+ if (dberr_t err=
+ flst_add_last(header, FSP_HEADER_OFFSET + FSP_FREE,
+ xdes, xoffset, mtr))
+ return err;
+ count++;
+ }
+ }
- space->free_len += count;
+ space->free_len+= count;
+ return DB_SUCCESS;
}
+MY_ATTRIBUTE((nonnull, warn_unused_result))
/** Allocates a new free extent.
@param[in,out] space tablespace
@param[in] hint hint of which extent would be desirable: any
page offset in the extent goes; the hint must not be > FSP_FREE_LIMIT
@param[out] xdes extent descriptor page
@param[in,out] mtr mini-transaction
-@return extent descriptor, NULL if cannot be allocated */
-static
-xdes_t*
-fsp_alloc_free_extent(
- fil_space_t* space,
- uint32_t hint,
- buf_block_t** xdes,
- mtr_t* mtr)
+@return extent descriptor
+@retval nullptr if cannot be allocated */
+static xdes_t *fsp_alloc_free_extent(fil_space_t *space, uint32_t hint,
+ buf_block_t **xdes, mtr_t *mtr,
+ dberr_t *err)
{
fil_addr_t first;
xdes_t* descr;
- buf_block_t* desc_block = NULL;
+ buf_block_t* desc_block;
- buf_block_t* header = fsp_get_header(space, mtr);
+ buf_block_t* header = fsp_get_header(space, mtr, err);
+ if (!header) {
+corrupted:
+ space->set_corrupted();
+ return nullptr;
+ }
descr = xdes_get_descriptor_with_space_hdr(
- header, space, hint, &desc_block, mtr);
+ header, space, hint, mtr, err, &desc_block);
+ if (!descr) {
+ goto corrupted;
+ }
if (desc_block != header && !space->full_crc32()) {
fil_block_check_type(*desc_block, FIL_PAGE_TYPE_XDES, mtr);
}
- if (descr && (xdes_get_state(descr) == XDES_FREE)) {
+ if (xdes_get_state(descr) == XDES_FREE) {
/* Ok, we can take this extent */
} else {
/* Take the first extent in the free list */
first = flst_get_first(FSP_HEADER_OFFSET + FSP_FREE
- + header->frame);
+ + header->page.frame);
if (first.page == FIL_NULL) {
- fsp_fill_free_list(false, space, header, mtr);
+ *err = fsp_fill_free_list(false, space, header, mtr);
+ if (UNIV_UNLIKELY(*err != DB_SUCCESS)) {
+ goto corrupted;
+ }
first = flst_get_first(FSP_HEADER_OFFSET + FSP_FREE
- + header->frame);
+ + header->page.frame);
if (first.page == FIL_NULL) {
return nullptr; /* No free extents left */
}
}
- descr = xdes_lst_get_descriptor(space, first, &desc_block,
- mtr);
+ descr = xdes_lst_get_descriptor(*space, first, mtr,
+ &desc_block, err);
+ if (!descr) {
+ return descr;
+ }
+ }
+
+ *err = flst_remove(header, FSP_HEADER_OFFSET + FSP_FREE, desc_block,
+ static_cast<uint16_t>(descr - desc_block->page.frame
+ + XDES_FLST_NODE),
+ mtr);
+ if (UNIV_UNLIKELY(*err != DB_SUCCESS)) {
+ return nullptr;
}
- flst_remove(header, FSP_HEADER_OFFSET + FSP_FREE, desc_block,
- static_cast<uint16_t>(
- descr - desc_block->frame + XDES_FLST_NODE), mtr);
space->free_len--;
*xdes = desc_block;
return(descr);
}
+MY_ATTRIBUTE((nonnull, warn_unused_result))
/** Allocate a single free page.
@param[in,out] header tablespace header
@param[in,out] xdes extent descriptor page
@param[in,out] descr extent descriptor
@param[in] bit slot to allocate in the extent
-@param[in,out] mtr mini-transaction */
-static void
+@param[in,out] mtr mini-transaction
+@return error code */
+static dberr_t
fsp_alloc_from_free_frag(buf_block_t *header, buf_block_t *xdes, xdes_t *descr,
ulint bit, mtr_t *mtr)
{
- ut_ad(xdes_get_state(descr) == XDES_FREE_FRAG);
- ut_a(xdes_is_free(descr, bit));
- xdes_set_free<false>(*xdes, descr, bit, mtr);
-
- /* Update the FRAG_N_USED field */
- byte* n_used_p = FSP_HEADER_OFFSET + FSP_FRAG_N_USED + header->frame;
+ if (UNIV_UNLIKELY(xdes_get_state(descr) != XDES_FREE_FRAG ||
+ !xdes_is_free(descr, bit)))
+ return DB_CORRUPTION;
+ xdes_set_free<false>(*xdes, descr, bit, mtr);
- uint32_t n_used = mach_read_from_4(n_used_p) + 1;
+ /* Update the FRAG_N_USED field */
+ byte *n_used_p= FSP_HEADER_OFFSET + FSP_FRAG_N_USED + header->page.frame;
+ uint32_t n_used = mach_read_from_4(n_used_p) + 1;
- if (xdes_is_full(descr)) {
- /* The fragment is full: move it to another list */
- const uint16_t xoffset= static_cast<uint16_t>(
- descr - xdes->frame + XDES_FLST_NODE);
- flst_remove(header, FSP_HEADER_OFFSET + FSP_FREE_FRAG,
- xdes, xoffset, mtr);
- xdes_set_state(*xdes, descr, XDES_FULL_FRAG, mtr);
-
- flst_add_last(header, FSP_HEADER_OFFSET + FSP_FULL_FRAG,
- xdes, xoffset, mtr);
- n_used -= FSP_EXTENT_SIZE;
- }
+ if (xdes_is_full(descr))
+ {
+ /* The fragment is full: move it to another list */
+ const uint16_t xoffset=
+ static_cast<uint16_t>(descr - xdes->page.frame + XDES_FLST_NODE);
+ if (dberr_t err= flst_remove(header, FSP_HEADER_OFFSET + FSP_FREE_FRAG,
+ xdes, xoffset, mtr))
+ return err;
+ if (dberr_t err= flst_add_last(header, FSP_HEADER_OFFSET + FSP_FULL_FRAG,
+ xdes, xoffset, mtr))
+ return err;
+ xdes_set_state(*xdes, descr, XDES_FULL_FRAG, mtr);
+ n_used-= FSP_EXTENT_SIZE;
+ }
- mtr->write<4>(*header, n_used_p, n_used);
+ mtr->write<4>(*header, n_used_p, n_used);
+ return DB_SUCCESS;
}
/** Gets a buffer block for an allocated page.
@@ -1059,10 +1039,10 @@ fsp_page_create(fil_space_t *space, page_no_t offset, mtr_t *mtr)
if (UNIV_UNLIKELY(space->is_being_truncated))
{
const page_id_t page_id{space->id, offset};
- const ulint fold= page_id.fold();
+ buf_pool_t::hash_chain &chain= buf_pool.page_hash.cell_get(page_id.fold());
mysql_mutex_lock(&buf_pool.mutex);
block= reinterpret_cast<buf_block_t*>
- (buf_pool.page_hash_get_low(page_id, fold));
+ (buf_pool.page_hash.get(page_id, chain));
if (block && block->page.oldest_modification() <= 1)
block= nullptr;
mysql_mutex_unlock(&buf_pool.mutex);
@@ -1070,7 +1050,7 @@ fsp_page_create(fil_space_t *space, page_no_t offset, mtr_t *mtr)
if (block)
{
ut_ad(block->page.buf_fix_count() >= 1);
- ut_ad(rw_lock_get_x_lock_count(&block->lock) == 1);
+ ut_ad(block->page.lock.x_lock_count() == 1);
ut_ad(mtr->have_x_latch(*block));
free_block= block;
goto got_free_block;
@@ -1095,117 +1075,152 @@ The page is marked as used.
@param[in,out] mtr mini-transaction
@param[in,out] init_mtr mini-transaction in which the page should be
initialized (may be the same as mtr)
-@retval NULL if no page could be allocated */
+@param[out] err error code
+@return allocated block
+@retval nullptr if no page could be allocated */
static MY_ATTRIBUTE((warn_unused_result, nonnull))
-buf_block_t*
-fsp_alloc_free_page(
- fil_space_t* space,
- uint32_t hint,
- mtr_t* mtr,
- mtr_t* init_mtr)
+buf_block_t *fsp_alloc_free_page(fil_space_t *space, uint32_t hint,
+ mtr_t *mtr, mtr_t *init_mtr, dberr_t *err)
{
- fil_addr_t first;
- xdes_t* descr;
- const ulint space_id = space->id;
-
- ut_d(space->modify_check(*mtr));
- buf_block_t* block = fsp_get_header(space, mtr);
- buf_block_t *xdes;
-
- /* Get the hinted descriptor */
- descr = xdes_get_descriptor_with_space_hdr(block, space, hint, &xdes,
- mtr);
-
- if (descr && (xdes_get_state(descr) == XDES_FREE_FRAG)) {
- /* Ok, we can take this extent */
- } else {
- /* Else take the first extent in free_frag list */
- first = flst_get_first(FSP_HEADER_OFFSET + FSP_FREE_FRAG
- + block->frame);
-
- if (first.page == FIL_NULL) {
- /* There are no partially full fragments: allocate
- a free extent and add it to the FREE_FRAG list. NOTE
- that the allocation may have as a side-effect that an
- extent containing a descriptor page is added to the
- FREE_FRAG list. But we will allocate our page from the
- the free extent anyway. */
-
- descr = fsp_alloc_free_extent(space, hint, &xdes, mtr);
-
- if (descr == NULL) {
- /* No free space left */
-
- return(NULL);
- }
-
- xdes_set_state(*xdes, descr, XDES_FREE_FRAG, mtr);
- flst_add_last(block, FSP_HEADER_OFFSET + FSP_FREE_FRAG,
- xdes, static_cast<uint16_t>(
- descr - xdes->frame
- + XDES_FLST_NODE), mtr);
- } else {
- descr = xdes_lst_get_descriptor(space, first, &xdes,
- mtr);
- }
-
- /* Reset the hint */
- hint = 0;
- }
-
- /* Now we have in descr an extent with at least one free page. Look
- for a free page in the extent. */
-
- uint32_t free = xdes_find_free(descr, hint % FSP_EXTENT_SIZE);
- if (free == FIL_NULL) {
+ ut_d(space->modify_check(*mtr));
+ buf_block_t *block= fsp_get_header(space, mtr, err);
+ if (!block)
+ return block;
+
+ buf_block_t *xdes;
+ /* Get the hinted descriptor */
+ xdes_t *descr= xdes_get_descriptor_with_space_hdr(block, space, hint, mtr,
+ err, &xdes);
+ if (descr && xdes_get_state(descr) == XDES_FREE_FRAG)
+ /* Ok, we can take this extent */;
+ else if (*err != DB_SUCCESS)
+ {
+ err_exit:
+ space->set_corrupted();
+ return nullptr;
+ }
+ else
+ {
+ /* Else take the first extent in free_frag list */
+ fil_addr_t first = flst_get_first(FSP_HEADER_OFFSET + FSP_FREE_FRAG +
+ block->page.frame);
+ if (first.page == FIL_NULL)
+ {
+ /* There are no partially full fragments: allocate a free extent
+ and add it to the FREE_FRAG list. NOTE that the allocation may
+ have as a side-effect that an extent containing a descriptor
+ page is added to the FREE_FRAG list. But we will allocate our
+ page from the free extent anyway. */
+ descr= fsp_alloc_free_extent(space, hint, &xdes, mtr, err);
+ if (!descr)
+ return nullptr;
+ *err= flst_add_last(block, FSP_HEADER_OFFSET + FSP_FREE_FRAG, xdes,
+ static_cast<uint16_t>(descr - xdes->page.frame +
+ XDES_FLST_NODE), mtr);
+ if (UNIV_UNLIKELY(*err != DB_SUCCESS))
+ return nullptr;
+ xdes_set_state(*xdes, descr, XDES_FREE_FRAG, mtr);
+ }
+ else
+ {
+ descr= xdes_lst_get_descriptor(*space, first, mtr, &xdes, err);
+ if (!descr)
+ return nullptr;
+ /* Reset the hint */
+ hint= 0;
+ }
+ }
- ut_print_buf(stderr, ((byte*) descr) - 500, 1000);
- putc('\n', stderr);
+ /* Now we have in descr an extent with at least one free page. Look
+ for a free page in the extent. */
+ uint32_t free= xdes_find_free(descr, hint % FSP_EXTENT_SIZE);
+ if (free == FIL_NULL)
+ {
+ corrupted:
+ *err= DB_CORRUPTION;
+ goto err_exit;
+ }
- ut_error;
- }
+ uint32_t page_no= xdes_get_offset(descr) + free;
+ uint32_t space_size = mach_read_from_4(FSP_HEADER_OFFSET + FSP_SIZE +
+ block->page.frame);
+ ut_ad(space_size == space->size_in_header ||
+ (space->id == TRX_SYS_SPACE &&
+ srv_startup_is_before_trx_rollback_phase));
- uint32_t page_no = xdes_get_offset(descr) + free;
+ if (space_size <= page_no)
+ {
+ /* It must be that we are extending a single-table tablespace
+ whose size is still < 64 pages */
+ ut_ad(!is_system_tablespace(space->id));
+ if (page_no >= FSP_EXTENT_SIZE)
+ {
+ sql_print_error("InnoDB: Trying to extend %s"
+ " by single page(s) though the size is " UINT32PF "."
+ " Page no " UINT32PF ".",
+ space->chain.start->name, space_size, page_no);
+ goto corrupted;
+ }
- uint32_t space_size = mach_read_from_4(FSP_HEADER_OFFSET + FSP_SIZE
- + block->frame);
- ut_ad(space_size == space->size_in_header
- || (space_id == TRX_SYS_SPACE
- && srv_startup_is_before_trx_rollback_phase));
+ if (!fsp_try_extend_data_file_with_pages(space, page_no, block, mtr))
+ {
+ *err= DB_OUT_OF_FILE_SPACE;
+ return nullptr;
+ }
+ }
- if (space_size <= page_no) {
- /* It must be that we are extending a single-table tablespace
- whose size is still < 64 pages */
+ *err= fsp_alloc_from_free_frag(block, xdes, descr, free, mtr);
+ if (UNIV_UNLIKELY(*err != DB_SUCCESS))
+ goto corrupted;
+ return fsp_page_create(space, page_no, init_mtr);
+}
- ut_a(!is_predefined_tablespace(space_id));
- if (page_no >= FSP_EXTENT_SIZE) {
- ib::error() << "Trying to extend a single-table"
- " tablespace " << space->name << " , by single"
- " page(s) though the space size " << space_size
- << ". Page no " << page_no << ".";
- return(NULL);
- }
+MY_ATTRIBUTE((nonnull, warn_unused_result))
+/** Return an extent to the free list of a space.
+@param[in,out] space tablespace
+@param[in] offset page number in the extent
+@param[in,out] mtr mini-transaction
+@return error code */
+static dberr_t fsp_free_extent(fil_space_t* space, page_no_t offset,
+ mtr_t* mtr)
+{
+ ut_ad(space->is_owner());
+ dberr_t err;
+ buf_block_t *block= fsp_get_header(space, mtr, &err);
+ if (!block)
+ return err;
+ buf_block_t *xdes;
+ xdes_t *descr= xdes_get_descriptor_with_space_hdr(block, space, offset, mtr,
+ &err, &xdes);
+ if (!descr)
+ {
+ ut_ad(err || space->is_stopping());
+ return err;
+ }
- if (!fsp_try_extend_data_file_with_pages(space, page_no,
- block, mtr)) {
- /* No disk space left */
- return(NULL);
- }
- }
+ if (UNIV_UNLIKELY(xdes_get_state(descr) == XDES_FREE))
+ {
+ space->set_corrupted();
+ return DB_CORRUPTION;
+ }
- fsp_alloc_from_free_frag(block, xdes, descr, free, mtr);
- return fsp_page_create(space, page_no, init_mtr);
+ xdes_init(*xdes, descr, mtr);
+ space->free_len++;
+ return flst_add_last(block, FSP_HEADER_OFFSET + FSP_FREE,
+ xdes, static_cast<uint16_t>(descr - xdes->page.frame +
+ XDES_FLST_NODE), mtr);
}
+MY_ATTRIBUTE((nonnull))
/** Frees a single page of a space.
The page is marked as free and clean.
@param[in,out] space tablespace
@param[in] offset page number
-@param[in,out] mtr mini-transaction */
-static void fsp_free_page(fil_space_t* space, page_no_t offset, mtr_t* mtr)
+@param[in,out] mtr mini-transaction
+@return error code */
+static dberr_t fsp_free_page(fil_space_t *space, page_no_t offset, mtr_t *mtr)
{
xdes_t* descr;
- ulint state;
ulint frag_n_used;
ut_ad(mtr);
@@ -1213,104 +1228,81 @@ static void fsp_free_page(fil_space_t* space, page_no_t offset, mtr_t* mtr)
/* fprintf(stderr, "Freeing page %lu in space %lu\n", page, space); */
- buf_block_t* header = fsp_get_header(space, mtr);
- buf_block_t* xdes= 0;
-
- descr = xdes_get_descriptor_with_space_hdr(header, space, offset,
- &xdes, mtr);
-
- state = xdes_get_state(descr);
-
- if (UNIV_UNLIKELY(state != XDES_FREE_FRAG
- && state != XDES_FULL_FRAG)) {
- ib::error() << "File space extent descriptor of page "
- << page_id_t(space->id, offset)
- << " has state " << state;
- /* Crash in debug version, so that we get a core dump
- of this corruption. */
- ut_ad(0);
-
- if (state == XDES_FREE) {
- /* We put here some fault tolerance: if the page
- is already free, return without doing anything! */
-
- return;
- }
-
- ut_error;
+ dberr_t err;
+ buf_block_t* header = fsp_get_header(space, mtr, &err);
+ if (!header) {
+ ut_ad(space->is_stopping());
+ return err;
}
+ buf_block_t* xdes;
- if (xdes_is_free(descr, offset % FSP_EXTENT_SIZE)) {
- ib::error() << "File space extent descriptor of page "
- << page_id_t(space->id, offset)
- << " says it is free.";
- /* Crash in debug version, so that we get a core dump
- of this corruption. */
- ut_ad(0);
-
- /* We put here some fault tolerance: if the page
- is already free, return without doing anything! */
-
- return;
+ descr = xdes_get_descriptor_with_space_hdr(header, space, offset, mtr,
+ &err, &xdes);
+ if (!descr) {
+ ut_ad(err || space->is_stopping());
+ return err;
}
- mtr->free(*space, static_cast<uint32_t>(offset));
-
- const ulint bit = offset % FSP_EXTENT_SIZE;
+ const auto state = xdes_get_state(descr);
- xdes_set_free<true>(*xdes, descr, bit, mtr);
+ switch (state) {
+ case XDES_FREE_FRAG:
+ case XDES_FULL_FRAG:
+ if (!xdes_is_free(descr, offset % FSP_EXTENT_SIZE)) {
+ break;
+ }
+ /* fall through */
+ default:
+ space->set_corrupted();
+ return DB_CORRUPTION;
+ }
frag_n_used = mach_read_from_4(FSP_HEADER_OFFSET + FSP_FRAG_N_USED
- + header->frame);
+ + header->page.frame);
- const uint16_t xoffset= static_cast<uint16_t>(descr - xdes->frame
+ const uint16_t xoffset= static_cast<uint16_t>(descr - xdes->page.frame
+ XDES_FLST_NODE);
if (state == XDES_FULL_FRAG) {
/* The fragment was full: move it to another list */
- flst_remove(header, FSP_HEADER_OFFSET + FSP_FULL_FRAG,
- xdes, xoffset, mtr);
+ err = flst_remove(header, FSP_HEADER_OFFSET + FSP_FULL_FRAG,
+ xdes, xoffset, mtr);
+ if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
+ return err;
+ }
+ err = flst_add_last(header, FSP_HEADER_OFFSET + FSP_FREE_FRAG,
+ xdes, xoffset, mtr);
+ if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
+ return err;
+ }
xdes_set_state(*xdes, descr, XDES_FREE_FRAG, mtr);
- flst_add_last(header, FSP_HEADER_OFFSET + FSP_FREE_FRAG,
- xdes, xoffset, mtr);
mtr->write<4>(*header, FSP_HEADER_OFFSET + FSP_FRAG_N_USED
- + header->frame,
+ + header->page.frame,
frag_n_used + FSP_EXTENT_SIZE - 1);
+ } else if (UNIV_UNLIKELY(!frag_n_used)) {
+ return DB_CORRUPTION;
} else {
- ut_a(frag_n_used > 0);
mtr->write<4>(*header, FSP_HEADER_OFFSET + FSP_FRAG_N_USED
- + header->frame, frag_n_used - 1);
+ + header->page.frame, frag_n_used - 1);
}
if (!xdes_get_n_used(descr)) {
/* The extent has become free: move it to another list */
- flst_remove(header, FSP_HEADER_OFFSET + FSP_FREE_FRAG,
- xdes, xoffset, mtr);
- fsp_free_extent(space, offset, mtr);
+ err = flst_remove(header, FSP_HEADER_OFFSET + FSP_FREE_FRAG,
+ xdes, xoffset, mtr);
+ if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
+ return err;
+ }
+ err = fsp_free_extent(space, offset, mtr);
+ if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
+ return err;
+ }
}
-}
-
-/** Return an extent to the free list of a space.
-@param[in,out] space tablespace
-@param[in] offset page number in the extent
-@param[in,out] mtr mini-transaction */
-static void fsp_free_extent(fil_space_t* space, page_no_t offset, mtr_t* mtr)
-{
- ut_ad(mtr->memo_contains(*space));
-
- buf_block_t *block= fsp_get_header(space, mtr);
- buf_block_t *xdes= 0;
- xdes_t* descr= xdes_get_descriptor_with_space_hdr(block, space, offset,
- &xdes, mtr);
- ut_a(xdes_get_state(descr) != XDES_FREE);
-
- xdes_init(*xdes, descr, mtr);
+ mtr->free(*space, static_cast<uint32_t>(offset));
+ xdes_set_free<true>(*xdes, descr, offset % FSP_EXTENT_SIZE, mtr);
- flst_add_last(block, FSP_HEADER_OFFSET + FSP_FREE,
- xdes, static_cast<uint16_t>(descr - xdes->frame +
- XDES_FLST_NODE), mtr);
- space->free_len++;
+ return DB_SUCCESS;
}
/** @return Number of segment inodes which fit on a single page */
@@ -1327,190 +1319,206 @@ inline ulint FSP_SEG_INODES_PER_PAGE(ulint physical_size)
FSEG_ARR_OFFSET + FSEG_INODE_SIZE * i + page
/** Looks for a used segment inode on a segment inode page.
-@param[in] page segment inode page
-@param[in] physical_size page size
-@return segment inode index, or ULINT_UNDEFINED if not found */
+@param page segment inode page
+@param physical_size page size
+@return segment inode index
+@retval ULINT_UNDEFINED if not found */
static
ulint
-fsp_seg_inode_page_find_used(const page_t* page, ulint physical_size)
+fsp_seg_inode_page_find_used(const page_t *page, ulint physical_size)
{
- for (ulint i = 0; i < FSP_SEG_INODES_PER_PAGE(physical_size); i++) {
- if (!mach_read_from_8(
- FSEG_ID
- + fsp_seg_inode_page_get_nth_inode(page, i))) {
- continue;
- }
- /* This is used */
- ut_ad(FSEG_MAGIC_N_VALUE == mach_read_from_4(
- FSEG_MAGIC_N
- + fsp_seg_inode_page_get_nth_inode(page, i)));
- return i;
- }
+ for (ulint i= 0; i < FSP_SEG_INODES_PER_PAGE(physical_size); i++)
+ {
+ const byte *inode= fsp_seg_inode_page_get_nth_inode(page, i);
+ if (mach_read_from_8(FSEG_ID + inode))
+ {
+ ut_ad(!memcmp(FSEG_MAGIC_N_BYTES, FSEG_MAGIC_N + inode, 4));
+ return i;
+ }
+ }
- return(ULINT_UNDEFINED);
+ return ULINT_UNDEFINED;
}
/** Looks for an unused segment inode on a segment inode page.
@param[in] page segment inode page
@param[in] i search forward starting from this index
@param[in] physical_size page size
-@return segment inode index, or ULINT_UNDEFINED if not found */
+@return segment inode index
+@retval ULINT_UNDEFINED if not found */
static
ulint
-fsp_seg_inode_page_find_free(const page_t* page, ulint i, ulint physical_size)
+fsp_seg_inode_page_find_free(const page_t *page, ulint i, ulint physical_size)
{
- for (; i < FSP_SEG_INODES_PER_PAGE(physical_size); i++) {
- if (!mach_read_from_8(
- FSEG_ID
- + fsp_seg_inode_page_get_nth_inode(page, i))) {
- /* This is unused */
- return i;
- }
-
- ut_ad(FSEG_MAGIC_N_VALUE == mach_read_from_4(
- FSEG_MAGIC_N
- + fsp_seg_inode_page_get_nth_inode(page, i)));
- }
-
- return ULINT_UNDEFINED;
+ for (; i < FSP_SEG_INODES_PER_PAGE(physical_size); i++)
+ {
+ const byte *inode= fsp_seg_inode_page_get_nth_inode(page, i);
+ if (mach_read_from_8(FSEG_ID + inode))
+ ut_ad(!memcmp(FSEG_MAGIC_N_BYTES, FSEG_MAGIC_N + inode, 4));
+ else
+ /* This is unused */
+ return i;
+ }
+ return ULINT_UNDEFINED;
}
+MY_ATTRIBUTE((nonnull, warn_unused_result))
/** Allocate a file segment inode page.
@param[in,out] space tablespace
@param[in,out] header tablespace header
@param[in,out] mtr mini-transaction
-@return whether the allocation succeeded */
-MY_ATTRIBUTE((nonnull, warn_unused_result))
-static
-bool
-fsp_alloc_seg_inode_page(fil_space_t *space, buf_block_t *header, mtr_t *mtr)
+@return error code */
+static dberr_t fsp_alloc_seg_inode_page(fil_space_t *space,
+ buf_block_t *header, mtr_t *mtr)
{
ut_ad(header->page.id().space() == space->id);
- buf_block_t *block= fsp_alloc_free_page(space, 0, mtr, mtr);
+ dberr_t err;
+ buf_block_t *block= fsp_alloc_free_page(space, 0, mtr, mtr, &err);
if (!block)
- return false;
+ return err;
- buf_block_dbg_add_level(block, SYNC_FSP_PAGE);
- ut_ad(rw_lock_get_x_lock_count(&block->lock) == 1);
+ ut_ad(block->page.lock.not_recursive());
- mtr->write<2>(*block, block->frame + FIL_PAGE_TYPE, FIL_PAGE_INODE);
+ mtr->write<2>(*block, block->page.frame + FIL_PAGE_TYPE, FIL_PAGE_INODE);
#ifdef UNIV_DEBUG
- const byte *inode= FSEG_ID + FSEG_ARR_OFFSET + block->frame;
+ const byte *inode= FSEG_ID + FSEG_ARR_OFFSET + block->page.frame;
for (ulint i= FSP_SEG_INODES_PER_PAGE(space->physical_size()); i--;
inode += FSEG_INODE_SIZE)
ut_ad(!mach_read_from_8(inode));
#endif
- flst_add_last(header, FSP_HEADER_OFFSET + FSP_SEG_INODES_FREE,
- block, FSEG_INODE_PAGE_NODE, mtr);
- return true;
+ return flst_add_last(header, FSP_HEADER_OFFSET + FSP_SEG_INODES_FREE,
+ block, FSEG_INODE_PAGE_NODE, mtr);
}
+MY_ATTRIBUTE((nonnull, warn_unused_result))
/** Allocate a file segment inode.
@param[in,out] space tablespace
@param[in,out] header tablespace header
@param[out] iblock segment inode page
@param[in,out] mtr mini-transaction
+@param[out] err error code
@return segment inode
-@retval NULL if not enough space */
-MY_ATTRIBUTE((nonnull, warn_unused_result))
+@retval nullptr on failure */
static fseg_inode_t*
fsp_alloc_seg_inode(fil_space_t *space, buf_block_t *header,
- buf_block_t **iblock, mtr_t *mtr)
+ buf_block_t **iblock, mtr_t *mtr, dberr_t *err)
{
- buf_block_t* block;
- fseg_inode_t* inode;
-
- /* Allocate a new segment inode page if needed. */
- if (!flst_get_len(FSP_HEADER_OFFSET + FSP_SEG_INODES_FREE
- + header->frame)
- && !fsp_alloc_seg_inode_page(space, header, mtr)) {
- return(NULL);
- }
- const page_id_t page_id(
- space->id,
- flst_get_first(FSP_HEADER_OFFSET + FSP_SEG_INODES_FREE
- + header->frame).page);
-
- block = buf_page_get(page_id, space->zip_size(), RW_SX_LATCH, mtr);
- buf_block_dbg_add_level(block, SYNC_FSP_PAGE);
- if (!space->full_crc32()) {
- fil_block_check_type(*block, FIL_PAGE_INODE, mtr);
- }
+ /* Allocate a new segment inode page if needed. */
+ if (!flst_get_len(FSP_HEADER_OFFSET + FSP_SEG_INODES_FREE +
+ header->page.frame))
+ {
+ *err= fsp_alloc_seg_inode_page(space, header, mtr);
+ if (*err != DB_SUCCESS)
+ return nullptr;
+ }
- const ulint physical_size = space->physical_size();
+ const page_id_t page_id
+ {
+ space->id,
+ mach_read_from_4(FSP_HEADER_OFFSET + FSP_SEG_INODES_FREE + FLST_FIRST +
+ FIL_ADDR_PAGE + header->page.frame)
+ };
+
+ buf_block_t *block=
+ buf_page_get_gen(page_id, space->zip_size(), RW_SX_LATCH,
+ nullptr, BUF_GET_POSSIBLY_FREED, mtr, err);
+ if (!block)
+ return nullptr;
- ulint n = fsp_seg_inode_page_find_free(block->frame, 0, physical_size);
+ if (!space->full_crc32())
+ fil_block_check_type(*block, FIL_PAGE_INODE, mtr);
- ut_a(n < FSP_SEG_INODES_PER_PAGE(physical_size));
+ const ulint physical_size= space->physical_size();
+ ulint n= fsp_seg_inode_page_find_free(block->page.frame, 0, physical_size);
- inode = fsp_seg_inode_page_get_nth_inode(block->frame, n);
+ if (UNIV_UNLIKELY(n >= FSP_SEG_INODES_PER_PAGE(physical_size)))
+ {
+ *err= DB_CORRUPTION;
+ return nullptr;
+ }
+ fseg_inode_t *inode= fsp_seg_inode_page_get_nth_inode(block->page.frame, n);
- if (ULINT_UNDEFINED == fsp_seg_inode_page_find_free(block->frame,
- n + 1,
- physical_size)) {
- /* There are no other unused headers left on the page: move it
- to another list */
- flst_remove(header, FSP_HEADER_OFFSET + FSP_SEG_INODES_FREE,
- block, FSEG_INODE_PAGE_NODE, mtr);
- flst_add_last(header, FSP_HEADER_OFFSET + FSP_SEG_INODES_FULL,
- block, FSEG_INODE_PAGE_NODE, mtr);
- }
+ if (ULINT_UNDEFINED == fsp_seg_inode_page_find_free(block->page.frame, n + 1,
+ physical_size))
+ {
+ /* There are no other unused headers left on the page: move it
+ to another list */
+ *err= flst_remove(header, FSP_HEADER_OFFSET + FSP_SEG_INODES_FREE,
+ block, FSEG_INODE_PAGE_NODE, mtr);
+ if (UNIV_UNLIKELY(*err != DB_SUCCESS))
+ return nullptr;
+ *err= flst_add_last(header, FSP_HEADER_OFFSET + FSP_SEG_INODES_FULL,
+ block, FSEG_INODE_PAGE_NODE, mtr);
+ if (UNIV_UNLIKELY(*err != DB_SUCCESS))
+ return nullptr;
+ }
- ut_ad(!mach_read_from_8(inode + FSEG_ID)
- || mach_read_from_4(inode + FSEG_MAGIC_N) == FSEG_MAGIC_N_VALUE);
- *iblock = block;
- return(inode);
+ ut_ad(!mach_read_from_8(inode + FSEG_ID) ||
+ !memcmp(FSEG_MAGIC_N_BYTES, FSEG_MAGIC_N + inode, 4));
+ *iblock= block;
+ return inode;
}
+MY_ATTRIBUTE((nonnull))
/** Frees a file segment inode.
@param[in,out] space tablespace
@param[in,out] inode segment inode
@param[in,out] iblock segment inode page
@param[in,out] mtr mini-transaction */
-static void fsp_free_seg_inode(
- fil_space_t* space,
- fseg_inode_t* inode,
- buf_block_t* iblock,
- mtr_t* mtr)
+static void fsp_free_seg_inode(fil_space_t *space, fseg_inode_t *inode,
+ buf_block_t *iblock, mtr_t *mtr)
{
- ut_d(space->modify_check(*mtr));
+ ut_d(space->modify_check(*mtr));
- buf_block_t* header = fsp_get_header(space, mtr);
+ dberr_t err;
+ buf_block_t *header= fsp_get_header(space, mtr, &err);
+ if (!header)
+ return;
+ if (UNIV_UNLIKELY(memcmp(FSEG_MAGIC_N_BYTES, FSEG_MAGIC_N + inode, 4)))
+ {
+ space->set_corrupted();
+ return;
+ }
- ut_ad(mach_read_from_4(inode + FSEG_MAGIC_N) == FSEG_MAGIC_N_VALUE);
+ const ulint physical_size= space->physical_size();
- const ulint physical_size = space->physical_size();
+ if (ULINT_UNDEFINED == fsp_seg_inode_page_find_free(iblock->page.frame, 0,
+ physical_size))
+ {
+ /* Move the page to another list */
+ if (flst_remove(header, FSP_HEADER_OFFSET + FSP_SEG_INODES_FULL,
+ iblock, FSEG_INODE_PAGE_NODE, mtr) != DB_SUCCESS)
+ return;
+ if (flst_add_last(header, FSP_HEADER_OFFSET + FSP_SEG_INODES_FREE,
+ iblock, FSEG_INODE_PAGE_NODE, mtr) != DB_SUCCESS)
+ return;
+ }
- if (ULINT_UNDEFINED
- == fsp_seg_inode_page_find_free(iblock->frame, 0, physical_size)) {
- /* Move the page to another list */
- flst_remove(header, FSP_HEADER_OFFSET + FSP_SEG_INODES_FULL,
- iblock, FSEG_INODE_PAGE_NODE, mtr);
- flst_add_last(header, FSP_HEADER_OFFSET + FSP_SEG_INODES_FREE,
- iblock, FSEG_INODE_PAGE_NODE, mtr);
- }
+ mtr->memset(iblock, page_offset(inode) + FSEG_ID, FSEG_INODE_SIZE, 0);
- mtr->memset(iblock, page_offset(inode) + FSEG_ID, FSEG_INODE_SIZE, 0);
+ if (ULINT_UNDEFINED != fsp_seg_inode_page_find_used(iblock->page.frame,
+ physical_size))
+ return;
- if (ULINT_UNDEFINED
- == fsp_seg_inode_page_find_used(iblock->frame, physical_size)) {
- /* There are no other used headers left on the page: free it */
- flst_remove(header, FSP_HEADER_OFFSET + FSP_SEG_INODES_FREE,
- iblock, FSEG_INODE_PAGE_NODE, mtr);
- fsp_free_page(space, iblock->page.id().page_no(), mtr);
- }
+ /* There are no other used headers left on the page: free it */
+ if (flst_remove(header, FSP_HEADER_OFFSET + FSP_SEG_INODES_FREE,
+ iblock, FSEG_INODE_PAGE_NODE, mtr) == DB_SUCCESS)
+ fsp_free_page(space, iblock->page.id().page_no(), mtr);
}
+MY_ATTRIBUTE((nonnull(1,4,5), warn_unused_result))
/** Returns the file segment inode, page x-latched.
@param[in] header segment header
@param[in] space space id
@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
@param[in,out] mtr mini-transaction
-@param[out] block inode block, or NULL to ignore
-@return segment inode, page x-latched; NULL if the inode is free */
+@param[out] block inode block
+@param[out] err error code
+@return segment inode, page x-latched
+@retval nullptr if the inode is free or corruption was noticed */
static
fseg_inode_t*
fseg_inode_try_get(
@@ -1518,49 +1526,35 @@ fseg_inode_try_get(
ulint space,
ulint zip_size,
mtr_t* mtr,
- buf_block_t** block)
+ buf_block_t** block,
+ dberr_t* err = nullptr)
{
- fil_addr_t inode_addr;
- fseg_inode_t* inode;
-
- inode_addr.page = mach_read_from_4(header + FSEG_HDR_PAGE_NO);
- inode_addr.boffset = mach_read_from_2(header + FSEG_HDR_OFFSET);
- ut_ad(space == mach_read_from_4(header + FSEG_HDR_SPACE));
-
- inode = fut_get_ptr(space, zip_size, inode_addr, RW_SX_LATCH, mtr,
- block);
+ if (UNIV_UNLIKELY(space != mach_read_from_4(header + FSEG_HDR_SPACE)))
+ {
+ corrupted:
+ if (err)
+ *err= DB_CORRUPTION;
+ return nullptr;
+ }
- if (UNIV_UNLIKELY(!mach_read_from_8(inode + FSEG_ID))) {
+ *block=
+ buf_page_get_gen(page_id_t(space,
+ mach_read_from_4(header + FSEG_HDR_PAGE_NO)),
+ zip_size, RW_SX_LATCH, nullptr, BUF_GET_POSSIBLY_FREED,
+ mtr, err);
+ if (!*block)
+ return nullptr;
- inode = NULL;
- } else {
- ut_ad(mach_read_from_4(inode + FSEG_MAGIC_N)
- == FSEG_MAGIC_N_VALUE);
- }
+ const uint16_t offset= mach_read_from_2(header + FSEG_HDR_OFFSET);
+ if (UNIV_UNLIKELY(offset >= (*block)->physical_size()))
+ goto corrupted;
- return(inode);
-}
+ fseg_inode_t *inode= (*block)->page.frame + offset;
+ if (UNIV_UNLIKELY(!mach_read_from_8(inode + FSEG_ID) ||
+ memcmp(FSEG_MAGIC_N_BYTES, FSEG_MAGIC_N + inode, 4)))
+ goto corrupted;
-/** Returns the file segment inode, page x-latched.
-@param[in] header segment header
-@param[in] space space id
-@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
-@param[in,out] mtr mini-transaction
-@param[out] block inode block
-@return segment inode, page x-latched */
-static
-fseg_inode_t*
-fseg_inode_get(
- const fseg_header_t* header,
- ulint space,
- ulint zip_size,
- mtr_t* mtr,
- buf_block_t** block = NULL)
-{
- fseg_inode_t* inode
- = fseg_inode_try_get(header, space, zip_size, mtr, block);
- ut_a(inode);
- return(inode);
+ return inode;
}
/** Get the page number from the nth fragment page slot.
@@ -1572,7 +1566,7 @@ static uint32_t fseg_get_nth_frag_page_no(const fseg_inode_t *inode, ulint n)
{
ut_ad(inode);
ut_ad(n < FSEG_FRAG_ARR_N_SLOTS);
- ut_ad(mach_read_from_4(inode + FSEG_MAGIC_N) == FSEG_MAGIC_N_VALUE);
+ ut_ad(!memcmp(FSEG_MAGIC_N_BYTES, FSEG_MAGIC_N + inode, 4));
return(mach_read_from_4(inode + FSEG_FRAG_ARR
+ n * FSEG_FRAG_SLOT_SIZE));
}
@@ -1588,7 +1582,7 @@ inline void fseg_set_nth_frag_page_no(fseg_inode_t *inode, buf_block_t *iblock,
{
ut_ad(n < FSEG_FRAG_ARR_N_SLOTS);
ut_ad(mtr->memo_contains_flagged(iblock, MTR_MEMO_PAGE_SX_FIX));
- ut_ad(mach_read_from_4(inode + FSEG_MAGIC_N) == FSEG_MAGIC_N_VALUE);
+ ut_ad(!memcmp(FSEG_MAGIC_N_BYTES, FSEG_MAGIC_N + inode, 4));
mtr->write<4>(*iblock, inode + FSEG_FRAG_ARR + n * FSEG_FRAG_SLOT_SIZE,
page_no);
@@ -1664,13 +1658,14 @@ static ulint fseg_get_n_frag_pages(const fseg_inode_t *inode)
@param space tablespace
@param byte_offset byte offset of the created segment header
@param mtr mini-transaction
+@param err error code
@param has_done_reservation whether fsp_reserve_free_extents() was invoked
@param block block where segment header is placed,
or NULL to allocate an additional page for that
@return the block where the segment header is placed, x-latched
-@retval NULL if could not create segment because of lack of space */
+@retval nullptr if could not create segment */
buf_block_t*
-fseg_create(fil_space_t *space, ulint byte_offset, mtr_t *mtr,
+fseg_create(fil_space_t *space, ulint byte_offset, mtr_t *mtr, dberr_t *err,
bool has_done_reservation, buf_block_t *block)
{
fseg_inode_t* inode;
@@ -1685,36 +1680,30 @@ fseg_create(fil_space_t *space, ulint byte_offset, mtr_t *mtr,
ut_ad(byte_offset + FSEG_HEADER_SIZE
<= srv_page_size - FIL_PAGE_DATA_END);
- mtr_x_lock_space(space, mtr);
+ mtr->x_lock_space(space);
ut_d(space->modify_check(*mtr));
- if (block) {
- ut_ad(block->page.id().space() == space->id);
+ ut_ad(!block || block->page.id().space() == space->id);
- if (!space->full_crc32()) {
- fil_block_check_type(*block, block->page.id()
- == page_id_t(TRX_SYS_SPACE,
- TRX_SYS_PAGE_NO)
- ? FIL_PAGE_TYPE_TRX_SYS
- : FIL_PAGE_TYPE_SYS,
- mtr);
- }
+ buf_block_t* header = fsp_get_header(space, mtr, err);
+ if (!header) {
+ block = nullptr;
+ goto funct_exit;
}
- buf_block_t* header = fsp_get_header(space, mtr);
buf_block_t* iblock;
inode_alloc:
- inode = fsp_alloc_seg_inode(space, header, &iblock, mtr);
+ inode = fsp_alloc_seg_inode(space, header, &iblock, mtr, err);
- if (inode == NULL) {
+ if (!inode) {
+ block = nullptr;
reserve_extent:
if (!has_done_reservation && !reserved_extent) {
-
- if (!fsp_reserve_free_extents(
- &n_reserved, space, 2,
- FSP_NORMAL, mtr)) {
- DBUG_RETURN(NULL);
+ *err = fsp_reserve_free_extents(&n_reserved, space, 2,
+ FSP_NORMAL, mtr);
+ if (UNIV_UNLIKELY(*err != DB_SUCCESS)) {
+ DBUG_RETURN(nullptr);
}
/* Extents reserved successfully. So
@@ -1737,9 +1726,10 @@ reserve_extent:
value in space header */
seg_id = mach_read_from_8(FSP_HEADER_OFFSET + FSP_SEG_ID
- + header->frame);
+ + header->page.frame);
- mtr->write<8>(*header, FSP_HEADER_OFFSET + FSP_SEG_ID + header->frame,
+ mtr->write<8>(*header,
+ FSP_HEADER_OFFSET + FSP_SEG_ID + header->page.frame,
seg_id + 1);
mtr->write<8>(*iblock, inode + FSEG_ID, seg_id);
ut_ad(!mach_read_from_4(inode + FSEG_NOT_FULL_N_USED));
@@ -1748,10 +1738,11 @@ reserve_extent:
flst_init(*iblock, inode + FSEG_NOT_FULL, mtr);
flst_init(*iblock, inode + FSEG_FULL, mtr);
- mtr->write<4>(*iblock, inode + FSEG_MAGIC_N, FSEG_MAGIC_N_VALUE);
+ mtr->memcpy(*iblock, inode + FSEG_MAGIC_N, FSEG_MAGIC_N_BYTES, 4);
compile_time_assert(FSEG_FRAG_SLOT_SIZE == 4);
compile_time_assert(FIL_NULL == 0xffffffff);
- mtr->memset(iblock, uint16_t(inode - iblock->frame) + FSEG_FRAG_ARR,
+ mtr->memset(iblock,
+ uint16_t(inode - iblock->page.frame) + FSEG_FRAG_ARR,
FSEG_FRAG_SLOT_SIZE * FSEG_FRAG_ARR_N_SLOTS, 0xff);
if (!block) {
@@ -1761,30 +1752,30 @@ page_alloc:
#ifdef UNIV_DEBUG
has_done_reservation,
#endif /* UNIV_DEBUG */
- mtr, mtr);
+ mtr, mtr, err);
if (!block) {
ut_ad(!has_done_reservation);
goto reserve_extent;
}
- ut_d(const auto x = rw_lock_get_x_lock_count(&block->lock));
- ut_ad(x > 0);
+ ut_d(const auto x = block->page.lock.x_lock_count());
+ ut_ad(x || block->page.lock.not_recursive());
ut_ad(x == 1 || space->is_being_truncated);
ut_ad(x <= 2);
- ut_ad(!fil_page_get_type(block->frame));
- mtr->write<1>(*block, FIL_PAGE_TYPE + 1 + block->frame,
+ ut_ad(!fil_page_get_type(block->page.frame));
+ mtr->write<1>(*block, FIL_PAGE_TYPE + 1 + block->page.frame,
FIL_PAGE_TYPE_SYS);
}
mtr->write<2>(*block, byte_offset + FSEG_HDR_OFFSET
- + block->frame, page_offset(inode));
+ + block->page.frame, page_offset(inode));
mtr->write<4>(*block, byte_offset + FSEG_HDR_PAGE_NO
- + block->frame, iblock->page.id().page_no());
+ + block->page.frame, iblock->page.id().page_no());
mtr->write<4,mtr_t::MAYBE_NOP>(*block, byte_offset + FSEG_HDR_SPACE
- + block->frame, space->id);
+ + block->page.frame, space->id);
funct_exit:
if (!has_done_reservation && reserved_extent) {
@@ -1827,143 +1818,128 @@ ulint fseg_n_reserved_pages(const buf_block_t &block,
const fseg_header_t *header, ulint *used,
mtr_t *mtr)
{
- ut_ad(page_align(header) == block.frame);
- return fseg_n_reserved_pages_low(fseg_inode_get(header,
- block.page.id().space(),
- block.zip_size(), mtr),
- used);
+ ut_ad(page_align(header) == block.page.frame);
+ buf_block_t *iblock;
+ if (fseg_inode_t *inode=
+ fseg_inode_try_get(header, block.page.id().space(), block.zip_size(),
+ mtr, &iblock))
+ return fseg_n_reserved_pages_low(inode, used);
+ return *used= 0;
}
+MY_ATTRIBUTE((nonnull, warn_unused_result))
/** Tries to fill the free list of a segment with consecutive free extents.
This happens if the segment is big enough to allow extents in the free list,
the free list is empty, and the extents can be allocated consecutively from
the hint onward.
-@param[in,out] inode segment inode
+@param[in] inode segment inode
@param[in,out] iblock segment inode page
@param[in] space tablespace
@param[in] hint hint which extent would be good as the first extent
@param[in,out] mtr mini-transaction */
-static
-void
-fseg_fill_free_list(
- fseg_inode_t* inode,
- buf_block_t* iblock,
- fil_space_t* space,
- uint32_t hint,
- mtr_t* mtr)
+static dberr_t fseg_fill_free_list(const fseg_inode_t *inode,
+ buf_block_t *iblock, fil_space_t *space,
+ uint32_t hint, mtr_t *mtr)
{
- xdes_t* descr;
- ulint i;
- ib_id_t seg_id;
- ulint reserved;
- ulint used;
-
- ut_ad(inode && mtr);
- ut_ad(!((page_offset(inode) - FSEG_ARR_OFFSET) % FSEG_INODE_SIZE));
- ut_d(space->modify_check(*mtr));
-
- reserved = fseg_n_reserved_pages_low(inode, &used);
-
- if (reserved < FSEG_FREE_LIST_LIMIT * FSP_EXTENT_SIZE) {
-
- /* The segment is too small to allow extents in free list */
+ ulint used;
- return;
- }
-
- if (flst_get_len(inode + FSEG_FREE) > 0) {
- /* Free list is not empty */
-
- return;
- }
+ ut_ad(!((page_offset(inode) - FSEG_ARR_OFFSET) % FSEG_INODE_SIZE));
+ ut_d(space->modify_check(*mtr));
- for (i = 0; i < FSEG_FREE_LIST_MAX_LEN; i++) {
- buf_block_t* xdes;
- descr = xdes_get_descriptor(space, hint, &xdes, mtr);
+ if (fseg_n_reserved_pages_low(inode, &used) <
+ FSEG_FREE_LIST_LIMIT * FSP_EXTENT_SIZE)
+ /* The segment is too small to allow extents in free list */
+ return DB_SUCCESS;
- if (!descr || (XDES_FREE != xdes_get_state(descr))) {
- /* We cannot allocate the desired extent: stop */
- return;
- }
-
- descr = fsp_alloc_free_extent(space, hint, &xdes, mtr);
+ if (UNIV_UNLIKELY(memcmp(FSEG_MAGIC_N_BYTES, FSEG_MAGIC_N + inode, 4)))
+ {
+ space->set_corrupted();
+ return DB_CORRUPTION;
+ }
- xdes_set_state(*xdes, descr, XDES_FSEG, mtr);
+ if (flst_get_len(inode + FSEG_FREE) > 0)
+ /* Free list is not empty */
+ return DB_SUCCESS;
- seg_id = mach_read_from_8(inode + FSEG_ID);
- ut_ad(mach_read_from_4(inode + FSEG_MAGIC_N)
- == FSEG_MAGIC_N_VALUE);
- mtr->write<8>(*xdes, descr + XDES_ID, seg_id);
+ for (ulint i= 0; i < FSEG_FREE_LIST_MAX_LEN; i++, hint += FSP_EXTENT_SIZE)
+ {
+ buf_block_t *xdes;
+ dberr_t err;
+ xdes_t *descr= xdes_get_descriptor(space, hint, mtr, &err, &xdes);
+ if (!descr || XDES_FREE != xdes_get_state(descr))
+ /* We cannot allocate the desired extent: stop */
+ return err;
+
+ descr= fsp_alloc_free_extent(space, hint, &xdes, mtr, &err);
+ if (UNIV_UNLIKELY(!descr))
+ return err;
+
+ if (dberr_t err=
+ flst_add_last(iblock,
+ static_cast<uint16_t>(inode - iblock->page.frame +
+ FSEG_FREE), xdes,
+ static_cast<uint16_t>(descr - xdes->page.frame +
+ XDES_FLST_NODE), mtr))
+ return err;
+ xdes_set_state(*xdes, descr, XDES_FSEG, mtr);
+ mtr->memcpy(*xdes, descr + XDES_ID, inode + FSEG_ID, 8);
+ }
- flst_add_last(iblock,
- static_cast<uint16_t>(inode - iblock->frame
- + FSEG_FREE), xdes,
- static_cast<uint16_t>(descr - xdes->frame
- + XDES_FLST_NODE), mtr);
- hint += FSP_EXTENT_SIZE;
- }
+ return DB_SUCCESS;
}
+MY_ATTRIBUTE((nonnull, warn_unused_result))
/** Allocates a free extent for the segment: looks first in the free list of
the segment, then tries to allocate from the space free list.
NOTE that the extent returned still resides in the segment free list, it is
not yet taken off it!
-@param[in,out] inode segment inode
+@param[in] inode segment inode
@param[in,out] iblock segment inode page
@param[out] xdes extent descriptor page
@param[in,out] space tablespace
@param[in,out] mtr mini-transaction
-@retval NULL if no page could be allocated */
+@param[out] err error code
+@retval nullptr if no page could be allocated */
static
xdes_t*
fseg_alloc_free_extent(
- fseg_inode_t* inode,
+ const fseg_inode_t* inode,
buf_block_t* iblock,
buf_block_t** xdes,
fil_space_t* space,
- mtr_t* mtr)
+ mtr_t* mtr,
+ dberr_t* err)
{
- xdes_t* descr;
- ib_id_t seg_id;
- fil_addr_t first;
-
- ut_ad(!((page_offset(inode) - FSEG_ARR_OFFSET) % FSEG_INODE_SIZE));
- ut_ad(mach_read_from_4(inode + FSEG_MAGIC_N) == FSEG_MAGIC_N_VALUE);
- ut_d(space->modify_check(*mtr));
-
- if (flst_get_len(inode + FSEG_FREE) > 0) {
- /* Segment free list is not empty, allocate from it */
-
- first = flst_get_first(inode + FSEG_FREE);
-
- descr = xdes_lst_get_descriptor(space, first, xdes, mtr);
- } else {
- /* Segment free list was empty, allocate from space */
- descr = fsp_alloc_free_extent(space, 0, xdes, mtr);
-
- if (descr == NULL) {
-
- return(NULL);
- }
-
- seg_id = mach_read_from_8(inode + FSEG_ID);
-
- xdes_set_state(**xdes, descr, XDES_FSEG, mtr);
- mtr->write<8,mtr_t::MAYBE_NOP>(**xdes, descr + XDES_ID,
- seg_id);
- flst_add_last(iblock,
- static_cast<uint16_t>(inode - iblock->frame
- + FSEG_FREE), *xdes,
- static_cast<uint16_t>(descr - (*xdes)->frame
- + XDES_FLST_NODE), mtr);
+ ut_ad(!((page_offset(inode) - FSEG_ARR_OFFSET) % FSEG_INODE_SIZE));
+ ut_ad(!memcmp(FSEG_MAGIC_N_BYTES, FSEG_MAGIC_N + inode, 4));
+ ut_d(space->modify_check(*mtr));
- /* Try to fill the segment free list */
- fseg_fill_free_list(inode, iblock, space,
- xdes_get_offset(descr) + FSP_EXTENT_SIZE,
- mtr);
- }
+ if (flst_get_len(inode + FSEG_FREE))
+ {
+ /* Segment free list is not empty, allocate from it */
+ return xdes_lst_get_descriptor(*space, flst_get_first(inode + FSEG_FREE),
+ mtr, xdes, err);
+ }
- return(descr);
+ xdes_t* descr= fsp_alloc_free_extent(space, 0, xdes, mtr, err);
+ if (UNIV_UNLIKELY(!descr))
+ return descr;
+ xdes_set_state(**xdes, descr, XDES_FSEG, mtr);
+ mtr->memcpy<mtr_t::MAYBE_NOP>(**xdes, descr + XDES_ID, inode + FSEG_ID, 8);
+ *err= flst_add_last(iblock,
+ static_cast<uint16_t>(inode - iblock->page.frame +
+ FSEG_FREE), *xdes,
+ static_cast<uint16_t>(descr - (*xdes)->page.frame +
+ XDES_FLST_NODE), mtr);
+ if (UNIV_UNLIKELY(*err != DB_SUCCESS))
+ return nullptr;
+ /* Try to fill the segment free list */
+ *err= fseg_fill_free_list(inode, iblock, space,
+ xdes_get_offset(descr) + FSP_EXTENT_SIZE, mtr);
+ if (UNIV_UNLIKELY(*err != DB_SUCCESS))
+ return nullptr;
+
+ return descr;
}
/** Allocates a single free page from a segment.
@@ -1979,7 +1955,9 @@ direction they go alphabetically: FSP_DOWN, FSP_UP, FSP_NO_DIR
@param[in,out] mtr mini-transaction
@param[in,out] init_mtr mtr or another mini-transaction in
which the page should be initialized.
-@retval NULL if no page could be allocated */
+@param[out] err error code
+@return the allocated page
+@retval nullptr if no page could be allocated */
static
buf_block_t*
fseg_alloc_free_page_low(
@@ -1993,7 +1971,8 @@ fseg_alloc_free_page_low(
/*!< whether the space has already been reserved */
#endif /* UNIV_DEBUG */
mtr_t* mtr,
- mtr_t* init_mtr)
+ mtr_t* init_mtr,
+ dberr_t* err)
{
ib_id_t seg_id;
ulint used;
@@ -2004,11 +1983,9 @@ fseg_alloc_free_page_low(
xdes_t* ret_descr; /*!< the extent of the allocated page */
buf_block_t* xdes;
ulint n;
- const ulint space_id = space->id;
ut_ad((direction >= FSP_UP) && (direction <= FSP_NO_DIR));
- ut_ad(mach_read_from_4(seg_inode + FSEG_MAGIC_N)
- == FSEG_MAGIC_N_VALUE);
+ ut_ad(!memcmp(FSEG_MAGIC_N_BYTES, FSEG_MAGIC_N + seg_inode, 4));
ut_ad(!((page_offset(seg_inode) - FSEG_ARR_OFFSET) % FSEG_INODE_SIZE));
seg_id = mach_read_from_8(seg_inode + FSEG_ID);
@@ -2018,16 +1995,25 @@ fseg_alloc_free_page_low(
reserved = fseg_n_reserved_pages_low(seg_inode, &used);
- buf_block_t* header = fsp_get_header(space, mtr);
+ buf_block_t* header = fsp_get_header(space, mtr, err);
+ if (!header) {
+ return header;
+ }
- descr = xdes_get_descriptor_with_space_hdr(header, space, hint,
- &xdes, mtr);
- if (descr == NULL) {
+ descr = xdes_get_descriptor_with_space_hdr(header, space, hint, mtr,
+ err, &xdes);
+ if (!descr) {
+ if (*err != DB_SUCCESS) {
+ return nullptr;
+ }
/* Hint outside space or too high above free limit: reset
hint */
/* The file space header page is always allocated. */
hint = 0;
- descr = xdes_get_descriptor(space, hint, &xdes, mtr);
+ descr = xdes_get_descriptor(space, hint, mtr, err, &xdes);
+ if (!descr) {
+ return nullptr;
+ }
}
/* In the big if-else below we look for ret_page and ret_descr */
@@ -2053,30 +2039,44 @@ take_hinted_page:
=========================================================
the hinted page
===============*/
- ret_descr = fsp_alloc_free_extent(space, hint, &xdes, mtr);
+ ret_descr = fsp_alloc_free_extent(space, hint, &xdes,
+ mtr, err);
- ut_a(ret_descr == descr);
+ if (UNIV_UNLIKELY(ret_descr != descr)) {
+ if (*err == DB_SUCCESS) {
+ *err = DB_CORRUPTION;
+ }
+ return nullptr;
+ }
xdes_set_state(*xdes, ret_descr, XDES_FSEG, mtr);
mtr->write<8,mtr_t::MAYBE_NOP>(*xdes, ret_descr + XDES_ID,
seg_id);
- flst_add_last(iblock,
- static_cast<uint16_t>(seg_inode - iblock->frame
- + FSEG_FREE), xdes,
- static_cast<uint16_t>(ret_descr - xdes->frame
- + XDES_FLST_NODE), mtr);
+ *err = flst_add_last(
+ iblock,
+ static_cast<uint16_t>(seg_inode - iblock->page.frame
+ + FSEG_FREE), xdes,
+ static_cast<uint16_t>(ret_descr
+ - xdes->page.frame
+ + XDES_FLST_NODE), mtr);
+ if (UNIV_UNLIKELY(*err != DB_SUCCESS)) {
+ return nullptr;
+ }
/* Try to fill the segment free list */
- fseg_fill_free_list(seg_inode, iblock, space,
- hint + FSP_EXTENT_SIZE, mtr);
+ *err = fseg_fill_free_list(seg_inode, iblock, space,
+ hint + FSP_EXTENT_SIZE, mtr);
+ if (UNIV_UNLIKELY(*err != DB_SUCCESS)) {
+ return nullptr;
+ }
goto take_hinted_page;
/*-----------------------------------------------------------*/
} else if ((direction != FSP_NO_DIR)
&& ((reserved - used) < reserved / FSEG_FILLFACTOR)
&& (used >= FSEG_FRAG_LIMIT)
- && !!(ret_descr = fseg_alloc_free_extent(seg_inode, iblock,
- &xdes, space,
- mtr))) {
+ && (ret_descr = fseg_alloc_free_extent(seg_inode, iblock,
+ &xdes, space,
+ mtr, err))) {
/* 3. We take any free extent (which was already assigned above
===============================================================
in the if-condition to ret_descr) and take the lowest or
@@ -2090,6 +2090,8 @@ take_hinted_page:
}
ut_ad(!has_done_reservation || ret_page != FIL_NULL);
/*-----------------------------------------------------------*/
+ } else if (UNIV_UNLIKELY(*err != DB_SUCCESS)) {
+ return nullptr;
} else if ((xdes_get_state(descr) == XDES_FSEG)
&& mach_read_from_8(descr + XDES_ID) == seg_id
&& (!xdes_is_full(descr))) {
@@ -2122,7 +2124,11 @@ take_hinted_page:
return(NULL);
}
- ret_descr = xdes_lst_get_descriptor(space, first, &xdes, mtr);
+ ret_descr = xdes_lst_get_descriptor(*space, first, mtr, &xdes);
+ if (!ret_descr) {
+ return nullptr;
+ }
+
ret_page = xdes_find_free(ret_descr);
if (ret_page == FIL_NULL) {
ut_ad(!has_done_reservation);
@@ -2134,15 +2140,18 @@ take_hinted_page:
/* 6. We allocate an individual page from the space
===================================================*/
buf_block_t* block = fsp_alloc_free_page(
- space, hint, mtr, init_mtr);
+ space, hint, mtr, init_mtr, err);
- ut_ad(!has_done_reservation || block);
+ ut_ad(block || !has_done_reservation || *err);
if (block) {
/* Put the page in the fragment page array of the
segment */
n = fseg_find_free_frag_page_slot(seg_inode);
- ut_a(n != ULINT_UNDEFINED);
+ if (UNIV_UNLIKELY(n == ULINT_UNDEFINED)) {
+ *err = DB_CORRUPTION;
+ return nullptr;
+ }
fseg_set_nth_frag_page_no(
seg_inode, iblock, n,
@@ -2157,14 +2166,13 @@ take_hinted_page:
/* 7. We allocate a new extent and take its first page
======================================================*/
ret_descr = fseg_alloc_free_extent(seg_inode, iblock, &xdes,
- space, mtr);
+ space, mtr, err);
- if (ret_descr == NULL) {
- ret_page = FIL_NULL;
- ut_ad(!has_done_reservation);
+ if (!ret_descr) {
+ ut_ad(!has_done_reservation || *err);
+ return nullptr;
} else {
ret_page = xdes_get_offset(ret_descr);
- ut_ad(!has_done_reservation || ret_page != FIL_NULL);
}
}
@@ -2175,16 +2183,17 @@ take_hinted_page:
return(NULL);
}
- if (space->size <= ret_page && !is_predefined_tablespace(space_id)) {
+ if (space->size <= ret_page && !is_predefined_tablespace(space->id)) {
/* It must be that we are extending a single-table
tablespace whose size is still < 64 pages */
if (ret_page >= FSP_EXTENT_SIZE) {
- ib::error() << "Trying to extend '"
- << space->chain.start->name
- << "' by single page(s) though the"
- << " space size " << space->size
- << ". Page no " << ret_page << ".";
+ sql_print_error("InnoDB: Trying to extend '%s'"
+ " by single page(s) though the"
+ " space size " UINT32PF "."
+ " Page no " UINT32PF ".",
+ space->chain.start->name, space->size,
+ ret_page);
ut_ad(!has_done_reservation);
return(NULL);
}
@@ -2206,13 +2215,16 @@ got_hinted_page:
or FSEG_FREE), and the page is not yet marked as used. */
ut_d(buf_block_t* xxdes);
- ut_ad(xdes_get_descriptor(space, ret_page, &xxdes, mtr)
+ ut_ad(xdes_get_descriptor(space, ret_page, mtr, err, &xxdes)
== ret_descr);
ut_ad(xdes == xxdes);
ut_ad(xdes_is_free(ret_descr, ret_page % FSP_EXTENT_SIZE));
- fseg_mark_page_used(seg_inode, iblock, ret_page, ret_descr,
- xdes, mtr);
+ *err = fseg_mark_page_used(seg_inode, iblock, ret_page,
+ ret_descr, xdes, mtr);
+ if (UNIV_UNLIKELY(*err != DB_SUCCESS)) {
+ return nullptr;
+ }
}
return fsp_page_create(space, ret_page, init_mtr);
@@ -2240,8 +2252,9 @@ fseg_alloc_free_page_general(
is no need to do the check for this individual
page */
mtr_t* mtr, /*!< in/out: mini-transaction */
- mtr_t* init_mtr)/*!< in/out: mtr or another mini-transaction
+ mtr_t* init_mtr,/*!< in/out: mtr or another mini-transaction
in which the page should be initialized. */
+ dberr_t* err) /*!< out: error code */
{
fseg_inode_t* inode;
ulint space_id;
@@ -2251,17 +2264,22 @@ fseg_alloc_free_page_general(
uint32_t n_reserved;
space_id = page_get_space_id(page_align(seg_header));
- space = mtr_x_lock_space(space_id, mtr);
- inode = fseg_inode_get(seg_header, space_id, space->zip_size(),
- mtr, &iblock);
+ space = mtr->x_lock_space(space_id);
+ inode = fseg_inode_try_get(seg_header, space_id, space->zip_size(),
+ mtr, &iblock, err);
+ if (!inode) {
+ return nullptr;
+ }
if (!space->full_crc32()) {
fil_block_check_type(*iblock, FIL_PAGE_INODE, mtr);
}
- if (!has_done_reservation
- && !fsp_reserve_free_extents(&n_reserved, space, 2,
- FSP_NORMAL, mtr)) {
- return(NULL);
+ if (!has_done_reservation) {
+ *err = fsp_reserve_free_extents(&n_reserved, space, 2,
+ FSP_NORMAL, mtr);
+ if (*err != DB_SUCCESS) {
+ return nullptr;
+ }
}
block = fseg_alloc_free_page_low(space,
@@ -2269,11 +2287,11 @@ fseg_alloc_free_page_general(
#ifdef UNIV_DEBUG
has_done_reservation,
#endif /* UNIV_DEBUG */
- mtr, init_mtr);
+ mtr, init_mtr, err);
/* The allocation cannot fail if we have already reserved a
space for the page. */
- ut_ad(!has_done_reservation || block != NULL);
+ ut_ad(block || !has_done_reservation || *err);
if (!has_done_reservation) {
space->release_free_extents(n_reserved);
@@ -2282,6 +2300,7 @@ fseg_alloc_free_page_general(
return(block);
}
+MY_ATTRIBUTE((nonnull, warn_unused_result))
/** Check that we have at least n_pages frag pages free in the first extent
of a single-table tablespace, and they are also physically initialized to
the data file. That is we have already extended the data file so that those
@@ -2292,10 +2311,9 @@ with pages.
@param[in] size tablespace size in pages, less than FSP_EXTENT_SIZE
@param[in,out] mtr mini-transaction
@param[in] n_pages number of pages to reserve
-@return true if there were at least n_pages free pages, or we were able
-to extend */
+@return error code */
static
-bool
+dberr_t
fsp_reserve_free_pages(
fil_space_t* space,
buf_block_t* header,
@@ -2303,21 +2321,23 @@ fsp_reserve_free_pages(
mtr_t* mtr,
uint32_t n_pages)
{
- xdes_t* descr;
-
- ut_a(!is_system_tablespace(space->id));
- ut_a(size < FSP_EXTENT_SIZE);
-
- buf_block_t* xdes;
- descr = xdes_get_descriptor_with_space_hdr(header, space, 0, &xdes,
- mtr);
- uint32_t n_used = xdes_get_n_used(descr);
-
- ut_a(n_used <= size);
-
- return(size >= n_used + n_pages
- || fsp_try_extend_data_file_with_pages(
- space, n_used + n_pages - 1, header, mtr));
+ ut_ad(space != fil_system.sys_space && space != fil_system.temp_space);
+ ut_ad(size < FSP_EXTENT_SIZE);
+
+ dberr_t err= DB_OUT_OF_FILE_SPACE;
+ const xdes_t *descr=
+ xdes_get_descriptor_with_space_hdr(header, space, 0, mtr, &err);
+ if (!descr)
+ return err;
+ const uint32_t n_used= xdes_get_n_used(descr);
+ if (size >= n_used + n_pages)
+ return DB_SUCCESS;
+ if (n_used > size)
+ return DB_CORRUPTION;
+ return fsp_try_extend_data_file_with_pages(space, n_used + n_pages - 1,
+ header, mtr)
+ ? DB_SUCCESS
+ : DB_OUT_OF_FILE_SPACE;
}
/** Reserves free pages from a tablespace. All mini-transactions which may
@@ -2357,8 +2377,9 @@ free pages available.
@param[in] n_pages for small tablespaces (tablespace size is
less than FSP_EXTENT_SIZE), number of free
pages to reserve.
-@return true if we were able to make the reservation */
-bool
+@return error code
+@retval DB_SUCCESS if we were able to make the reservation */
+dberr_t
fsp_reserve_free_extents(
uint32_t* n_reserved,
fil_space_t* space,
@@ -2374,29 +2395,33 @@ fsp_reserve_free_extents(
const uint32_t extent_size = FSP_EXTENT_SIZE;
- mtr_x_lock_space(space, mtr);
+ mtr->x_lock_space(space);
const unsigned physical_size = space->physical_size();
- buf_block_t* header = fsp_get_header(space, mtr);
+ dberr_t err;
+ buf_block_t* header = fsp_get_header(space, mtr, &err);
+ if (!header) {
+ return err;
+ }
try_again:
uint32_t size = mach_read_from_4(FSP_HEADER_OFFSET + FSP_SIZE
- + header->frame);
+ + header->page.frame);
ut_ad(size == space->size_in_header);
if (size < extent_size && n_pages < extent_size / 2) {
/* Use different rules for small single-table tablespaces */
*n_reserved = 0;
- return(fsp_reserve_free_pages(space, header, size,
- mtr, n_pages));
+ return fsp_reserve_free_pages(space, header, size,
+ mtr, n_pages);
}
uint32_t n_free_list_ext = flst_get_len(FSP_HEADER_OFFSET + FSP_FREE
- + header->frame);
+ + header->page.frame);
ut_ad(space->free_len == n_free_list_ext);
uint32_t free_limit = mach_read_from_4(FSP_HEADER_OFFSET
+ FSP_FREE_LIMIT
- + header->frame);
+ + header->page.frame);
ut_ad(space->free_limit == free_limit);
/* Below we play safe when counting free extents above the free limit:
@@ -2450,54 +2475,62 @@ try_again:
}
if (space->reserve_free_extents(n_free, n_ext)) {
- return(true);
+ return DB_SUCCESS;
}
try_to_extend:
if (fsp_try_extend_data_file(space, header, mtr)) {
goto try_again;
}
- return(false);
+ return DB_OUT_OF_FILE_SPACE;
}
+MY_ATTRIBUTE((nonnull, warn_unused_result))
/** Frees a single page of a segment.
@param[in] seg_inode segment inode
@param[in,out] space tablespace
@param[in] offset page number
-@param[in,out] mtr mini-transaction */
+@param[in,out] mtr mini-transaction
+@param[in] ahi Drop adaptive hash index
+@return error code */
static
-void
+dberr_t
fseg_free_page_low(
fseg_inode_t* seg_inode,
buf_block_t* iblock,
fil_space_t* space,
page_no_t offset,
- mtr_t* mtr)
+ mtr_t* mtr
+#ifdef BTR_CUR_HASH_ADAPT
+ ,bool ahi=false
+#endif /* BTR_CUR_HASH_ADAPT */
+ )
{
- ib_id_t descr_id;
- ib_id_t seg_id;
-
- ut_ad(seg_inode != NULL);
- ut_ad(mtr != NULL);
- ut_ad(mach_read_from_4(seg_inode + FSEG_MAGIC_N)
- == FSEG_MAGIC_N_VALUE);
+ ut_ad(!memcmp(FSEG_MAGIC_N_BYTES, FSEG_MAGIC_N + seg_inode, 4));
ut_ad(!((page_offset(seg_inode) - FSEG_ARR_OFFSET) % FSEG_INODE_SIZE));
- ut_ad(iblock->frame == page_align(seg_inode));
+ ut_ad(iblock->page.frame == page_align(seg_inode));
ut_d(space->modify_check(*mtr));
+#ifdef BTR_CUR_HASH_ADAPT
+ if (ahi) {
+ btr_search_drop_page_hash_when_freed(
+ page_id_t(space->id, offset));
+ }
+#endif /* BTR_CUR_HASH_ADAPT */
+
const uint32_t extent_size = FSP_EXTENT_SIZE;
ut_ad(ut_is_2pow(extent_size));
buf_block_t* xdes;
- xdes_t* descr = xdes_get_descriptor(space, offset, &xdes, mtr);
+ dberr_t err;
+ xdes_t* descr = xdes_get_descriptor(space, offset, mtr, &err, &xdes);
- if (xdes_is_free(descr, offset & (extent_size - 1))) {
- ib::fatal() << "InnoDB is trying to free page "
- << page_id_t(space->id, offset)
- << " though it is already marked as free in the"
- " tablespace! The tablespace free space info is"
- " corrupt. You may need to dump your tables and"
- " recreate the whole database!"
- << FORCE_RECOVERY_MSG;
+ if (!descr) {
+ return err;
+ }
+ if (UNIV_UNLIKELY(xdes_is_free(descr, offset & (extent_size - 1)))) {
+corrupted:
+ space->set_corrupted();
+ return DB_CORRUPTION;
}
if (xdes_get_state(descr) != XDES_FSEG) {
@@ -2509,240 +2542,310 @@ fseg_free_page_low(
}
compile_time_assert(FIL_NULL == 0xffffffff);
- mtr->memset(iblock, uint16_t(seg_inode - iblock->frame)
+ mtr->memset(iblock, uint16_t(seg_inode
+ - iblock->page.frame)
+ FSEG_FRAG_ARR
+ i * FSEG_FRAG_SLOT_SIZE, 4, 0xff);
break;
}
- fsp_free_page(space, offset, mtr);
- return;
+ return fsp_free_page(space, offset, mtr);
}
/* If we get here, the page is in some extent of the segment */
- descr_id = mach_read_from_8(descr + XDES_ID);
- seg_id = mach_read_from_8(seg_inode + FSEG_ID);
-
- if (UNIV_UNLIKELY(descr_id != seg_id)) {
- fputs("InnoDB: Dump of the tablespace extent descriptor: ",
- stderr);
- ut_print_buf(stderr, descr, 40);
- fputs("\nInnoDB: Dump of the segment inode: ", stderr);
- ut_print_buf(stderr, seg_inode, 40);
- putc('\n', stderr);
-
- ib::fatal() << "InnoDB is trying to free page "
- << page_id_t(space->id, offset)
- << ", which does not belong to segment " << descr_id
- << " but belongs to segment " << seg_id << "."
- << FORCE_RECOVERY_MSG;
+ if (UNIV_UNLIKELY(memcmp(descr + XDES_ID, seg_inode + FSEG_ID, 8))) {
+ goto corrupted;
}
byte* p_not_full = seg_inode + FSEG_NOT_FULL_N_USED;
uint32_t not_full_n_used = mach_read_from_4(p_not_full);
- const uint16_t xoffset= uint16_t(descr - xdes->frame + XDES_FLST_NODE);
- const uint16_t ioffset= uint16_t(seg_inode - iblock->frame);
+ const uint16_t xoffset= uint16_t(descr - xdes->page.frame
+ + XDES_FLST_NODE);
+ const uint16_t ioffset= uint16_t(seg_inode - iblock->page.frame);
if (xdes_is_full(descr)) {
/* The fragment is full: move it to another list */
- flst_remove(iblock, static_cast<uint16_t>(FSEG_FULL + ioffset),
- xdes, xoffset, mtr);
- flst_add_last(iblock, static_cast<uint16_t>(FSEG_NOT_FULL
- + ioffset),
- xdes, xoffset, mtr);
+ err = flst_remove(iblock,
+ static_cast<uint16_t>(FSEG_FULL + ioffset),
+ xdes, xoffset, mtr);
+ if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
+ return err;
+ }
+ err = flst_add_last(iblock, static_cast<uint16_t>(FSEG_NOT_FULL
+ + ioffset),
+ xdes, xoffset, mtr);
+ if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
+ return err;
+ }
not_full_n_used += extent_size - 1;
} else {
- ut_a(not_full_n_used > 0);
+ if (!not_full_n_used) {
+ goto corrupted;
+ }
not_full_n_used--;
}
mtr->write<4>(*iblock, p_not_full, not_full_n_used);
-
- const ulint bit = offset & (extent_size - 1);
-
- xdes_set_free<true>(*xdes, descr, bit, mtr);
+ xdes_set_free<true>(*xdes, descr, offset & (extent_size - 1), mtr);
if (!xdes_get_n_used(descr)) {
- /* The extent has become free: free it to space */
- flst_remove(iblock, static_cast<uint16_t>(FSEG_NOT_FULL
- + ioffset),
- xdes, xoffset, mtr);
- fsp_free_extent(space, offset, mtr);
+ err = flst_remove(iblock, static_cast<uint16_t>(FSEG_NOT_FULL
+ + ioffset),
+ xdes, xoffset, mtr);
+ if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
+ return err;
+ }
+ err = fsp_free_extent(space, offset, mtr);
+ if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
+ return err;
+ }
}
mtr->free(*space, static_cast<uint32_t>(offset));
+ return DB_SUCCESS;
}
/** Free a page in a file segment.
@param[in,out] seg_header file segment header
@param[in,out] space tablespace
@param[in] offset page number
-@param[in,out] mtr mini-transaction */
-void
-fseg_free_page(
- fseg_header_t* seg_header,
- fil_space_t* space,
- uint32_t offset,
- mtr_t* mtr)
+@param[in,out] mtr mini-transaction
+@param[in] have_latch whether space->x_lock() was already called
+@return error code */
+dberr_t fseg_free_page(fseg_header_t *seg_header, fil_space_t *space,
+ uint32_t offset, mtr_t *mtr, bool have_latch)
{
- DBUG_ENTER("fseg_free_page");
- fseg_inode_t* seg_inode;
- buf_block_t* iblock;
- mtr_x_lock_space(space, mtr);
-
- DBUG_LOG("fseg_free_page", "space_id: " << space->id
- << ", page_no: " << offset);
-
- seg_inode = fseg_inode_get(seg_header, space->id, space->zip_size(),
- mtr,
- &iblock);
- if (!space->full_crc32()) {
- fil_block_check_type(*iblock, FIL_PAGE_INODE, mtr);
- }
-
- fseg_free_page_low(seg_inode, iblock, space, offset, mtr);
+ buf_block_t *iblock;
+ if (have_latch)
+ ut_ad(space->is_owner());
+ else
+ mtr->x_lock_space(space);
+
+ DBUG_PRINT("fseg_free_page",
+ ("space_id: " ULINTPF ", page_no: %u", space->id, offset));
+
+ dberr_t err;
+ if (fseg_inode_t *seg_inode= fseg_inode_try_get(seg_header,
+ space->id, space->zip_size(),
+ mtr, &iblock, &err))
+ {
+ if (!space->full_crc32())
+ fil_block_check_type(*iblock, FIL_PAGE_INODE, mtr);
+ return fseg_free_page_low(seg_inode, iblock, space, offset, mtr);
+ }
- DBUG_VOID_RETURN;
+ return err;
}
-/** Determine whether a page is free.
-@param[in,out] space tablespace
-@param[in] page page number
-@return whether the page is marked as free */
-bool
-fseg_page_is_free(fil_space_t* space, unsigned page)
+/** Determine whether a page is allocated.
+@param space tablespace
+@param page page number
+@return error code
+@retval DB_SUCCESS if the page is marked as free
+@retval DB_SUCCESS_LOCKED_REC if the page is marked as allocated */
+dberr_t fseg_page_is_allocated(fil_space_t *space, unsigned page)
{
- bool is_free;
- mtr_t mtr;
- page_no_t dpage = xdes_calc_descriptor_page(space->zip_size(),
- page);
-
- mtr.start();
- mtr_sx_lock_space(space, &mtr);
-
- if (page >= space->free_limit || page >= space->size_in_header) {
- is_free = true;
- } else if (const xdes_t* descr = xdes_get_descriptor_const(
- space, dpage, page, &mtr)) {
- is_free = xdes_is_free(descr, page % FSP_EXTENT_SIZE);
- } else {
- is_free = true;
- }
- mtr.commit();
+ mtr_t mtr;
+ uint32_t dpage= xdes_calc_descriptor_page(space->zip_size(), page);
+ const unsigned zip_size= space->zip_size();
+ dberr_t err= DB_SUCCESS;
+
+ mtr.start();
+ if (!space->is_owner())
+ mtr.x_lock_space(space);
+
+ if (page >= space->free_limit || page >= space->size_in_header);
+ else if (const buf_block_t *b=
+ buf_page_get_gen(page_id_t(space->id, dpage), space->zip_size(),
+ RW_S_LATCH, nullptr, BUF_GET_POSSIBLY_FREED,
+ &mtr, &err))
+ {
+ if (!dpage &&
+ (space->free_limit !=
+ mach_read_from_4(FSP_FREE_LIMIT + FSP_HEADER_OFFSET +
+ b->page.frame) ||
+ space->size_in_header !=
+ mach_read_from_4(FSP_SIZE + FSP_HEADER_OFFSET + b->page.frame)))
+ err= DB_CORRUPTION;
+ else
+ err= xdes_is_free(b->page.frame + XDES_ARR_OFFSET + XDES_SIZE
+ * xdes_calc_descriptor_index(zip_size, page),
+ page & (FSP_EXTENT_SIZE - 1))
+ ? DB_SUCCESS
+ : DB_SUCCESS_LOCKED_REC;
+ }
- return(is_free);
+ mtr.commit();
+ return err;
}
+MY_ATTRIBUTE((nonnull, warn_unused_result))
/** Free an extent of a segment to the space free list.
@param[in,out] seg_inode segment inode
@param[in,out] space tablespace
@param[in] page page number in the extent
-@param[in,out] mtr mini-transaction */
-MY_ATTRIBUTE((nonnull))
+@param[in,out] mtr mini-transaction
+@return error code */
static
-void
+dberr_t
fseg_free_extent(
fseg_inode_t* seg_inode,
buf_block_t* iblock,
fil_space_t* space,
uint32_t page,
- mtr_t* mtr)
+ mtr_t* mtr
+#ifdef BTR_CUR_HASH_ADAPT
+ ,bool ahi=false
+#endif /* BTR_CUR_HASH_ADAPT */
+ )
{
-
- ut_ad(mtr != NULL);
-
buf_block_t* xdes;
- xdes_t* descr = xdes_get_descriptor(space, page, &xdes, mtr);
+ dberr_t err;
+ xdes_t* descr = xdes_get_descriptor(space, page, mtr, &err, &xdes);
+
+ if (!descr) {
+ return err;
+ }
- ut_a(xdes_get_state(descr) == XDES_FSEG);
- ut_a(!memcmp(descr + XDES_ID, seg_inode + FSEG_ID, 8));
- ut_ad(mach_read_from_4(seg_inode + FSEG_MAGIC_N)
- == FSEG_MAGIC_N_VALUE);
+ if (UNIV_UNLIKELY(xdes_get_state(descr) != XDES_FSEG
+ || memcmp(descr + XDES_ID, seg_inode + FSEG_ID, 8)
+ || memcmp(FSEG_MAGIC_N_BYTES, FSEG_MAGIC_N
+ + seg_inode, 4))) {
+ return DB_CORRUPTION;
+ }
ut_d(space->modify_check(*mtr));
const uint32_t first_page_in_extent = page - (page % FSP_EXTENT_SIZE);
- const uint16_t xoffset= uint16_t(descr - xdes->frame + XDES_FLST_NODE);
- const uint16_t ioffset= uint16_t(seg_inode - iblock->frame);
+ const uint16_t xoffset= uint16_t(descr - xdes->page.frame
+ + XDES_FLST_NODE);
+ const uint16_t ioffset= uint16_t(seg_inode - iblock->page.frame);
+
+#ifdef BTR_CUR_HASH_ADAPT
+ if (ahi) {
+ for (uint32_t i = 0; i < FSP_EXTENT_SIZE; i++) {
+ if (!xdes_is_free(descr, i)) {
+ /* Drop search system page hash index
+ if the page is found in the pool and
+ is hashed */
+ btr_search_drop_page_hash_when_freed(
+ page_id_t(space->id,
+ first_page_in_extent + i));
+ }
+ }
+ }
+#endif /* BTR_CUR_HASH_ADAPT */
+
+ uint16_t lst;
if (xdes_is_full(descr)) {
- flst_remove(iblock, static_cast<uint16_t>(FSEG_FULL + ioffset),
- xdes, xoffset, mtr);
+ lst = static_cast<uint16_t>(FSEG_FULL + ioffset);
+remove:
+ err = flst_remove(iblock, lst, xdes, xoffset, mtr);
+ if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
+ return err;
+ }
} else if (!xdes_get_n_used(descr)) {
- flst_remove(iblock, static_cast<uint16_t>(FSEG_FREE + ioffset),
- xdes, xoffset, mtr);
+ lst = static_cast<uint16_t>(FSEG_FREE + ioffset);
+ goto remove;
} else {
- flst_remove(iblock, static_cast<uint16_t>(FSEG_NOT_FULL
- + ioffset),
- xdes, xoffset, mtr);
+ err = flst_remove(
+ iblock, static_cast<uint16_t>(FSEG_NOT_FULL + ioffset),
+ xdes, xoffset, mtr);
+ if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
+ return err;
+ }
uint32_t not_full_n_used = mach_read_from_4(
FSEG_NOT_FULL_N_USED + seg_inode);
uint32_t descr_n_used = xdes_get_n_used(descr);
- ut_a(not_full_n_used >= descr_n_used);
+ if (not_full_n_used < descr_n_used) {
+ return DB_CORRUPTION;
+ }
mtr->write<4>(*iblock, seg_inode + FSEG_NOT_FULL_N_USED,
not_full_n_used - descr_n_used);
}
- fsp_free_extent(space, page, mtr);
+ err = fsp_free_extent(space, page, mtr);
+ if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
+ return err;
+ }
for (uint32_t i = 0; i < FSP_EXTENT_SIZE; i++) {
if (!xdes_is_free(descr, i)) {
- buf_page_free(space, first_page_in_extent + i, mtr,
- __FILE__, __LINE__);
+ buf_page_free(space, first_page_in_extent + i, mtr);
}
}
+
+ return DB_SUCCESS;
}
-/**********************************************************************//**
-Frees part of a segment. This function can be used to free a segment by
-repeatedly calling this function in different mini-transactions. Doing
-the freeing in a single mini-transaction might result in too big a
-mini-transaction.
+/** Frees part of a segment. This function can be used to free
+a segment by repeatedly calling this function in different
+mini-transactions. Doing the freeing in a single mini-transaction
+might result in too big a mini-transaction.
+@param header segment header; NOTE: if the header resides on first
+ page of the frag list of the segment, this pointer
+ becomes obsolete after the last freeing step
+@param mtr mini-transaction
+@param ahi Drop the adaptive hash index
@return whether the freeing was completed */
bool
fseg_free_step(
- fseg_header_t* header, /*!< in, own: segment header; NOTE: if the header
- resides on the first page of the frag list
- of the segment, this pointer becomes obsolete
- after the last freeing step */
- mtr_t* mtr) /*!< in/out: mini-transaction */
+ fseg_header_t* header,
+ mtr_t* mtr
+#ifdef BTR_CUR_HASH_ADAPT
+ ,bool ahi
+#endif /* BTR_CUR_HASH_ADAPT */
+ )
{
ulint n;
fseg_inode_t* inode;
- DBUG_ENTER("fseg_free_step");
-
const uint32_t space_id = page_get_space_id(page_align(header));
const uint32_t header_page = page_get_page_no(page_align(header));
- fil_space_t* space = mtr_x_lock_space(space_id, mtr);
- buf_block_t* xdes;
- xdes_t* descr = xdes_get_descriptor(space, header_page, &xdes, mtr);
+ fil_space_t* space = mtr->x_lock_space(space_id);
+ xdes_t* descr = xdes_get_descriptor(space, header_page, mtr);
+
+ if (!descr) {
+ return true;
+ }
/* Check that the header resides on a page which has not been
freed yet */
- ut_a(!xdes_is_free(descr, header_page % FSP_EXTENT_SIZE));
+ if (UNIV_UNLIKELY(xdes_is_free(descr,
+ header_page & (FSP_EXTENT_SIZE - 1)))) {
+ /* Some corruption was detected: stop the freeing
+ in order to prevent a crash. */
+ return true;
+ }
buf_block_t* iblock;
const ulint zip_size = space->zip_size();
inode = fseg_inode_try_get(header, space_id, zip_size, mtr, &iblock);
-
- if (inode == NULL) {
- ib::info() << "Double free of inode from "
- << page_id_t(space_id, header_page);
- DBUG_RETURN(true);
+ if (!inode || space->is_stopping()) {
+ return true;
}
if (!space->full_crc32()) {
fil_block_check_type(*iblock, FIL_PAGE_INODE, mtr);
}
- descr = fseg_get_first_extent(inode, space, mtr);
- if (descr != NULL) {
+ dberr_t err;
+ descr = fseg_get_first_extent(inode, space, mtr, &err);
+
+ if (descr) {
/* Free the extent held by the segment */
- fseg_free_extent(inode, iblock, space, xdes_get_offset(descr),
- mtr);
- DBUG_RETURN(false);
+ return fseg_free_extent(inode, iblock, space,
+ xdes_get_offset(descr), mtr
+#ifdef BTR_CUR_HASH_ADAPT
+ , ahi
+#endif /* BTR_CUR_HASH_ADAPT */
+ ) != DB_SUCCESS;
+ }
+
+ if (err != DB_SUCCESS || space->is_stopping()) {
+ return true;
}
/* Free a frag page */
@@ -2751,15 +2854,20 @@ fseg_free_step(
if (n == ULINT_UNDEFINED) {
/* Freeing completed: free the segment inode */
fsp_free_seg_inode(space, inode, iblock, mtr);
-
- DBUG_RETURN(true);
+ return true;
}
page_no_t page_no = fseg_get_nth_frag_page_no(inode, n);
- fseg_free_page_low(inode, iblock, space, page_no, mtr);
+ if (fseg_free_page_low(inode, iblock, space, page_no, mtr
+#ifdef BTR_CUR_HASH_ADAPT
+ , ahi
+#endif /* BTR_CUR_HASH_ADAPT */
+ ) != DB_SUCCESS) {
+ return true;
+ }
- buf_page_free(space, page_no, mtr, __FILE__, __LINE__);
+ buf_page_free(space, page_no, mtr);
n = fseg_find_last_used_frag_page_slot(inode);
@@ -2767,52 +2875,67 @@ fseg_free_step(
/* Freeing completed: free the segment inode */
fsp_free_seg_inode(space, inode, iblock, mtr);
- DBUG_RETURN(true);
+ return true;
}
- DBUG_RETURN(false);
+ return false;
}
-/**********************************************************************//**
-Frees part of a segment. Differs from fseg_free_step because this function
-leaves the header page unfreed.
-@return whether the freeing was completed, except for the header page */
bool
fseg_free_step_not_header(
- fseg_header_t* header, /*!< in: segment header which must reside on
- the first fragment page of the segment */
- mtr_t* mtr) /*!< in/out: mini-transaction */
+ fseg_header_t* header,
+ mtr_t* mtr
+#ifdef BTR_CUR_HASH_ADAPT
+ ,bool ahi
+#endif /* BTR_CUR_HASH_ADAPT */
+ )
{
- ulint n;
- xdes_t* descr;
fseg_inode_t* inode;
const uint32_t space_id = page_get_space_id(page_align(header));
ut_ad(mtr->is_named_space(space_id));
- fil_space_t* space = mtr_x_lock_space(space_id, mtr);
+ fil_space_t* space = mtr->x_lock_space(space_id);
buf_block_t* iblock;
- inode = fseg_inode_get(header, space_id, space->zip_size(), mtr,
- &iblock);
+ inode = fseg_inode_try_get(header, space_id, space->zip_size(),
+ mtr, &iblock);
+ if (space->is_stopping()) {
+ return true;
+ }
+
+ if (!inode) {
+ ib::warn() << "Double free of "
+ << page_id_t(space_id,
+ page_get_page_no(page_align(header)));
+ return true;
+ }
+
if (!space->full_crc32()) {
fil_block_check_type(*iblock, FIL_PAGE_INODE, mtr);
}
- descr = fseg_get_first_extent(inode, space, mtr);
-
- if (descr != NULL) {
+ dberr_t err;
+ if (xdes_t* descr = fseg_get_first_extent(inode, space, mtr, &err)) {
/* Free the extent held by the segment */
- fseg_free_extent(inode, iblock, space, xdes_get_offset(descr),
- mtr);
- return false;
+ return fseg_free_extent(inode, iblock, space,
+ xdes_get_offset(descr),
+ mtr
+#ifdef BTR_CUR_HASH_ADAPT
+ , ahi
+#endif /* BTR_CUR_HASH_ADAPT */
+ ) != DB_SUCCESS;
+ } else if (err != DB_SUCCESS) {
+ return true;
}
/* Free a frag page */
- n = fseg_find_last_used_frag_page_slot(inode);
+ ulint n = fseg_find_last_used_frag_page_slot(inode);
- ut_a(n != ULINT_UNDEFINED);
+ if (UNIV_UNLIKELY(n == ULINT_UNDEFINED)) {
+ return true;
+ }
uint32_t page_no = fseg_get_nth_frag_page_no(inode, n);
@@ -2820,8 +2943,14 @@ fseg_free_step_not_header(
return true;
}
- fseg_free_page_low(inode, iblock, space, page_no, mtr);
- buf_page_free(space, page_no, mtr, __FILE__, __LINE__);
+ if (fseg_free_page_low(inode, iblock, space, page_no, mtr
+#ifdef BTR_CUR_HASH_ADAPT
+ , ahi
+#endif /* BTR_CUR_HASH_ADAPT */
+ ) != DB_SUCCESS) {
+ return true;
+ }
+ buf_page_free(space, page_no, mtr);
return false;
}
@@ -2831,36 +2960,43 @@ FSEG_FULL -> FSEG_NOT_FULL -> FSEG_FREE.
@param[in] inode segment inode
@param[in] space tablespace
@param[in,out] mtr mini-transaction
-@return the first extent descriptor, or NULL if none */
+@return the first extent descriptor
+@retval nullptr if none, or on corruption */
MY_ATTRIBUTE((nonnull, warn_unused_result))
static
xdes_t*
fseg_get_first_extent(
fseg_inode_t* inode,
const fil_space_t* space,
- mtr_t* mtr)
+ mtr_t* mtr,
+ dberr_t* err)
{
- fil_addr_t first;
-
- ut_ad(space->id == page_get_space_id(page_align(inode)));
- ut_ad(mach_read_from_4(inode + FSEG_MAGIC_N) == FSEG_MAGIC_N_VALUE);
+ if (UNIV_UNLIKELY(space->id != page_get_space_id(page_align(inode)) ||
+ memcmp(inode + FSEG_MAGIC_N, FSEG_MAGIC_N_BYTES, 4)))
+ {
+ corrupted:
+ *err= DB_CORRUPTION;
+ return nullptr;
+ }
- if (flst_get_len(inode + FSEG_FULL) > 0) {
- first = flst_get_first(inode + FSEG_FULL);
- } else if (flst_get_len(inode + FSEG_NOT_FULL) > 0) {
- first = flst_get_first(inode + FSEG_NOT_FULL);
- } else if (flst_get_len(inode + FSEG_FREE) > 0) {
- first = flst_get_first(inode + FSEG_FREE);
- } else {
- return(NULL);
- }
+ fil_addr_t first;
- DBUG_ASSERT(first.page != FIL_NULL);
+ if (flst_get_len(inode + FSEG_FULL))
+ first= flst_get_first(inode + FSEG_FULL);
+ else if (flst_get_len(inode + FSEG_NOT_FULL))
+ first= flst_get_first(inode + FSEG_NOT_FULL);
+ else if (flst_get_len(inode + FSEG_FREE))
+ first= flst_get_first(inode + FSEG_FREE);
+ else
+ {
+ *err= DB_SUCCESS;
+ return nullptr;
+ }
- buf_block_t *xdes;
+ if (first.page == FIL_NULL)
+ goto corrupted;
- return(first.page == FIL_NULL ? NULL
- : xdes_lst_get_descriptor(space, first, &xdes, mtr));
+ return xdes_lst_get_descriptor(*space, first, mtr, nullptr, err);
}
#ifdef UNIV_BTR_PRINT
@@ -2900,7 +3036,7 @@ static void fseg_print_low(const fseg_inode_t *inode)
<< " free extents " << n_free << ";"
<< " not full extents " << n_not_full << ": pages " << n_used;
- ut_ad(mach_read_from_4(inode + FSEG_MAGIC_N) == FSEG_MAGIC_N_VALUE);
+ ut_ad(!memcmp(FSEG_MAGIC_N_BYTES, FSEG_MAGIC_N + inode, 4));
}
/*******************************************************************//**
@@ -2911,15 +3047,12 @@ fseg_print(
fseg_header_t* header, /*!< in: segment header */
mtr_t* mtr) /*!< in/out: mini-transaction */
{
- fseg_inode_t* inode;
- ulint space_id;
-
- space_id = page_get_space_id(page_align(header));
- const fil_space_t* space = mtr_x_lock_space(space_id, mtr);
-
- inode = fseg_inode_get(header, space_id, space->zip_size(), mtr);
-
- fseg_print_low(inode);
+ const fil_space_t *space=
+ mtr->x_lock_space(page_get_space_id(page_align(header)));
+ buf_block_t *block;
+ if (fseg_inode_t *inode=
+ fseg_inode_try_get(header, space->id, space->zip_size(), mtr, &block))
+ fseg_print_low(inode);
}
#endif /* UNIV_BTR_PRINT */
diff --git a/storage/innobase/fsp/fsp0space.cc b/storage/innobase/fsp/fsp0space.cc
index b0a80efe7c4..b069250ff9f 100644
--- a/storage/innobase/fsp/fsp0space.cc
+++ b/storage/innobase/fsp/fsp0space.cc
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 2013, 2015, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2017, MariaDB Corporation.
+Copyright (c) 2017, 2021, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -131,7 +131,7 @@ Tablespace::open_or_create(bool is_temp)
}
space = fil_space_t::create(
- m_name, m_space_id, fsp_flags,
+ m_space_id, fsp_flags,
is_temp
? FIL_TYPE_TEMPORARY : FIL_TYPE_TABLESPACE,
NULL);
@@ -178,7 +178,7 @@ Tablespace::delete_files()
if (success && file_pre_exists) {
ib::info() << "Removed temporary tablespace data"
- " file: \"" << it->m_name << "\"";
+ " file: \"" << it->m_filepath << "\"";
}
}
}
@@ -191,18 +191,13 @@ must end with the extension .ibd and have a basename of at least 1 byte.
Set tablespace m_path member and add a Datafile with the filename.
@param[in] datafile_path full path of the tablespace file. */
-dberr_t
-Tablespace::add_datafile(
- const char* datafile_added)
+dberr_t Tablespace::add_datafile(const char *filepath)
{
/* The path provided ends in ".ibd". This was assured by
validate_create_tablespace_info() */
- ut_d(const char* dot = strrchr(datafile_added, '.'));
+ ut_d(const char* dot = strrchr(filepath, '.'));
ut_ad(dot != NULL && 0 == strcmp(dot, DOT_IBD));
- char* filepath = mem_strdup(datafile_added);
- os_normalize_path(filepath);
-
/* If the path is an absolute path, separate it onto m_path and a
basename. For relative paths, make the whole thing a basename so that
it can be appended to the datadir. */
@@ -219,12 +214,9 @@ Tablespace::add_datafile(
/* Now add a new Datafile and set the filepath
using the m_path created above. */
- m_files.push_back(Datafile(m_name, m_flags,
- FIL_IBD_FILE_INITIAL_SIZE, 0));
- Datafile* datafile = &m_files.back();
- datafile->make_filepath(m_path, basename, IBD);
-
- ut_free(filepath);
+ m_files.push_back(Datafile(m_flags, FIL_IBD_FILE_INITIAL_SIZE, 0));
+ m_files.back().make_filepath(m_path, {basename, strlen(basename) - 4},
+ IBD);
return(DB_SUCCESS);
}
diff --git a/storage/innobase/fsp/fsp0sysspace.cc b/storage/innobase/fsp/fsp0sysspace.cc
index 07a8295a94e..497e4100557 100644
--- a/storage/innobase/fsp/fsp0sysspace.cc
+++ b/storage/innobase/fsp/fsp0sysspace.cc
@@ -275,10 +275,10 @@ SysTablespace::parse_params(
}
}
- m_files.push_back(Datafile(filepath, flags(), uint32_t(size),
- order));
- Datafile* datafile = &m_files.back();
- datafile->make_filepath(path(), filepath, NO_EXT);
+ m_files.push_back(Datafile(flags(), uint32_t(size), order));
+ m_files.back().make_filepath(path(),
+ {filepath, strlen(filepath)},
+ NO_EXT);
if (::strlen(str) >= 6
&& *str == 'n'
@@ -361,13 +361,12 @@ SysTablespace::check_size(
if (file.m_size > rounded_size_pages
|| (m_last_file_size_max > 0
&& m_last_file_size_max < rounded_size_pages)) {
- ib::error() << "The Auto-extending " << name()
- << " data file '" << file.filepath() << "' is"
- " of a different size " << rounded_size_pages
- << " pages than specified"
- " in the .cnf file: initial " << file.m_size
- << " pages, max " << m_last_file_size_max
- << " (relevant if non-zero) pages!";
+ ib::error() << "The Auto-extending data file '"
+ << file.filepath()
+ << "' is of a different size "
+ << rounded_size_pages
+ << " pages than specified"
+ " by innodb_data_file_path";
return(DB_ERROR);
}
@@ -375,11 +374,11 @@ SysTablespace::check_size(
}
if (rounded_size_pages != file.m_size) {
- ib::error() << "The " << name() << " data file '"
+ ib::error() << "The data file '"
<< file.filepath() << "' is of a different size "
<< rounded_size_pages << " pages"
- " than the " << file.m_size << " pages specified in"
- " the .cnf file!";
+ " than the " << file.m_size << " pages specified by"
+ " innodb_data_file_path";
return(DB_ERROR);
}
@@ -607,7 +606,7 @@ SysTablespace::read_lsn_and_check_flags(lsn_t* flushed_lsn)
if (space_id() != it->m_space_id) {
ib::error()
- << "The " << name() << " data file '" << it->name()
+ << "The data file '" << it->filepath()
<< "' has the wrong space ID. It should be "
<< space_id() << ", but " << it->m_space_id
<< " was found";
@@ -651,20 +650,16 @@ SysTablespace::check_file_status(
break;
case DB_SUCCESS:
-
/* Note: stat.rw_perm is only valid for "regular" files */
if (stat.type == OS_FILE_TYPE_FILE) {
-
if (!stat.rw_perm) {
- const char *p = (!srv_read_only_mode
- || m_ignore_read_only)
- ? "writable"
- : "readable";
-
- ib::error() << "The " << name() << " data file"
- << " '" << file.name() << "' must be "
- << p;
+ ib::error() << "The data file"
+ << " '" << file.filepath()
+ << ((!srv_read_only_mode
+ || m_ignore_read_only)
+ ? "' must be writable"
+ : "' must be readable");
err = DB_ERROR;
reason = FILE_STATUS_READ_WRITE_ERROR;
@@ -672,9 +667,8 @@ SysTablespace::check_file_status(
} else {
/* Not a regular file, bail out. */
- ib::error() << "The " << name() << " data file '"
- << file.name() << "' is not a regular"
- " InnoDB data file.";
+ ib::error() << "The data file '" << file.filepath()
+ << "' is not a regular file.";
err = DB_ERROR;
reason = FILE_STATUS_NOT_REGULAR_FILE_ERROR;
@@ -720,14 +714,14 @@ SysTablespace::file_not_found(
*create_new_db = TRUE;
if (space_id() == TRX_SYS_SPACE) {
- ib::info() << "The first " << name() << " data file '"
- << file.name() << "' did not exist."
+ ib::info() << "The first data file '"
+ << file.filepath() << "' did not exist."
" A new tablespace will be created!";
}
} else {
- ib::info() << "Need to create a new " << name()
- << " data file '" << file.name() << "'.";
+ ib::info() << "Need to create a new data file '"
+ << file.filepath() << "'.";
}
/* Set the file create mode. */
@@ -786,8 +780,8 @@ SysTablespace::check_file_spec(
*create_new_db = FALSE;
if (m_files.size() >= 1000) {
- ib::error() << "There must be < 1000 data files in "
- << name() << " but " << m_files.size() << " have been"
+ ib::error() << "There must be < 1000 data files "
+ " but " << m_files.size() << " have been"
" defined.";
return(DB_ERROR);
@@ -826,22 +820,23 @@ SysTablespace::check_file_spec(
} else if (err != DB_SUCCESS) {
if (reason_if_failed == FILE_STATUS_READ_WRITE_ERROR) {
- const char* p = (!srv_read_only_mode
- || m_ignore_read_only)
- ? "writable" : "readable";
- ib::error() << "The " << name() << " data file"
- << " '" << it->name() << "' must be "
- << p;
+ ib::error() << "The data file '"
+ << it->filepath()
+ << ((!srv_read_only_mode
+ || m_ignore_read_only)
+ ? "' must be writable"
+ : "' must be readable");
}
ut_a(err != DB_FAIL);
break;
} else if (*create_new_db) {
- ib::error() << "The " << name() << " data file '"
- << begin->m_name << "' was not found but"
- " one of the other data files '" << it->m_name
- << "' exists.";
+ ib::error() << "The data file '"
+ << begin->filepath()
+ << "' was not found but"
+ " one of the other data files '"
+ << it->filepath() << "' exists.";
err = DB_ERROR;
break;
@@ -935,7 +930,7 @@ SysTablespace::open_or_create(
} else if (is_temp) {
ut_ad(space_id() == SRV_TMP_SPACE_ID);
space = fil_space_t::create(
- name(), SRV_TMP_SPACE_ID, flags(),
+ SRV_TMP_SPACE_ID, flags(),
FIL_TYPE_TEMPORARY, NULL);
ut_ad(space == fil_system.temp_space);
if (!space) {
@@ -946,7 +941,7 @@ SysTablespace::open_or_create(
} else {
ut_ad(space_id() == TRX_SYS_SPACE);
space = fil_space_t::create(
- name(), TRX_SYS_SPACE, it->flags(),
+ TRX_SYS_SPACE, it->flags(),
FIL_TYPE_TABLESPACE, NULL);
ut_ad(space == fil_system.sys_space);
if (!space) {
@@ -993,8 +988,7 @@ uint32_t SysTablespace::get_increment() const
if (!is_valid_size())
{
- ib::error() << "The last data file in " << name()
- << " has a size of " << last_file_size()
+ ib::error() << "The last data file has a size of " << last_file_size()
<< " but the max size allowed is "
<< m_last_file_size_max;
}
diff --git a/storage/innobase/fts/fts0ast.cc b/storage/innobase/fts/fts0ast.cc
index bb42f7c9f54..74d02d63817 100644
--- a/storage/innobase/fts/fts0ast.cc
+++ b/storage/innobase/fts/fts0ast.cc
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 2007, 2020, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2018, MariaDB Corporation.
+Copyright (c) 2018, 2022, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -28,6 +28,7 @@ Created 2007/3/16 Sunny Bains.
#include "fts0ast.h"
#include "fts0pars.h"
#include "fts0fts.h"
+#include "trx0trx.h"
/* The FTS ast visit pass. */
enum fts_ast_visit_pass_t {
diff --git a/storage/innobase/fts/fts0config.cc b/storage/innobase/fts/fts0config.cc
index f95159dc5b7..4566224e171 100644
--- a/storage/innobase/fts/fts0config.cc
+++ b/storage/innobase/fts/fts0config.cc
@@ -119,11 +119,7 @@ fts_config_get_value(
trx->op_info = "getting FTS config value";
error = fts_eval_sql(trx, graph);
-
- mutex_enter(&dict_sys.mutex);
que_graph_free(graph);
- mutex_exit(&dict_sys.mutex);
-
return(error);
}
@@ -230,7 +226,7 @@ fts_config_set_value(
error = fts_eval_sql(trx, graph);
- fts_que_graph_free_check_lock(fts_table, NULL, graph);
+ que_graph_free(graph);
n_rows_updated = trx->undo_no - undo_no;
@@ -256,7 +252,7 @@ fts_config_set_value(
error = fts_eval_sql(trx, graph);
- fts_que_graph_free_check_lock(fts_table, NULL, graph);
+ que_graph_free(graph);
}
return(error);
diff --git a/storage/innobase/fts/fts0fts.cc b/storage/innobase/fts/fts0fts.cc
index d0931de9614..eed2eb72cd1 100644
--- a/storage/innobase/fts/fts0fts.cc
+++ b/storage/innobase/fts/fts0fts.cc
@@ -23,6 +23,7 @@ Full Text Search interface
***********************************************************************/
#include "trx0roll.h"
+#include "trx0purge.h"
#include "row0mysql.h"
#include "row0upd.h"
#include "dict0types.h"
@@ -34,10 +35,24 @@ Full Text Search interface
#include "fts0types.inl"
#include "fts0vlc.h"
#include "fts0plugin.h"
-#include "dict0priv.h"
#include "dict0stats.h"
#include "btr0pcur.h"
-#include "sync0sync.h"
+
+/** The SYNC state of the cache. There is one instance of this struct
+associated with each ADD thread. */
+struct fts_sync_t {
+ /** Transaction used for SYNCing the cache to disk */
+ trx_t *trx;
+ /** Table with FTS index(es) */
+ dict_table_t *table;
+ /** Max size in bytes of the cache */
+ ulint max_cache_size;
+ /** The doc id at which the cache was noted as being
+ full, we use this to set the upper_limit field */
+ doc_id_t max_doc_id;
+ /** SYNC start time; only used if fts_enable_diag_print */
+ time_t start_time;
+};
static const ulint FTS_MAX_ID_LEN = 32;
@@ -86,7 +101,7 @@ static const ulint FTS_CACHE_SIZE_UPPER_LIMIT_IN_MB = 1024;
#endif
/** Time to sleep after DEADLOCK error before retrying operation. */
-static const ulint FTS_DEADLOCK_RETRY_WAIT = 100000;
+static const std::chrono::milliseconds FTS_DEADLOCK_RETRY_WAIT(100);
/** InnoDB default stopword list:
There are different versions of stopwords, the stop words listed
@@ -135,14 +150,6 @@ const char *fts_default_stopword[] =
NULL
};
-/** For storing table info when checking for orphaned tables. */
-struct fts_aux_table_t {
- table_id_t id; /*!< Table id */
- table_id_t parent_id; /*!< Parent table id */
- table_id_t index_id; /*!< Table FT index id */
- char* name; /*!< Name of the table */
-};
-
/** FTS auxiliary table suffixes that are common to all FT indexes. */
const char* fts_common_tables[] = {
"BEING_DELETED",
@@ -166,6 +173,7 @@ const fts_index_selector_t fts_index_selector[] = {
/** Default config values for FTS indexes on a table. */
static const char* fts_config_table_insert_values_sql =
+ "PROCEDURE P() IS\n"
"BEGIN\n"
"\n"
"INSERT INTO $config_table VALUES('"
@@ -181,7 +189,8 @@ static const char* fts_config_table_insert_values_sql =
FTS_TOTAL_DELETED_COUNT "', '0');\n"
"" /* Note: 0 == FTS_TABLE_STATE_RUNNING */
"INSERT INTO $config_table VALUES ('"
- FTS_TABLE_STATE "', '0');\n";
+ FTS_TABLE_STATE "', '0');\n"
+ "END;\n";
/** FTS tokenize parmameter for plugin parser */
struct fts_tokenize_param_t {
@@ -192,15 +201,8 @@ struct fts_tokenize_param_t {
/** Run SYNC on the table, i.e., write out data from the cache to the
FTS auxiliary INDEX table and clear the cache at the end.
@param[in,out] sync sync state
-@param[in] unlock_cache whether unlock cache lock when write node
-@param[in] wait whether wait when a sync is in progress
@return DB_SUCCESS if all OK */
-static
-dberr_t
-fts_sync(
- fts_sync_t* sync,
- bool unlock_cache,
- bool wait);
+static dberr_t fts_sync(fts_sync_t *sync);
/****************************************************************//**
Release all resources help by the words rb tree e.g., the node ilist. */
@@ -223,10 +225,9 @@ fts_update_max_cache_size(
/*********************************************************************//**
This function fetches the document just inserted right before
we commit the transaction, and tokenize the inserted text data
-and insert into FTS auxiliary table and its cache.
-@return TRUE if successful */
+and insert into FTS auxiliary table and its cache. */
static
-ulint
+void
fts_add_doc_by_id(
/*==============*/
fts_trx_table_t*ftt, /*!< in: FTS trx table */
@@ -270,11 +271,10 @@ static
void
fts_cache_destroy(fts_cache_t* cache)
{
- rw_lock_free(&cache->lock);
- rw_lock_free(&cache->init_lock);
- mutex_free(&cache->deleted_lock);
- mutex_free(&cache->doc_id_lock);
- os_event_destroy(cache->sync->event);
+ mysql_mutex_destroy(&cache->lock);
+ mysql_mutex_destroy(&cache->init_lock);
+ mysql_mutex_destroy(&cache->deleted_lock);
+ mysql_mutex_destroy(&cache->doc_id_lock);
if (cache->stopword_info.cached_stopword) {
rbt_free(cache->stopword_info.cached_stopword);
@@ -463,7 +463,7 @@ fts_load_user_stopword(
fts_stopword_t* stopword_info) /*!< in: Stopword info */
{
if (!fts->dict_locked) {
- mutex_enter(&dict_sys.mutex);
+ dict_sys.lock(SRW_LOCK_CALL);
}
/* Validate the user table existence in the right format */
@@ -474,7 +474,7 @@ fts_load_user_stopword(
if (!stopword_info->charset) {
cleanup:
if (!fts->dict_locked) {
- mutex_exit(&dict_sys.mutex);
+ dict_sys.unlock();
}
return ret;
@@ -500,8 +500,9 @@ cleanup:
pars_info_bind_function(info, "my_func", fts_read_stopword,
stopword_info);
- que_t* graph = fts_parse_sql_no_dict_lock(
+ que_t* graph = pars_sql(
info,
+ "PROCEDURE P() IS\n"
"DECLARE FUNCTION my_func;\n"
"DECLARE CURSOR c IS"
" SELECT value, $row_end"
@@ -515,7 +516,8 @@ cleanup:
" EXIT;\n"
" END IF;\n"
"END LOOP;\n"
- "CLOSE c;");
+ "CLOSE c;"
+ "END;\n");
for (;;) {
dberr_t error = fts_eval_sql(trx, graph);
@@ -572,7 +574,6 @@ fts_index_cache_init(
for (i = 0; i < FTS_NUM_AUX_INDEX; ++i) {
ut_a(index_cache->ins_graph[i] == NULL);
- ut_a(index_cache->sel_graph[i] == NULL);
}
}
@@ -593,10 +594,10 @@ fts_cache_init(
cache->total_size = 0;
cache->total_size_at_sync = 0;
- mutex_enter((ib_mutex_t*) &cache->deleted_lock);
+ mysql_mutex_lock(&cache->deleted_lock);
cache->deleted_doc_ids = ib_vector_create(
cache->sync_heap, sizeof(doc_id_t), 4);
- mutex_exit((ib_mutex_t*) &cache->deleted_lock);
+ mysql_mutex_unlock(&cache->deleted_lock);
/* Reset the cache data for all the FTS indexes. */
for (i = 0; i < ib_vector_size(cache->indexes); ++i) {
@@ -626,15 +627,10 @@ fts_cache_create(
cache->cache_heap = heap;
- rw_lock_create(fts_cache_rw_lock_key, &cache->lock, SYNC_FTS_CACHE);
-
- rw_lock_create(
- fts_cache_init_rw_lock_key, &cache->init_lock,
- SYNC_FTS_CACHE_INIT);
-
- mutex_create(LATCH_ID_FTS_DELETE, &cache->deleted_lock);
-
- mutex_create(LATCH_ID_FTS_DOC_ID, &cache->doc_id_lock);
+ mysql_mutex_init(fts_cache_mutex_key, &cache->lock, nullptr);
+ mysql_mutex_init(fts_cache_init_mutex_key, &cache->init_lock, nullptr);
+ mysql_mutex_init(fts_delete_mutex_key, &cache->deleted_lock, nullptr);
+ mysql_mutex_init(fts_doc_id_mutex_key, &cache->doc_id_lock, nullptr);
/* This is the heap used to create the cache itself. */
cache->self_heap = ib_heap_allocator_create(heap);
@@ -647,7 +643,6 @@ fts_cache_create(
mem_heap_zalloc(heap, sizeof(fts_sync_t)));
cache->sync->table = table;
- cache->sync->event = os_event_create(0);
/* Create the index cache vector that will hold the inverted indexes. */
cache->indexes = ib_vector_create(
@@ -680,7 +675,7 @@ fts_add_index(
ut_ad(fts);
cache = table->fts->cache;
- rw_lock_x_lock(&cache->init_lock);
+ mysql_mutex_lock(&cache->init_lock);
ib_vector_push(fts->indexes, &index);
@@ -691,7 +686,7 @@ fts_add_index(
index_cache = fts_cache_index_cache_create(table, index);
}
- rw_lock_x_unlock(&cache->init_lock);
+ mysql_mutex_unlock(&cache->init_lock);
}
/*******************************************************************//**
@@ -705,7 +700,7 @@ fts_reset_get_doc(
fts_get_doc_t* get_doc;
ulint i;
- ut_ad(rw_lock_own(&cache->init_lock, RW_LOCK_X));
+ mysql_mutex_assert_owner(&cache->init_lock);
ib_vector_reset(cache->get_docs);
@@ -816,9 +811,8 @@ fts_check_cached_index(
/** Clear all fts resources when there is no internal DOC_ID
and there are no new fts index to add.
-@param[in,out] table table where fts is to be freed
-@param[in] trx transaction to drop all fts tables */
-void fts_clear_all(dict_table_t *table, trx_t *trx)
+@param[in,out] table table where fts is to be freed */
+void fts_clear_all(dict_table_t *table)
{
if (DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS_HAS_DOC_ID) ||
!table->fts ||
@@ -832,7 +826,6 @@ void fts_clear_all(dict_table_t *table, trx_t *trx)
fts_optimize_remove_table(table);
- fts_drop_tables(trx, table);
table->fts->~fts_t();
table->fts= nullptr;
DICT_TF2_FLAG_UNSET(table, DICT_TF2_FTS);
@@ -874,7 +867,7 @@ fts_drop_index(
fts_cache_t* cache = table->fts->cache;
fts_index_cache_t* index_cache;
- rw_lock_x_lock(&cache->init_lock);
+ mysql_mutex_lock(&cache->init_lock);
index_cache = fts_find_index_cache(cache, index);
@@ -891,10 +884,10 @@ fts_drop_index(
fts_reset_get_doc(cache);
}
- rw_lock_x_unlock(&cache->init_lock);
+ mysql_mutex_unlock(&cache->init_lock);
}
- err = fts_drop_index_tables(trx, index);
+ err = fts_drop_index_tables(trx, *index);
ib_vector_remove(indexes, (const void*) index);
@@ -902,41 +895,6 @@ fts_drop_index(
}
/****************************************************************//**
-Free the query graph but check whether dict_sys.mutex is already
-held */
-void
-fts_que_graph_free_check_lock(
-/*==========================*/
- fts_table_t* fts_table, /*!< in: FTS table */
- const fts_index_cache_t*index_cache, /*!< in: FTS index cache */
- que_t* graph) /*!< in: query graph */
-{
- bool has_dict = FALSE;
-
- if (fts_table && fts_table->table) {
- ut_ad(fts_table->table->fts);
-
- has_dict = fts_table->table->fts->dict_locked;
- } else if (index_cache) {
- ut_ad(index_cache->index->table->fts);
-
- has_dict = index_cache->index->table->fts->dict_locked;
- }
-
- if (!has_dict) {
- mutex_enter(&dict_sys.mutex);
- }
-
- ut_ad(mutex_own(&dict_sys.mutex));
-
- que_graph_free(graph);
-
- if (!has_dict) {
- mutex_exit(&dict_sys.mutex);
- }
-}
-
-/****************************************************************//**
Create an FTS index cache. */
CHARSET_INFO*
fts_index_get_charset(
@@ -990,7 +948,7 @@ fts_cache_index_cache_create(
ut_a(cache != NULL);
- ut_ad(rw_lock_own(&cache->init_lock, RW_LOCK_X));
+ mysql_mutex_assert_owner(&cache->init_lock);
/* Must not already exist in the cache vector. */
ut_a(fts_find_index_cache(cache, index) == NULL);
@@ -1010,10 +968,6 @@ fts_cache_index_cache_create(
mem_heap_zalloc(static_cast<mem_heap_t*>(
cache->self_heap->arg), n_bytes));
- index_cache->sel_graph = static_cast<que_t**>(
- mem_heap_zalloc(static_cast<mem_heap_t*>(
- cache->self_heap->arg), n_bytes));
-
fts_index_cache_init(cache->sync_heap, index_cache);
if (cache->get_docs) {
@@ -1083,21 +1037,10 @@ fts_cache_clear(
if (index_cache->ins_graph[j] != NULL) {
- fts_que_graph_free_check_lock(
- NULL, index_cache,
- index_cache->ins_graph[j]);
+ que_graph_free(index_cache->ins_graph[j]);
index_cache->ins_graph[j] = NULL;
}
-
- if (index_cache->sel_graph[j] != NULL) {
-
- fts_que_graph_free_check_lock(
- NULL, index_cache,
- index_cache->sel_graph[j]);
-
- index_cache->sel_graph[j] = NULL;
- }
}
index_cache->doc_stats = NULL;
@@ -1107,9 +1050,9 @@ fts_cache_clear(
cache->total_size = 0;
- mutex_enter((ib_mutex_t*) &cache->deleted_lock);
+ mysql_mutex_lock(&cache->deleted_lock);
cache->deleted_doc_ids = NULL;
- mutex_exit((ib_mutex_t*) &cache->deleted_lock);
+ mysql_mutex_unlock(&cache->deleted_lock);
mem_heap_free(static_cast<mem_heap_t*>(cache->sync_heap->arg));
cache->sync_heap->arg = NULL;
@@ -1125,12 +1068,12 @@ fts_get_index_cache(
fts_cache_t* cache, /*!< in: cache to search */
const dict_index_t* index) /*!< in: index to search for */
{
- ulint i;
-
- ut_ad(rw_lock_own((rw_lock_t*) &cache->lock, RW_LOCK_X)
- || rw_lock_own((rw_lock_t*) &cache->init_lock, RW_LOCK_X));
+#ifdef SAFE_MUTEX
+ ut_ad(mysql_mutex_is_owner(&cache->lock)
+ || mysql_mutex_is_owner(&cache->init_lock));
+#endif /* SAFE_MUTEX */
- for (i = 0; i < ib_vector_size(cache->indexes); ++i) {
+ for (ulint i = 0; i < ib_vector_size(cache->indexes); ++i) {
fts_index_cache_t* index_cache;
index_cache = static_cast<fts_index_cache_t*>(
@@ -1158,7 +1101,7 @@ fts_get_index_get_doc(
{
ulint i;
- ut_ad(rw_lock_own((rw_lock_t*) &cache->init_lock, RW_LOCK_X));
+ mysql_mutex_assert_owner(&cache->init_lock);
for (i = 0; i < ib_vector_size(cache->get_docs); ++i) {
fts_get_doc_t* get_doc;
@@ -1191,7 +1134,7 @@ fts_tokenizer_word_get(
fts_tokenizer_word_t* word;
ib_rbt_bound_t parent;
- ut_ad(rw_lock_own(&cache->lock, RW_LOCK_X));
+ mysql_mutex_assert_owner(&cache->lock);
/* If it is a stopword, do not index it */
if (!fts_check_token(text,
@@ -1249,11 +1192,11 @@ fts_cache_node_add_positions(
byte* ptr_start;
doc_id_t doc_id_delta;
-#ifdef UNIV_DEBUG
+#ifdef SAFE_MUTEX
if (cache) {
- ut_ad(rw_lock_own(&cache->lock, RW_LOCK_X));
+ mysql_mutex_assert_owner(&cache->lock);
}
-#endif /* UNIV_DEBUG */
+#endif /* SAFE_MUTEX */
ut_ad(doc_id >= node->last_doc_id);
@@ -1366,7 +1309,7 @@ fts_cache_add_doc(
return;
}
- ut_ad(rw_lock_own(&cache->lock, RW_LOCK_X));
+ mysql_mutex_assert_owner(&cache->lock);
n_words = rbt_size(tokens);
@@ -1390,8 +1333,7 @@ fts_cache_add_doc(
ib_vector_last(word->nodes));
}
- if (fts_node == NULL || fts_node->synced
- || fts_node->ilist_size > FTS_ILIST_MAX_SIZE
+ if (!fts_node || fts_node->ilist_size > FTS_ILIST_MAX_SIZE
|| doc_id < fts_node->last_doc_id) {
fts_node = static_cast<fts_node_t*>(
@@ -1425,46 +1367,50 @@ fts_cache_add_doc(
}
}
-/****************************************************************//**
-Drops a table. If the table can't be found we return a SUCCESS code.
-@return DB_SUCCESS or error code */
-static MY_ATTRIBUTE((nonnull, warn_unused_result))
-dberr_t
-fts_drop_table(
-/*===========*/
- trx_t* trx, /*!< in: transaction */
- const char* table_name) /*!< in: table to drop */
+/** Drop a table.
+@param trx transaction
+@param table_name FTS_ table name
+@param rename whether to rename before dropping
+@return error code
+@retval DB_SUCCESS if the table was dropped
+@retval DB_FAIL if the table did not exist */
+static dberr_t fts_drop_table(trx_t *trx, const char *table_name, bool rename)
{
- dict_table_t* table;
- dberr_t error = DB_SUCCESS;
-
- /* Check that the table exists in our data dictionary.
- Similar to regular drop table case, we will open table with
- DICT_ERR_IGNORE_INDEX_ROOT and DICT_ERR_IGNORE_CORRUPT option */
- table = dict_table_open_on_name(
- table_name, TRUE, FALSE,
- static_cast<dict_err_ignore_t>(
- DICT_ERR_IGNORE_INDEX_ROOT | DICT_ERR_IGNORE_CORRUPT));
-
- if (table != 0) {
-
- dict_table_close(table, TRUE, FALSE);
-
- /* Pass nonatomic=false (don't allow data dict unlock),
- because the transaction may hold locks on SYS_* tables from
- previous calls to fts_drop_table(). */
- error = row_drop_table_for_mysql(table_name, trx,
- SQLCOM_DROP_DB, false, false);
+ if (dict_table_t *table= dict_table_open_on_name(table_name, true,
+ DICT_ERR_IGNORE_TABLESPACE))
+ {
+ table->release();
+ if (rename)
+ {
+ mem_heap_t *heap= mem_heap_create(FN_REFLEN);
+ char *tmp= dict_mem_create_temporary_tablename(heap, table->name.m_name,
+ table->id);
+ dberr_t err= row_rename_table_for_mysql(table->name.m_name, tmp, trx,
+ false);
+ mem_heap_free(heap);
+ if (err != DB_SUCCESS)
+ {
+ ib::error() << "Unable to rename table " << table_name << ": " << err;
+ return err;
+ }
+ }
+ if (dberr_t err= trx->drop_table(*table))
+ {
+ ib::error() << "Unable to drop table " << table->name << ": " << err;
+ return err;
+ }
- if (UNIV_UNLIKELY(error != DB_SUCCESS)) {
- ib::error() << "Unable to drop FTS index aux table "
- << table_name << ": " << error;
- }
- } else {
- error = DB_FAIL;
- }
+#ifdef UNIV_DEBUG
+ for (auto &p : trx->mod_tables)
+ {
+ if (p.first == table)
+ p.second.set_aux_table();
+ }
+#endif /* UNIV_DEBUG */
+ return DB_SUCCESS;
+ }
- return(error);
+ return DB_FAIL;
}
/****************************************************************//**
@@ -1498,7 +1444,7 @@ fts_rename_one_aux_table(
fts_table_new_name[table_new_name_len] = 0;
return row_rename_table_for_mysql(
- fts_table_old_name, fts_table_new_name, trx, false, false);
+ fts_table_old_name, fts_table_new_name, trx, false);
}
/****************************************************************//**
@@ -1564,68 +1510,203 @@ fts_rename_aux_tables(
return(DB_SUCCESS);
}
-/** Drops the common ancillary tables needed for supporting an FTS index
-on the given table. row_mysql_lock_data_dictionary must have been called
-before this.
-@param[in] trx transaction to drop fts common table
-@param[in] fts_table table with an FTS index
-@param[in] drop_orphan True if the function is used to drop
- orphaned table
+/** Lock an internal FTS_ table, before fts_drop_table() */
+static dberr_t fts_lock_table(trx_t *trx, const char *table_name)
+{
+ ut_ad(purge_sys.must_wait_FTS());
+
+ if (dict_table_t *table= dict_table_open_on_name(table_name, false,
+ DICT_ERR_IGNORE_TABLESPACE))
+ {
+ dberr_t err= lock_table_for_trx(table, trx, LOCK_X);
+ /* Wait for purge threads to stop using the table. */
+ for (uint n= 15; table->get_ref_count() > 1; )
+ {
+ if (!--n)
+ {
+ err= DB_LOCK_WAIT_TIMEOUT;
+ goto fail;
+ }
+ std::this_thread::sleep_for(std::chrono::milliseconds(50));
+ }
+fail:
+ table->release();
+ return err;
+ }
+ return DB_SUCCESS;
+}
+
+/** Lock the internal FTS_ tables for an index, before fts_drop_index_tables().
+@param trx transaction
+@param index fulltext index */
+dberr_t fts_lock_index_tables(trx_t *trx, const dict_index_t &index)
+{
+ ut_ad(index.type & DICT_FTS);
+ fts_table_t fts_table;
+ char table_name[MAX_FULL_NAME_LEN];
+ FTS_INIT_INDEX_TABLE(&fts_table, nullptr, FTS_INDEX_TABLE, (&index));
+ for (const fts_index_selector_t *s= fts_index_selector; s->suffix; s++)
+ {
+ fts_table.suffix= s->suffix;
+ fts_get_table_name(&fts_table, table_name, false);
+ if (dberr_t err= fts_lock_table(trx, table_name))
+ return err;
+ }
+ return DB_SUCCESS;
+}
+
+/** Lock the internal common FTS_ tables, before fts_drop_common_tables().
+@param trx transaction
+@param table table containing FULLTEXT INDEX
@return DB_SUCCESS or error code */
-static dberr_t
-fts_drop_common_tables(
- trx_t* trx,
- fts_table_t* fts_table,
- bool drop_orphan=false)
+dberr_t fts_lock_common_tables(trx_t *trx, const dict_table_t &table)
{
- ulint i;
- dberr_t error = DB_SUCCESS;
+ fts_table_t fts_table;
+ char table_name[MAX_FULL_NAME_LEN];
- for (i = 0; fts_common_tables[i] != NULL; ++i) {
- dberr_t err;
- char table_name[MAX_FULL_NAME_LEN];
+ FTS_INIT_FTS_TABLE(&fts_table, nullptr, FTS_COMMON_TABLE, (&table));
- fts_table->suffix = fts_common_tables[i];
- fts_get_table_name(fts_table, table_name, true);
+ for (const char **suffix= fts_common_tables; *suffix; suffix++)
+ {
+ fts_table.suffix= *suffix;
+ fts_get_table_name(&fts_table, table_name, false);
+ if (dberr_t err= fts_lock_table(trx, table_name))
+ return err;
+ }
+ return DB_SUCCESS;
+}
- err = fts_drop_table(trx, table_name);
+/** This function make sure that table doesn't
+have any other reference count.
+@param table_name table name */
+static void fts_table_no_ref_count(const char *table_name)
+{
+ dict_table_t *table= dict_table_open_on_name(
+ table_name, true, DICT_ERR_IGNORE_TABLESPACE);
+ if (!table)
+ return;
- /* We only return the status of the last error. */
- if (err != DB_SUCCESS && err != DB_FAIL) {
- error = err;
- }
+ while (table->get_ref_count() > 1)
+ {
+ dict_sys.unlock();
+ std::this_thread::sleep_for(std::chrono::milliseconds(50));
+ dict_sys.lock(SRW_LOCK_CALL);
+ }
- if (drop_orphan && err == DB_FAIL) {
- char* path = fil_make_filepath(
- NULL, table_name, IBD, false);
- if (path != NULL) {
- os_file_delete_if_exists(
- innodb_data_file_key, path, NULL);
- ut_free(path);
- }
- }
- }
+ table->release();
+}
- return(error);
+/** Stop the purge thread and check n_ref_count of all auxiliary
+and common table associated with the fts table.
+@param table parent FTS table
+@param already_stopped True indicates purge threads were
+ already stopped*/
+void purge_sys_t::stop_FTS(const dict_table_t &table, bool already_stopped)
+{
+ dict_sys.lock(SRW_LOCK_CALL);
+ if (!already_stopped)
+ purge_sys.stop_FTS();
+
+ fts_table_t fts_table;
+ char table_name[MAX_FULL_NAME_LEN];
+
+ FTS_INIT_FTS_TABLE(&fts_table, nullptr, FTS_COMMON_TABLE, (&table));
+
+ for (const char **suffix= fts_common_tables; *suffix; suffix++)
+ {
+ fts_table.suffix= *suffix;
+ fts_get_table_name(&fts_table, table_name, true);
+ fts_table_no_ref_count(table_name);
+ }
+
+ if (table.fts)
+ {
+ if (auto indexes= table.fts->indexes)
+ {
+ for (ulint i= 0;i < ib_vector_size(indexes); ++i)
+ {
+ const dict_index_t *index= static_cast<const dict_index_t*>(
+ ib_vector_getp(indexes, i));
+ FTS_INIT_INDEX_TABLE(&fts_table, nullptr, FTS_INDEX_TABLE, index);
+ for (const fts_index_selector_t *s= fts_index_selector;
+ s->suffix; s++)
+ {
+ fts_table.suffix= s->suffix;
+ fts_get_table_name(&fts_table, table_name, true);
+ fts_table_no_ref_count(table_name);
+ }
+ }
+ }
+ }
+
+ dict_sys.unlock();
}
-/****************************************************************//**
-Since we do a horizontal split on the index table, we need to drop
-all the split tables.
+/** Lock the internal FTS_ tables for table, before fts_drop_tables().
+@param trx transaction
+@param table table containing FULLTEXT INDEX
@return DB_SUCCESS or error code */
-static
-dberr_t
-fts_drop_index_split_tables(
-/*========================*/
- trx_t* trx, /*!< in: transaction */
- dict_index_t* index) /*!< in: fts instance */
+dberr_t fts_lock_tables(trx_t *trx, const dict_table_t &table)
+{
+ if (dberr_t err= fts_lock_common_tables(trx, table))
+ return err;
+
+ if (!table.fts)
+ return DB_SUCCESS;
+
+ auto indexes= table.fts->indexes;
+ if (!indexes)
+ return DB_SUCCESS;
+
+ for (ulint i= 0; i < ib_vector_size(indexes); ++i)
+ if (dberr_t err=
+ fts_lock_index_tables(trx, *static_cast<const dict_index_t*>
+ (ib_vector_getp(indexes, i))))
+ return err;
+ return DB_SUCCESS;
+}
+
+/** Drops the common ancillary tables needed for supporting an FTS index
+on the given table.
+@param trx transaction to drop fts common table
+@param fts_table table with an FTS index
+@param rename whether to rename before dropping
+@return DB_SUCCESS or error code */
+static dberr_t fts_drop_common_tables(trx_t *trx, fts_table_t *fts_table,
+ bool rename)
+{
+ dberr_t error= DB_SUCCESS;
+
+ for (ulint i= 0; fts_common_tables[i]; ++i)
+ {
+ char table_name[MAX_FULL_NAME_LEN];
+
+ fts_table->suffix= fts_common_tables[i];
+ fts_get_table_name(fts_table, table_name, true);
+
+ if (dberr_t err= fts_drop_table(trx, table_name, rename))
+ {
+ if (trx->state != TRX_STATE_ACTIVE)
+ return err;
+ /* We only return the status of the last error. */
+ if (err != DB_FAIL)
+ error= err;
+ }
+ }
+ return error;
+}
+
+/****************************************************************//**
+Drops FTS auxiliary tables for an FTS index
+@return DB_SUCCESS or error code */
+dberr_t fts_drop_index_tables(trx_t *trx, const dict_index_t &index)
{
ulint i;
fts_table_t fts_table;
dberr_t error = DB_SUCCESS;
- FTS_INIT_INDEX_TABLE(&fts_table, NULL, FTS_INDEX_TABLE, index);
+ FTS_INIT_INDEX_TABLE(&fts_table, nullptr, FTS_INDEX_TABLE, (&index));
for (i = 0; i < FTS_NUM_AUX_INDEX; ++i) {
dberr_t err;
@@ -1634,7 +1715,7 @@ fts_drop_index_split_tables(
fts_table.suffix = fts_get_suffix(i);
fts_get_table_name(&fts_table, table_name, true);
- err = fts_drop_table(trx, table_name);
+ err = fts_drop_table(trx, table_name, false);
/* We only return the status of the last error. */
if (err != DB_SUCCESS && err != DB_FAIL) {
@@ -1646,73 +1727,44 @@ fts_drop_index_split_tables(
}
/****************************************************************//**
-Drops FTS auxiliary tables for an FTS index
-@return DB_SUCCESS or error code */
-dberr_t
-fts_drop_index_tables(
-/*==================*/
- trx_t* trx, /*!< in: transaction */
- dict_index_t* index) /*!< in: Index to drop */
-{
- return(fts_drop_index_split_tables(trx, index));
-}
-
-/****************************************************************//**
Drops FTS ancillary tables needed for supporting an FTS index
-on the given table. row_mysql_lock_data_dictionary must have been called
-before this.
+on the given table.
@return DB_SUCCESS or error code */
static MY_ATTRIBUTE((nonnull, warn_unused_result))
dberr_t
fts_drop_all_index_tables(
/*======================*/
trx_t* trx, /*!< in: transaction */
- fts_t* fts) /*!< in: fts instance */
+ const fts_t* fts) /*!< in: fts instance */
{
- dberr_t error = DB_SUCCESS;
-
- for (ulint i = 0;
- fts->indexes != 0 && i < ib_vector_size(fts->indexes);
- ++i) {
-
- dberr_t err;
- dict_index_t* index;
-
- index = static_cast<dict_index_t*>(
- ib_vector_getp(fts->indexes, i));
-
- err = fts_drop_index_tables(trx, index);
-
- if (err != DB_SUCCESS) {
- error = err;
- }
- }
-
- return(error);
+ dberr_t error= DB_SUCCESS;
+ auto indexes= fts->indexes;
+ if (!indexes)
+ return DB_SUCCESS;
+
+ for (ulint i= 0; i < ib_vector_size(indexes); ++i)
+ if (dberr_t err= fts_drop_index_tables(trx,
+ *static_cast<const dict_index_t*>
+ (ib_vector_getp(indexes, i))))
+ error= err;
+ return error;
}
-/*********************************************************************//**
-Drops the ancillary tables needed for supporting an FTS index on a
-given table. row_mysql_lock_data_dictionary must have been called before
-this.
+/** Drop the internal FTS_ tables for table.
+@param trx transaction
+@param table table containing FULLTEXT INDEX
@return DB_SUCCESS or error code */
-dberr_t
-fts_drop_tables(
-/*============*/
- trx_t* trx, /*!< in: transaction */
- dict_table_t* table) /*!< in: table has the FTS index */
+dberr_t fts_drop_tables(trx_t *trx, const dict_table_t &table)
{
dberr_t error;
fts_table_t fts_table;
- FTS_INIT_FTS_TABLE(&fts_table, NULL, FTS_COMMON_TABLE, table);
-
- /* TODO: This is not atomic and can cause problems during recovery. */
+ FTS_INIT_FTS_TABLE(&fts_table, NULL, FTS_COMMON_TABLE, (&table));
- error = fts_drop_common_tables(trx, &fts_table);
+ error = fts_drop_common_tables(trx, &fts_table, false);
- if (error == DB_SUCCESS && table->fts) {
- error = fts_drop_all_index_tables(trx, table->fts);
+ if (error == DB_SUCCESS && table.fts) {
+ error = fts_drop_all_index_tables(trx, table.fts);
}
return(error);
@@ -1730,8 +1782,9 @@ fts_create_in_mem_aux_table(
const dict_table_t* table,
ulint n_cols)
{
- dict_table_t* new_table = dict_mem_table_create(
- aux_table_name, NULL, n_cols, 0, table->flags,
+ dict_table_t* new_table = dict_table_t::create(
+ {aux_table_name,strlen(aux_table_name)},
+ nullptr, n_cols, 0, table->flags,
table->space_id == TRX_SYS_SPACE
? 0 : table->space_id == SRV_TMP_SPACE_ID
? DICT_TF2_TEMPORARY : DICT_TF2_USE_FILE_PER_TABLE);
@@ -1788,8 +1841,8 @@ fts_create_one_common_table(
}
dict_table_add_system_columns(new_table, heap);
- error = row_create_table_for_mysql(new_table, trx,
- FIL_ENCRYPTION_DEFAULT, FIL_DEFAULT_ENCRYPTION_KEY);
+ error = row_create_table_for_mysql(new_table, trx);
+
if (error == DB_SUCCESS) {
dict_index_t* index = dict_mem_index_create(
@@ -1802,36 +1855,21 @@ fts_create_one_common_table(
dict_mem_index_add_field(index, "key", 0);
}
- /* We save and restore trx->dict_operation because
- row_create_index_for_mysql() changes the operation to
- TRX_DICT_OP_TABLE. */
- trx_dict_op_t op = trx_get_dict_operation(trx);
-
- error = row_create_index_for_mysql(index, trx, NULL);
-
- trx->dict_operation = op;
- } else {
-err_exit:
- new_table = NULL;
- ib::warn() << "Failed to create FTS common table "
- << fts_table_name;
- trx->error_state = error;
- return NULL;
- }
-
- if (error != DB_SUCCESS) {
- dict_mem_table_free(new_table);
- trx->error_state = DB_SUCCESS;
- row_drop_table_for_mysql(fts_table_name, trx, SQLCOM_DROP_DB);
- goto err_exit;
+ error = row_create_index_for_mysql(index, trx, NULL,
+ FIL_ENCRYPTION_DEFAULT,
+ FIL_DEFAULT_ENCRYPTION_KEY);
+ if (error == DB_SUCCESS) {
+ return new_table;
+ }
}
- return(new_table);
+ ib::warn() << "Failed to create FTS common table " << fts_table_name;
+ trx->error_state = error;
+ return NULL;
}
/** Creates the common auxiliary tables needed for supporting an FTS index
-on the given table. row_mysql_lock_data_dictionary must have been called
-before this.
+on the given table.
The following tables are created.
CREATE TABLE $FTS_PREFIX_DELETED
(doc_id BIGINT UNSIGNED, UNIQUE CLUSTERED INDEX on doc_id)
@@ -1863,17 +1901,10 @@ fts_create_common_tables(
[MAX_FULL_NAME_LEN];
dict_index_t* index = NULL;
- trx_dict_op_t op;
- /* common_tables vector is used for dropping FTS common tables
- on error condition. */
- std::vector<dict_table_t*> common_tables;
- std::vector<dict_table_t*>::const_iterator it;
FTS_INIT_FTS_TABLE(&fts_table, NULL, FTS_COMMON_TABLE, table);
- op = trx_get_dict_operation(trx);
-
- error = fts_drop_common_tables(trx, &fts_table);
+ error = fts_drop_common_tables(trx, &fts_table, true);
if (error != DB_SUCCESS) {
@@ -1892,20 +1923,9 @@ fts_create_common_tables(
trx->error_state = DB_SUCCESS;
error = DB_ERROR;
goto func_exit;
- } else {
- common_tables.push_back(common_table);
}
mem_heap_empty(heap);
-
- DBUG_EXECUTE_IF("ib_fts_aux_table_error",
- /* Return error after creating FTS_AUX_CONFIG table. */
- if (i == 4) {
- error = DB_ERROR;
- goto func_exit;
- }
- );
-
}
/* Write the default settings to the config table. */
@@ -1915,7 +1935,7 @@ fts_create_common_tables(
fts_get_table_name(&fts_table, fts_name, true);
pars_info_bind_id(info, "config_table", fts_name);
- graph = fts_parse_sql_no_dict_lock(
+ graph = pars_sql(
info, fts_config_table_insert_values_sql);
error = fts_eval_sql(trx, graph);
@@ -1938,22 +1958,11 @@ fts_create_common_tables(
dict_mem_index_add_field(index, FTS_DOC_ID_COL_NAME, 0);
}
- op = trx_get_dict_operation(trx);
-
- error = row_create_index_for_mysql(index, trx, NULL);
+ error = row_create_index_for_mysql(index, trx, NULL,
+ FIL_ENCRYPTION_DEFAULT,
+ FIL_DEFAULT_ENCRYPTION_KEY);
func_exit:
- if (error != DB_SUCCESS) {
- for (it = common_tables.begin(); it != common_tables.end();
- ++it) {
- row_drop_table_for_mysql((*it)->name.m_name, trx,
- SQLCOM_DROP_DB);
- }
- }
-
- trx->dict_operation = op;
-
- common_tables.clear();
mem_heap_free(heap);
return(error);
@@ -2021,8 +2030,7 @@ fts_create_one_index_table(
FTS_INDEX_ILIST_LEN);
dict_table_add_system_columns(new_table, heap);
- error = row_create_table_for_mysql(new_table, trx,
- FIL_ENCRYPTION_DEFAULT, FIL_DEFAULT_ENCRYPTION_KEY);
+ error = row_create_table_for_mysql(new_table, trx);
if (error == DB_SUCCESS) {
dict_index_t* index = dict_mem_index_create(
@@ -2031,33 +2039,22 @@ fts_create_one_index_table(
dict_mem_index_add_field(index, "word", 0);
dict_mem_index_add_field(index, "first_doc_id", 0);
- trx_dict_op_t op = trx_get_dict_operation(trx);
-
- error = row_create_index_for_mysql(index, trx, NULL);
+ error = row_create_index_for_mysql(index, trx, NULL,
+ FIL_ENCRYPTION_DEFAULT,
+ FIL_DEFAULT_ENCRYPTION_KEY);
- trx->dict_operation = op;
- } else {
-err_exit:
- new_table = NULL;
- ib::warn() << "Failed to create FTS index table "
- << table_name;
- trx->error_state = error;
- return NULL;
- }
-
- if (error != DB_SUCCESS) {
- dict_mem_table_free(new_table);
- trx->error_state = DB_SUCCESS;
- row_drop_table_for_mysql(table_name, trx, SQLCOM_DROP_DB);
- goto err_exit;
+ if (error == DB_SUCCESS) {
+ return new_table;
+ }
}
- return(new_table);
+ ib::warn() << "Failed to create FTS index table " << table_name;
+ trx->error_state = error;
+ return NULL;
}
/** Creates the column specific ancillary tables needed for supporting an
-FTS index on the given table. row_mysql_lock_data_dictionary must have
-been called before this.
+FTS index on the given table.
All FTS AUX Index tables have the following schema.
CREAT TABLE $FTS_PREFIX_INDEX_[1-6](
@@ -2084,17 +2081,11 @@ fts_create_index_tables(trx_t* trx, const dict_index_t* index, table_id_t id)
fts_table.table_id = id;
fts_table.table = index->table;
- /* aux_idx_tables vector is used for dropping FTS AUX INDEX
- tables on error condition. */
- std::vector<dict_table_t*> aux_idx_tables;
- std::vector<dict_table_t*>::const_iterator it;
-
for (i = 0; i < FTS_NUM_AUX_INDEX && error == DB_SUCCESS; ++i) {
dict_table_t* new_table;
/* Create the FTS auxiliary tables that are specific
- to an FTS index. We need to preserve the table_id %s
- which fts_parse_sql_no_dict_lock() will fill in for us. */
+ to an FTS index. */
fts_table.suffix = fts_get_suffix(i);
new_table = fts_create_one_index_table(
@@ -2103,32 +2094,11 @@ fts_create_index_tables(trx_t* trx, const dict_index_t* index, table_id_t id)
if (new_table == NULL) {
error = DB_FAIL;
break;
- } else {
- aux_idx_tables.push_back(new_table);
}
mem_heap_empty(heap);
-
- DBUG_EXECUTE_IF("ib_fts_index_table_error",
- /* Return error after creating FTS_INDEX_5
- aux table. */
- if (i == 4) {
- error = DB_FAIL;
- break;
- }
- );
}
- if (error != DB_SUCCESS) {
-
- for (it = aux_idx_tables.begin(); it != aux_idx_tables.end();
- ++it) {
- row_drop_table_for_mysql((*it)->name.m_name, trx,
- SQLCOM_DROP_DB);
- }
- }
-
- aux_idx_tables.clear();
mem_heap_free(heap);
return(error);
@@ -2571,9 +2541,9 @@ fts_get_next_doc_id(
}
DEBUG_SYNC_C("get_next_FTS_DOC_ID");
- mutex_enter(&cache->doc_id_lock);
+ mysql_mutex_lock(&cache->doc_id_lock);
*doc_id = cache->next_doc_id++;
- mutex_exit(&cache->doc_id_lock);
+ mysql_mutex_unlock(&cache->doc_id_lock);
return(DB_SUCCESS);
}
@@ -2595,6 +2565,10 @@ fts_cmp_set_sync_doc_id(
to the one stored in CONFIG
table */
{
+ if (srv_read_only_mode) {
+ return DB_READ_ONLY;
+ }
+
trx_t* trx;
pars_info_t* info;
dberr_t error;
@@ -2602,7 +2576,6 @@ fts_cmp_set_sync_doc_id(
que_t* graph = NULL;
fts_cache_t* cache = table->fts->cache;
char table_name[MAX_FULL_NAME_LEN];
-retry:
ut_a(table->fts->doc_col != ULINT_UNDEFINED);
fts_table.suffix = "CONFIG";
@@ -2610,12 +2583,9 @@ retry:
fts_table.type = FTS_COMMON_TABLE;
fts_table.table = table;
- trx = trx_create();
- if (srv_read_only_mode) {
- trx_start_internal_read_only(trx);
- } else {
- trx_start_internal(trx);
- }
+ trx= trx_create();
+retry:
+ trx_start_internal(trx);
trx->op_info = "update the next FTS document id";
@@ -2647,7 +2617,7 @@ retry:
error = fts_eval_sql(trx, graph);
- fts_que_graph_free_check_lock(&fts_table, NULL, graph);
+ que_graph_free(graph);
// FIXME: We need to retry deadlock errors
if (error != DB_SUCCESS) {
@@ -2668,13 +2638,13 @@ retry:
cache->synced_doc_id = ut_max(cmp_doc_id, *doc_id);
}
- mutex_enter(&cache->doc_id_lock);
+ mysql_mutex_lock(&cache->doc_id_lock);
/* For each sync operation, we will add next_doc_id by 1,
so to mark a sync operation */
if (cache->next_doc_id < cache->synced_doc_id + 1) {
cache->next_doc_id = cache->synced_doc_id + 1;
}
- mutex_exit(&cache->doc_id_lock);
+ mysql_mutex_unlock(&cache->doc_id_lock);
if (cmp_doc_id && cmp_doc_id >= *doc_id) {
error = fts_update_sync_doc_id(
@@ -2694,8 +2664,9 @@ func_exit:
"for table " << table->name;
fts_sql_rollback(trx);
- if (error == DB_DEADLOCK) {
- os_thread_sleep(FTS_DEADLOCK_RETRY_WAIT);
+ if (error == DB_DEADLOCK || error == DB_LOCK_WAIT_TIMEOUT) {
+ DEBUG_SYNC_C("fts_cmp_set_sync_doc_id_retry");
+ std::this_thread::sleep_for(FTS_DEADLOCK_RETRY_WAIT);
goto retry;
}
}
@@ -2763,7 +2734,7 @@ fts_update_sync_doc_id(
error = fts_eval_sql(trx, graph);
- fts_que_graph_free_check_lock(&fts_table, NULL, graph);
+ que_graph_free(graph);
if (local_trx) {
if (UNIV_LIKELY(error == DB_SUCCESS)) {
@@ -2818,9 +2789,9 @@ fts_add(
fts_add_doc_by_id(ftt, doc_id);
- mutex_enter(&table->fts->cache->deleted_lock);
+ mysql_mutex_lock(&table->fts->cache->deleted_lock);
++table->fts->cache->added;
- mutex_exit(&table->fts->cache->deleted_lock);
+ mysql_mutex_unlock(&table->fts->cache->deleted_lock);
if (!DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS_HAS_DOC_ID)
&& doc_id >= table->fts->cache->next_doc_id) {
@@ -2840,7 +2811,6 @@ fts_delete(
{
que_t* graph;
fts_table_t fts_table;
- dberr_t error = DB_SUCCESS;
doc_id_t write_doc_id;
dict_table_t* table = ftt->table;
doc_id_t doc_id = row->doc_id;
@@ -2851,7 +2821,7 @@ fts_delete(
/* we do not index Documents whose Doc ID value is 0 */
if (doc_id == FTS_NULL_DOC_ID) {
ut_ad(!DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS_HAS_DOC_ID));
- return(error);
+ return DB_SUCCESS;
}
ut_a(row->state == FTS_DELETE || row->state == FTS_MODIFY);
@@ -2868,7 +2838,7 @@ fts_delete(
is re-established and sync-ed */
if (table->fts->added_synced
&& doc_id > cache->synced_doc_id) {
- mutex_enter(&table->fts->cache->deleted_lock);
+ mysql_mutex_lock(&table->fts->cache->deleted_lock);
/* The Doc ID could belong to those left in
ADDED table from last crash. So need to check
@@ -2879,45 +2849,36 @@ fts_delete(
--table->fts->cache->added;
}
- mutex_exit(&table->fts->cache->deleted_lock);
+ mysql_mutex_unlock(&table->fts->cache->deleted_lock);
/* Only if the row was really deleted. */
ut_a(row->state == FTS_DELETE || row->state == FTS_MODIFY);
}
/* Note the deleted document for OPTIMIZE to purge. */
- if (error == DB_SUCCESS) {
- char table_name[MAX_FULL_NAME_LEN];
-
- trx->op_info = "adding doc id to FTS DELETED";
+ char table_name[MAX_FULL_NAME_LEN];
- info->graph_owns_us = TRUE;
+ trx->op_info = "adding doc id to FTS DELETED";
- fts_table.suffix = "DELETED";
+ fts_table.suffix = "DELETED";
- fts_get_table_name(&fts_table, table_name);
- pars_info_bind_id(info, "deleted", table_name);
-
- graph = fts_parse_sql(
- &fts_table,
- info,
- "BEGIN INSERT INTO $deleted VALUES (:doc_id);");
+ fts_get_table_name(&fts_table, table_name);
+ pars_info_bind_id(info, "deleted", table_name);
- error = fts_eval_sql(trx, graph);
+ graph = fts_parse_sql(&fts_table, info,
+ "BEGIN INSERT INTO $deleted VALUES (:doc_id);");
- fts_que_graph_free(graph);
- } else {
- pars_info_free(info);
- }
+ dberr_t error = fts_eval_sql(trx, graph);
+ que_graph_free(graph);
/* Increment the total deleted count, this is used to calculate the
number of documents indexed. */
if (error == DB_SUCCESS) {
- mutex_enter(&table->fts->cache->deleted_lock);
+ mysql_mutex_lock(&table->fts->cache->deleted_lock);
++table->fts->cache->deleted;
- mutex_exit(&table->fts->cache->deleted_lock);
+ mysql_mutex_unlock(&table->fts->cache->deleted_lock);
}
return(error);
@@ -2973,11 +2934,11 @@ fts_commit_table(
ftt->fts_trx->trx = trx;
if (cache->get_docs == NULL) {
- rw_lock_x_lock(&cache->init_lock);
+ mysql_mutex_lock(&cache->init_lock);
if (cache->get_docs == NULL) {
cache->get_docs = fts_get_docs_create(cache);
}
- rw_lock_x_unlock(&cache->init_lock);
+ mysql_mutex_unlock(&cache->init_lock);
}
for (node = rbt_first(rows);
@@ -3342,7 +3303,7 @@ fts_add_doc_from_tuple(
if (doc.found) {
mtr_commit(&mtr);
- rw_lock_x_lock(&table->fts->cache->lock);
+ mysql_mutex_lock(&table->fts->cache->lock);
if (table->fts->cache->stopword_info.status
& STOPWORD_NOT_INIT) {
@@ -3355,11 +3316,11 @@ fts_add_doc_from_tuple(
get_doc->index_cache,
doc_id, doc.tokens);
- rw_lock_x_unlock(&table->fts->cache->lock);
+ mysql_mutex_unlock(&table->fts->cache->lock);
if (cache->total_size > fts_max_cache_size / 5
|| fts_need_sync) {
- fts_sync(cache->sync, true, false);
+ fts_sync(cache->sync);
}
mtr_start(&mtr);
@@ -3375,10 +3336,9 @@ fts_add_doc_from_tuple(
/*********************************************************************//**
This function fetches the document inserted during the committing
transaction, and tokenize the inserted text data and insert into
-FTS auxiliary table and its cache.
-@return TRUE if successful */
+FTS auxiliary table and its cache. */
static
-ulint
+void
fts_add_doc_by_id(
/*==============*/
fts_trx_table_t*ftt, /*!< in: FTS trx table */
@@ -3396,7 +3356,7 @@ fts_add_doc_by_id(
dict_index_t* fts_id_index;
ibool is_id_cluster;
fts_cache_t* cache = ftt->table->fts->cache;
-
+ bool need_sync= false;
ut_ad(cache->get_docs);
/* If Doc ID has been supplied by the user, then the table
@@ -3422,7 +3382,6 @@ fts_add_doc_by_id(
is_id_cluster = (clust_index == fts_id_index);
mtr_start(&mtr);
- btr_pcur_init(&pcur);
/* Search based on Doc ID. Here, we'll need to consider the case
when there is no primary index on Doc ID */
@@ -3434,6 +3393,7 @@ fts_add_doc_by_id(
mach_write_to_8((byte*) &temp_doc_id, doc_id);
dfield_set_data(dfield, &temp_doc_id, sizeof(temp_doc_id));
+ pcur.btr_cur.page_cur.index = fts_id_index;
if (n_uniq == 2) {
ut_ad(table->versioned());
@@ -3450,12 +3410,11 @@ fts_add_doc_by_id(
}
}
- btr_pcur_open_with_no_init(
- fts_id_index, tuple, PAGE_CUR_LE, BTR_SEARCH_LEAF,
- &pcur, &mtr);
-
/* If we have a match, add the data to doc structure */
- if (btr_pcur_get_low_match(&pcur) == n_uniq) {
+ if (btr_pcur_open_with_no_init(tuple, PAGE_CUR_LE,
+ BTR_SEARCH_LEAF, &pcur, &mtr)
+ == DB_SUCCESS
+ && btr_pcur_get_low_match(&pcur) == n_uniq) {
const rec_t* rec;
btr_pcur_t* doc_pcur;
const rec_t* clust_rec;
@@ -3479,7 +3438,6 @@ fts_add_doc_by_id(
dtuple_t* clust_ref;
ulint n_fields;
- btr_pcur_init(&clust_pcur);
n_fields = dict_index_get_n_unique(clust_index);
clust_ref = dtuple_create(heap, n_fields);
@@ -3487,14 +3445,18 @@ fts_add_doc_by_id(
row_build_row_ref_in_tuple(
clust_ref, rec, fts_id_index, NULL);
+ clust_pcur.btr_cur.page_cur.index = clust_index;
- btr_pcur_open_with_no_init(
- clust_index, clust_ref, PAGE_CUR_LE,
- BTR_SEARCH_LEAF, &clust_pcur, &mtr);
+ if (btr_pcur_open_with_no_init(clust_ref,
+ PAGE_CUR_LE,
+ BTR_SEARCH_LEAF,
+ &clust_pcur, &mtr)
+ != DB_SUCCESS) {
+ goto func_exit;
+ }
doc_pcur = &clust_pcur;
clust_rec = btr_pcur_get_rec(&clust_pcur);
-
}
offsets = rec_get_offsets(clust_rec, clust_index, NULL,
@@ -3521,7 +3483,7 @@ fts_add_doc_by_id(
btr_pcur_store_position(doc_pcur, &mtr);
mtr_commit(&mtr);
- rw_lock_x_lock(&table->fts->cache->lock);
+ mysql_mutex_lock(&table->fts->cache->lock);
if (table->fts->cache->stopword_info.status
& STOPWORD_NOT_INIT) {
@@ -3534,46 +3496,41 @@ fts_add_doc_by_id(
get_doc->index_cache,
doc_id, doc.tokens);
- bool need_sync = !cache->sync->in_progress
- && (fts_need_sync
- || (cache->total_size
- - cache->total_size_at_sync)
- > fts_max_cache_size / 10);
- if (need_sync) {
- cache->total_size_at_sync =
- cache->total_size;
- }
+ /** FTS cache sync should happen
+ frequently. Because user thread
+ shouldn't hold the cache lock for
+ longer time. So cache should sync
+ whenever cache size exceeds 512 KB */
+ need_sync =
+ cache->total_size > 512*1024;
- rw_lock_x_unlock(&table->fts->cache->lock);
+ mysql_mutex_unlock(&table->fts->cache->lock);
DBUG_EXECUTE_IF(
"fts_instrument_sync",
- fts_optimize_request_sync_table(table);
- os_event_wait(cache->sync->event);
+ fts_sync_table(table);
);
DBUG_EXECUTE_IF(
"fts_instrument_sync_debug",
- fts_sync(cache->sync, true, true);
+ fts_sync(cache->sync);
);
DEBUG_SYNC_C("fts_instrument_sync_request");
DBUG_EXECUTE_IF(
"fts_instrument_sync_request",
- fts_optimize_request_sync_table(table);
+ need_sync= true;
);
- if (need_sync) {
- fts_optimize_request_sync_table(table);
- }
-
mtr_start(&mtr);
if (i < num_idx - 1) {
- ut_d(auto status=)
- btr_pcur_restore_position(
- BTR_SEARCH_LEAF, doc_pcur, &mtr);
- ut_ad(status == btr_pcur_t::SAME_ALL);
+ if (doc_pcur->restore_position(
+ BTR_SEARCH_LEAF, &mtr)
+ != btr_pcur_t::SAME_ALL) {
+ ut_ad("invalid state" == 0);
+ i = num_idx - 1;
+ }
}
}
@@ -3581,16 +3538,19 @@ fts_add_doc_by_id(
}
if (!is_id_cluster) {
- btr_pcur_close(doc_pcur);
+ ut_free(doc_pcur->old_rec_buf);
}
}
func_exit:
mtr_commit(&mtr);
- btr_pcur_close(&pcur);
+ ut_free(pcur.old_rec_buf);
mem_heap_free(heap);
- return(TRUE);
+
+ if (need_sync) {
+ fts_sync_table(table);
+ }
}
@@ -3643,13 +3603,11 @@ fts_get_max_doc_id(
ut_ad(innobase_strcasecmp(FTS_DOC_ID_COL_NAME, dfield->name) == 0);
#endif
- mtr_start(&mtr);
+ mtr.start();
/* fetch the largest indexes value */
- btr_pcur_open_at_index_side(
- false, index, BTR_SEARCH_LEAF, &pcur, true, 0, &mtr);
-
- if (!page_is_empty(btr_pcur_get_page(&pcur))) {
+ if (pcur.open_leaf(false, index, BTR_SEARCH_LEAF, &mtr) == DB_SUCCESS
+ && !page_is_empty(btr_pcur_get_page(&pcur))) {
const rec_t* rec = NULL;
constexpr ulint doc_id_len= 8;
@@ -3689,8 +3647,7 @@ fts_get_max_doc_id(
}
func_exit:
- btr_pcur_close(&pcur);
- mtr_commit(&mtr);
+ mtr.commit();
return(doc_id);
}
@@ -3810,7 +3767,7 @@ fts_doc_fetch_by_doc_id(
trx->free();
if (!get_doc) {
- fts_que_graph_free(graph);
+ que_graph_free(graph);
}
return(error);
@@ -3939,7 +3896,7 @@ fts_sync_add_deleted_cache(
error = fts_eval_sql(sync->trx, graph);
}
- fts_que_graph_free(graph);
+ que_graph_free(graph);
return(error);
}
@@ -3953,15 +3910,13 @@ static MY_ATTRIBUTE((nonnull, warn_unused_result))
dberr_t
fts_sync_write_words(
trx_t* trx,
- fts_index_cache_t* index_cache,
- bool unlock_cache)
+ fts_index_cache_t* index_cache)
{
fts_table_t fts_table;
ulint n_nodes = 0;
ulint n_words = 0;
const ib_rbt_node_t* rbt_node;
dberr_t error = DB_SUCCESS;
- ibool print_error = FALSE;
dict_table_t* table = index_cache->index->table;
FTS_INIT_INDEX_TABLE(
@@ -3981,8 +3936,10 @@ fts_sync_write_words(
word = rbt_value(fts_tokenizer_word_t, rbt_node);
- DBUG_EXECUTE_IF("fts_instrument_write_words_before_select_index",
- os_thread_sleep(300000););
+ DBUG_EXECUTE_IF(
+ "fts_instrument_write_words_before_select_index",
+ std::this_thread::sleep_for(
+ std::chrono::milliseconds(300)););
selected = fts_select_index(
index_cache->charset, word->text.f_str,
@@ -3990,52 +3947,36 @@ fts_sync_write_words(
fts_table.suffix = fts_get_suffix(selected);
- /* We iterate over all the nodes even if there was an error */
for (i = 0; i < ib_vector_size(word->nodes); ++i) {
fts_node_t* fts_node = static_cast<fts_node_t*>(
ib_vector_get(word->nodes, i));
- if (fts_node->synced) {
- continue;
- } else {
- fts_node->synced = true;
- }
-
- /*FIXME: we need to handle the error properly. */
- if (error == DB_SUCCESS) {
- if (unlock_cache) {
- rw_lock_x_unlock(
- &table->fts->cache->lock);
- }
-
- error = fts_write_node(
- trx,
- &index_cache->ins_graph[selected],
- &fts_table, &word->text, fts_node);
+ error = fts_write_node(
+ trx, &index_cache->ins_graph[selected],
+ &fts_table, &word->text, fts_node);
- DEBUG_SYNC_C("fts_write_node");
- DBUG_EXECUTE_IF("fts_write_node_crash",
+ DEBUG_SYNC_C("fts_write_node");
+ DBUG_EXECUTE_IF("fts_write_node_crash",
DBUG_SUICIDE(););
- DBUG_EXECUTE_IF("fts_instrument_sync_sleep",
- os_thread_sleep(1000000);
- );
+ DBUG_EXECUTE_IF("fts_instrument_sync_sleep",
+ std::this_thread::sleep_for(
+ std::chrono::seconds(1)););
- if (unlock_cache) {
- rw_lock_x_lock(
- &table->fts->cache->lock);
- }
+ if (error != DB_SUCCESS) {
+ goto err_exit;
}
}
n_nodes += ib_vector_size(word->nodes);
- if (UNIV_UNLIKELY(error != DB_SUCCESS) && !print_error) {
+ if (UNIV_UNLIKELY(error != DB_SUCCESS)) {
+err_exit:
ib::error() << "(" << error << ") writing"
" word node to FTS auxiliary index table "
<< table->name;
- print_error = TRUE;
+ break;
}
}
@@ -4094,58 +4035,44 @@ fts_sync_index(
ut_ad(rbt_validate(index_cache->words));
- return(fts_sync_write_words(trx, index_cache, sync->unlock_cache));
+ return(fts_sync_write_words(trx, index_cache));
}
-/** Check if index cache has been synced completely
-@param[in,out] index_cache index cache
-@return true if index is synced, otherwise false. */
+/** Rollback a sync operation
+@param[in,out] sync sync state */
static
-bool
-fts_sync_index_check(
- fts_index_cache_t* index_cache)
+void
+fts_sync_rollback(
+ fts_sync_t* sync)
{
- const ib_rbt_node_t* rbt_node;
-
- for (rbt_node = rbt_first(index_cache->words);
- rbt_node != NULL;
- rbt_node = rbt_next(index_cache->words, rbt_node)) {
+ trx_t* trx = sync->trx;
+ fts_cache_t* cache = sync->table->fts->cache;
- fts_tokenizer_word_t* word;
- word = rbt_value(fts_tokenizer_word_t, rbt_node);
+ for (ulint i = 0; i < ib_vector_size(cache->indexes); ++i) {
+ ulint j;
+ fts_index_cache_t* index_cache;
- fts_node_t* fts_node;
- fts_node = static_cast<fts_node_t*>(ib_vector_last(word->nodes));
+ index_cache = static_cast<fts_index_cache_t*>(
+ ib_vector_get(cache->indexes, i));
- if (!fts_node->synced) {
- return(false);
- }
- }
+ for (j = 0; fts_index_selector[j].value; ++j) {
- return(true);
-}
+ if (index_cache->ins_graph[j] != NULL) {
-/** Reset synced flag in index cache when rollback
-@param[in,out] index_cache index cache */
-static
-void
-fts_sync_index_reset(
- fts_index_cache_t* index_cache)
-{
- const ib_rbt_node_t* rbt_node;
+ que_graph_free(index_cache->ins_graph[j]);
- for (rbt_node = rbt_first(index_cache->words);
- rbt_node != NULL;
- rbt_node = rbt_next(index_cache->words, rbt_node)) {
+ index_cache->ins_graph[j] = NULL;
+ }
+ }
+ }
- fts_tokenizer_word_t* word;
- word = rbt_value(fts_tokenizer_word_t, rbt_node);
+ mysql_mutex_unlock(&cache->lock);
- fts_node_t* fts_node;
- fts_node = static_cast<fts_node_t*>(ib_vector_last(word->nodes));
+ fts_sql_rollback(trx);
- fts_node->synced = false;
- }
+ /* Avoid assertion in trx_t::free(). */
+ trx->dict_operation_lock_mode = false;
+ trx->free();
}
/** Commit the SYNC, change state of processed doc ids etc.
@@ -4178,19 +4105,20 @@ fts_sync_commit(
sync, cache->deleted_doc_ids);
}
- /* We need to do this within the deleted lock since fts_delete() can
- attempt to add a deleted doc id to the cache deleted id array. */
- fts_cache_clear(cache);
- DEBUG_SYNC_C("fts_deleted_doc_ids_clear");
- fts_cache_init(cache);
- rw_lock_x_unlock(&cache->lock);
-
if (UNIV_LIKELY(error == DB_SUCCESS)) {
+ /* We need to do this within the deleted lock
+ since fts_delete() can attempt to add a deleted
+ doc id to the cache deleted id array. */
+ fts_cache_clear(cache);
+ DEBUG_SYNC_C("fts_deleted_doc_ids_clear");
+ fts_cache_init(cache);
+ mysql_mutex_unlock(&cache->lock);
fts_sql_commit(trx);
} else {
- fts_sql_rollback(trx);
ib::error() << "(" << error << ") during SYNC of "
"table " << sync->table->name;
+ fts_sync_rollback(sync);
+ return error;
}
if (UNIV_UNLIKELY(fts_enable_diag_print) && elapsed_time) {
@@ -4204,76 +4132,19 @@ fts_sync_commit(
}
/* Avoid assertion in trx_t::free(). */
- trx->dict_operation_lock_mode = 0;
+ trx->dict_operation_lock_mode = false;
trx->free();
return(error);
}
-/** Rollback a sync operation
-@param[in,out] sync sync state */
-static
-void
-fts_sync_rollback(
- fts_sync_t* sync)
-{
- trx_t* trx = sync->trx;
- fts_cache_t* cache = sync->table->fts->cache;
-
- for (ulint i = 0; i < ib_vector_size(cache->indexes); ++i) {
- ulint j;
- fts_index_cache_t* index_cache;
-
- index_cache = static_cast<fts_index_cache_t*>(
- ib_vector_get(cache->indexes, i));
-
- /* Reset synced flag so nodes will not be skipped
- in the next sync, see fts_sync_write_words(). */
- fts_sync_index_reset(index_cache);
-
- for (j = 0; fts_index_selector[j].value; ++j) {
-
- if (index_cache->ins_graph[j] != NULL) {
-
- fts_que_graph_free_check_lock(
- NULL, index_cache,
- index_cache->ins_graph[j]);
-
- index_cache->ins_graph[j] = NULL;
- }
-
- if (index_cache->sel_graph[j] != NULL) {
-
- fts_que_graph_free_check_lock(
- NULL, index_cache,
- index_cache->sel_graph[j]);
-
- index_cache->sel_graph[j] = NULL;
- }
- }
- }
-
- rw_lock_x_unlock(&cache->lock);
-
- fts_sql_rollback(trx);
-
- /* Avoid assertion in trx_t::free(). */
- trx->dict_operation_lock_mode = 0;
- trx->free();
-}
-
/** Run SYNC on the table, i.e., write out data from the cache to the
FTS auxiliary INDEX table and clear the cache at the end.
@param[in,out] sync sync state
@param[in] unlock_cache whether unlock cache lock when write node
@param[in] wait whether wait when a sync is in progress
@return DB_SUCCESS if all OK */
-static
-dberr_t
-fts_sync(
- fts_sync_t* sync,
- bool unlock_cache,
- bool wait)
+static dberr_t fts_sync(fts_sync_t *sync)
{
if (srv_read_only_mode) {
return DB_READ_ONLY;
@@ -4282,37 +4153,15 @@ fts_sync(
ulint i;
dberr_t error = DB_SUCCESS;
fts_cache_t* cache = sync->table->fts->cache;
- size_t fts_cache_size= 0;
- rw_lock_x_lock(&cache->lock);
-
- /* Check if cache is being synced.
- Note: we release cache lock in fts_sync_write_words() to
- avoid long wait for the lock by other threads. */
- while (sync->in_progress) {
- rw_lock_x_unlock(&cache->lock);
-
- if (wait) {
- os_event_wait(sync->event);
- } else {
- return(DB_SUCCESS);
- }
-
- rw_lock_x_lock(&cache->lock);
- }
-
- sync->unlock_cache = unlock_cache;
- sync->in_progress = true;
+ mysql_mutex_lock(&cache->lock);
DEBUG_SYNC_C("fts_sync_begin");
fts_sync_begin(sync);
-begin_sync:
- fts_cache_size= fts_max_cache_size;
+ const size_t fts_cache_size= fts_max_cache_size;
if (cache->total_size > fts_cache_size) {
/* Avoid the case: sync never finish when
insert/update keeps comming. */
- ut_ad(sync->unlock_cache);
- sync->unlock_cache = false;
ib::warn() << "Total InnoDB FTS size "
<< cache->total_size << " for the table "
<< cache->sync->table->name
@@ -4326,74 +4175,45 @@ begin_sync:
index_cache = static_cast<fts_index_cache_t*>(
ib_vector_get(cache->indexes, i));
- if (index_cache->index->to_be_dropped
- || index_cache->index->table->to_be_dropped) {
+ if (index_cache->index->to_be_dropped) {
continue;
}
DBUG_EXECUTE_IF("fts_instrument_sync_before_syncing",
- os_thread_sleep(300000););
+ std::this_thread::sleep_for(
+ std::chrono::milliseconds(300)););
error = fts_sync_index(sync, index_cache);
if (error != DB_SUCCESS) {
- goto end_sync;
- }
-
- if (!sync->unlock_cache
- && cache->total_size < fts_max_cache_size) {
- /* Reset the unlock cache if the value
- is less than innodb_ft_cache_size */
- sync->unlock_cache = true;
+ goto err_exit;
}
}
DBUG_EXECUTE_IF("fts_instrument_sync_interrupted",
- sync->interrupted = true;
error = DB_INTERRUPTED;
- goto end_sync;
+ goto err_exit;
);
- /* Make sure all the caches are synced. */
- for (i = 0; i < ib_vector_size(cache->indexes); ++i) {
- fts_index_cache_t* index_cache;
-
- index_cache = static_cast<fts_index_cache_t*>(
- ib_vector_get(cache->indexes, i));
-
- if (index_cache->index->to_be_dropped
- || index_cache->index->table->to_be_dropped
- || fts_sync_index_check(index_cache)) {
- continue;
- }
-
- goto begin_sync;
- }
-
-end_sync:
- if (error == DB_SUCCESS && !sync->interrupted) {
+ if (error == DB_SUCCESS) {
error = fts_sync_commit(sync);
} else {
+err_exit:
fts_sync_rollback(sync);
+ return error;
}
- rw_lock_x_lock(&cache->lock);
-
- sync->interrupted = false;
- sync->in_progress = false;
- os_event_set(sync->event);
- rw_lock_x_unlock(&cache->lock);
-
/* We need to check whether an optimize is required, for that
we make copies of the two variables that control the trigger. These
variables can change behind our back and we don't want to hold the
lock for longer than is needed. */
- mutex_enter(&cache->deleted_lock);
+ mysql_mutex_lock(&cache->deleted_lock);
cache->added = 0;
cache->deleted = 0;
- mutex_exit(&cache->deleted_lock);
+ mysql_mutex_unlock(&cache->deleted_lock);
+ DEBUG_SYNC_C("fts_sync_end");
return(error);
}
@@ -4402,18 +4222,13 @@ FTS auxiliary INDEX table and clear the cache at the end.
@param[in,out] table fts table
@param[in] wait whether wait for existing sync to finish
@return DB_SUCCESS on success, error code on failure. */
-dberr_t fts_sync_table(dict_table_t* table, bool wait)
+dberr_t fts_sync_table(dict_table_t* table)
{
- dberr_t err = DB_SUCCESS;
-
- ut_ad(table->fts);
-
- if (table->space && table->fts->cache
- && !dict_table_is_corrupted(table)) {
- err = fts_sync(table->fts->cache->sync, !wait, wait);
- }
+ ut_ad(table->fts);
- return(err);
+ return table->space && !table->corrupted && table->fts->cache
+ ? fts_sync(table->fts->cache->sync)
+ : DB_SUCCESS;
}
/** Check if a fts token is a stopword or less than fts_min_token_size
@@ -4776,7 +4591,7 @@ fts_get_docs_create(
{
ib_vector_t* get_docs;
- ut_ad(rw_lock_own(&cache->init_lock, RW_LOCK_X));
+ mysql_mutex_assert_owner(&cache->init_lock);
/* We need one instance of fts_get_doc_t per index. */
get_docs = ib_vector_create(cache->self_heap, sizeof(fts_get_doc_t), 4);
@@ -4826,7 +4641,7 @@ fts_get_docs_clear(
ut_a(get_doc->index_cache);
- fts_que_graph_free(get_doc->get_document_graph);
+ que_graph_free(get_doc->get_document_graph);
get_doc->get_document_graph = NULL;
}
}
@@ -4842,11 +4657,11 @@ fts_init_doc_id(
{
doc_id_t max_doc_id = 0;
- rw_lock_x_lock(&table->fts->cache->lock);
+ mysql_mutex_lock(&table->fts->cache->lock);
/* Return if the table is already initialized for DOC ID */
if (table->fts->cache->first_doc_id != FTS_NULL_DOC_ID) {
- rw_lock_x_unlock(&table->fts->cache->lock);
+ mysql_mutex_unlock(&table->fts->cache->lock);
return(0);
}
@@ -4867,7 +4682,7 @@ fts_init_doc_id(
table->fts->cache->first_doc_id = max_doc_id;
- rw_lock_x_unlock(&table->fts->cache->lock);
+ mysql_mutex_unlock(&table->fts->cache->lock);
ut_ad(max_doc_id > 0);
@@ -4974,7 +4789,7 @@ fts_get_rows_count(
}
}
- fts_que_graph_free(graph);
+ que_graph_free(graph);
trx->free();
@@ -5074,7 +4889,7 @@ fts_savepoint_free(
/* The default savepoint name must be NULL. */
if (ftt->docs_added_graph) {
- fts_que_graph_free(ftt->docs_added_graph);
+ que_graph_free(ftt->docs_added_graph);
}
/* NOTE: We are responsible for free'ing the node */
@@ -5200,12 +5015,8 @@ fts_cache_find_word(
{
ib_rbt_bound_t parent;
const ib_vector_t* nodes = NULL;
-#ifdef UNIV_DEBUG
- dict_table_t* table = index_cache->index->table;
- fts_cache_t* cache = table->fts->cache;
- ut_ad(rw_lock_own(&cache->lock, RW_LOCK_X));
-#endif /* UNIV_DEBUG */
+ mysql_mutex_assert_owner(&index_cache->index->table->fts->cache->lock);
/* Lookup the word in the rb tree */
if (rbt_search(index_cache->words, &parent, text) == 0) {
@@ -5224,27 +5035,20 @@ Append deleted doc ids to vector. */
void
fts_cache_append_deleted_doc_ids(
/*=============================*/
- const fts_cache_t* cache, /*!< in: cache to use */
+ fts_cache_t* cache, /*!< in: cache to use */
ib_vector_t* vector) /*!< in: append to this vector */
{
- mutex_enter(const_cast<ib_mutex_t*>(&cache->deleted_lock));
-
- if (cache->deleted_doc_ids == NULL) {
- mutex_exit((ib_mutex_t*) &cache->deleted_lock);
- return;
- }
-
-
- for (ulint i = 0; i < ib_vector_size(cache->deleted_doc_ids); ++i) {
- doc_id_t* update;
+ mysql_mutex_lock(&cache->deleted_lock);
- update = static_cast<doc_id_t*>(
- ib_vector_get(cache->deleted_doc_ids, i));
-
- ib_vector_push(vector, &update);
- }
+ if (cache->deleted_doc_ids)
+ for (ulint i= 0; i < ib_vector_size(cache->deleted_doc_ids); ++i)
+ {
+ doc_id_t *update= static_cast<doc_id_t*>(
+ ib_vector_get(cache->deleted_doc_ids, i));
+ ib_vector_push(vector, &update);
+ }
- mutex_exit((ib_mutex_t*) &cache->deleted_lock);
+ mysql_mutex_unlock(&cache->deleted_lock);
}
/*********************************************************************//**
@@ -5704,17 +5508,18 @@ bool fts_check_aux_table(const char *name,
ut_ad(len <= MAX_FULL_NAME_LEN);
ptr= static_cast<const char*>(memchr(name, '/', len));
+ IF_WIN(if (!ptr) ptr= static_cast<const char*>(memchr(name, '\\', len)), );
- if (ptr != NULL)
- {
- /* We will start the match after the '/' */
- ++ptr;
- len = end - ptr;
- }
+ if (!ptr)
+ return false;
+
+ /* We will start the match after the '/' */
+ ++ptr;
+ len= end - ptr;
/* All auxiliary tables are prefixed with "FTS_" and the name
length will be at the very least greater than 20 bytes. */
- if (ptr && len > 20 && !memcmp(ptr, "FTS_", 4))
+ if (len > 24 && !memcmp(ptr, "FTS_", 4))
{
/* Skip the prefix. */
ptr+= 4;
@@ -5758,6 +5563,11 @@ bool fts_check_aux_table(const char *name,
ut_a(end > ptr);
len= end - ptr;
+ if (len <= 4)
+ return false;
+
+ len-= 4; /* .ibd suffix */
+
if (len > 7)
return false;
@@ -5776,165 +5586,6 @@ bool fts_check_aux_table(const char *name,
return false;
}
-typedef std::pair<table_id_t,index_id_t> fts_aux_id;
-typedef std::set<fts_aux_id> fts_space_set_t;
-
-/** Iterate over all the spaces in the space list and fetch the
-fts parent table id and index id.
-@param[in,out] fts_space_set store the list of tablespace id and
- index id */
-static void fil_get_fts_spaces(fts_space_set_t& fts_space_set)
-{
- mutex_enter(&fil_system.mutex);
-
- for (fil_space_t *space= UT_LIST_GET_FIRST(fil_system.space_list);
- space;
- space= UT_LIST_GET_NEXT(space_list, space))
- {
- index_id_t index_id= 0;
- table_id_t table_id= 0;
-
- if (space->purpose == FIL_TYPE_TABLESPACE
- && fts_check_aux_table(space->name, &table_id, &index_id))
- fts_space_set.insert(std::make_pair(table_id, index_id));
- }
-
- mutex_exit(&fil_system.mutex);
-}
-
-/** Check whether the parent table id and index id of fts auxilary
-tables with SYS_INDEXES. If it exists then we can safely ignore the
-fts table from orphaned tables.
-@param[in,out] fts_space_set fts space set contains set of auxiliary
- table ids */
-static void fts_check_orphaned_tables(fts_space_set_t& fts_space_set)
-{
- btr_pcur_t pcur;
- mtr_t mtr;
- trx_t* trx = trx_create();
- trx->op_info = "checking fts orphaned tables";
-
- row_mysql_lock_data_dictionary(trx);
-
- mtr.start();
- btr_pcur_open_at_index_side(
- true, dict_table_get_first_index(dict_sys.sys_indexes),
- BTR_SEARCH_LEAF, &pcur, true, 0, &mtr);
-
- do
- {
- const rec_t *rec;
- const byte *tbl_field;
- const byte *index_field;
- ulint len;
-
- btr_pcur_move_to_next_user_rec(&pcur, &mtr);
- if (!btr_pcur_is_on_user_rec(&pcur))
- break;
-
- rec= btr_pcur_get_rec(&pcur);
- if (rec_get_deleted_flag(rec, 0))
- continue;
-
- tbl_field= rec_get_nth_field_old(rec, 0, &len);
- if (len != 8)
- continue;
-
- index_field= rec_get_nth_field_old(rec, 1, &len);
- if (len != 8)
- continue;
-
- table_id_t table_id = mach_read_from_8(tbl_field);
- index_id_t index_id = mach_read_from_8(index_field);
-
- fts_space_set_t::iterator it = fts_space_set.find(
- fts_aux_id(table_id, index_id));
-
- if (it != fts_space_set.end())
- fts_space_set.erase(*it);
- else
- {
- it= fts_space_set.find(fts_aux_id(table_id, 0));
- if (it != fts_space_set.end())
- fts_space_set.erase(*it);
- }
- } while(!fts_space_set.empty());
-
- btr_pcur_close(&pcur);
- mtr.commit();
- row_mysql_unlock_data_dictionary(trx);
- trx->free();
-}
-
-/** Drop all fts auxilary table for the respective fts_id
-@param[in] fts_id fts auxilary table ids */
-static void fts_drop_all_aux_tables(trx_t *trx, fts_table_t *fts_table)
-{
- char fts_table_name[MAX_FULL_NAME_LEN];
- for (ulint i= 0;i < FTS_NUM_AUX_INDEX; i++)
- {
- fts_table->suffix= fts_get_suffix(i);
- fts_get_table_name(fts_table, fts_table_name, true);
-
- /* Drop all fts aux and common table */
- dberr_t err= fts_drop_table(trx, fts_table_name);
-
- if (err == DB_FAIL)
- {
- char *path= fil_make_filepath(NULL, fts_table_name, IBD, false);
-
- if (path != NULL)
- {
- os_file_delete_if_exists(innodb_data_file_key, path , NULL);
- ut_free(path);
- }
- }
- }
-}
-
-/** Drop all orphaned FTS auxiliary tables, those that don't have
-a parent table or FTS index defined on them. */
-void fts_drop_orphaned_tables()
-{
- fts_space_set_t fts_space_set;
- fil_get_fts_spaces(fts_space_set);
-
- if (fts_space_set.empty())
- return;
-
- fts_check_orphaned_tables(fts_space_set);
-
- if (fts_space_set.empty())
- return;
-
- trx_t* trx= trx_create();
- trx->op_info= "Drop orphaned aux FTS tables";
- row_mysql_lock_data_dictionary(trx);
-
- for (fts_space_set_t::iterator it = fts_space_set.begin();
- it != fts_space_set.end(); it++)
- {
- fts_table_t fts_table;
- dict_table_t *table= dict_table_open_on_id(it->first, TRUE,
- DICT_TABLE_OP_NORMAL);
- if (!table)
- continue;
-
- FTS_INIT_FTS_TABLE(&fts_table, NULL, FTS_COMMON_TABLE, table);
- fts_drop_common_tables(trx, &fts_table, true);
-
- fts_table.type= FTS_INDEX_TABLE;
- fts_table.index_id= it->second;
- fts_drop_all_aux_tables(trx, &fts_table);
-
- dict_table_close(table, true, false);
- }
- trx_commit_for_mysql(trx);
- row_mysql_unlock_data_dictionary(trx);
- trx->dict_operation_lock_mode= 0;
- trx->free();
-}
-
/**********************************************************************//**
Check whether user supplied stopword table is of the right format.
Caller is responsible to hold dictionary locks.
@@ -5956,7 +5607,8 @@ fts_valid_stopword_table(
return(NULL);
}
- table = dict_table_get_low(stopword_table_name);
+ table = dict_sys.load_table(
+ {stopword_table_name, strlen(stopword_table_name)});
if (!table) {
ib::error() << "User stopword table " << stopword_table_name
@@ -6034,11 +5686,11 @@ fts_load_stopword(
if (!trx) {
trx = trx_create();
- if (srv_read_only_mode) {
- trx_start_internal_read_only(trx);
- } else {
- trx_start_internal(trx);
- }
+#ifdef UNIV_DEBUG
+ trx->start_line = __LINE__;
+ trx->start_file = __FILE__;
+#endif
+ trx_start_internal_low(trx, !high_level_read_only);
trx->op_info = "upload FTS stopword";
new_trx = TRUE;
}
@@ -6292,13 +5944,12 @@ fts_init_recover_doc(
This function brings FTS index in sync when FTS index is first
used. There are documents that have not yet sync-ed to auxiliary
tables from last server abnormally shutdown, we will need to bring
-such document into FTS cache before any further operations
-@return TRUE if all OK */
-ibool
+such document into FTS cache before any further operations */
+void
fts_init_index(
/*===========*/
dict_table_t* table, /*!< in: Table with FTS */
- ibool has_cache_lock) /*!< in: Whether we already have
+ bool has_cache_lock) /*!< in: Whether we already have
cache lock */
{
dict_index_t* index;
@@ -6307,18 +5958,16 @@ fts_init_index(
fts_cache_t* cache = table->fts->cache;
bool need_init = false;
- ut_ad(!mutex_own(&dict_sys.mutex));
-
/* First check cache->get_docs is initialized */
if (!has_cache_lock) {
- rw_lock_x_lock(&cache->lock);
+ mysql_mutex_lock(&cache->lock);
}
- rw_lock_x_lock(&cache->init_lock);
+ mysql_mutex_lock(&cache->init_lock);
if (cache->get_docs == NULL) {
cache->get_docs = fts_get_docs_create(cache);
}
- rw_lock_x_unlock(&cache->init_lock);
+ mysql_mutex_unlock(&cache->init_lock);
if (table->fts->added_synced) {
goto func_exit;
@@ -6368,15 +6017,13 @@ fts_init_index(
func_exit:
if (!has_cache_lock) {
- rw_lock_x_unlock(&cache->lock);
+ mysql_mutex_unlock(&cache->lock);
}
if (need_init) {
- mutex_enter(&dict_sys.mutex);
+ dict_sys.lock(SRW_LOCK_CALL);
/* Register the table with the optimize thread. */
fts_optimize_add_table(table);
- mutex_exit(&dict_sys.mutex);
+ dict_sys.unlock();
}
-
- return(TRUE);
}
diff --git a/storage/innobase/fts/fts0opt.cc b/storage/innobase/fts/fts0opt.cc
index 9e3b4b3121d..7c40a25e6e7 100644
--- a/storage/innobase/fts/fts0opt.cc
+++ b/storage/innobase/fts/fts0opt.cc
@@ -66,8 +66,9 @@ static const ulint FTS_OPTIMIZE_INTERVAL_IN_SECS = 300;
/** Server is shutting down, so does we exiting the optimize thread */
static bool fts_opt_start_shutdown = false;
-/** Event to wait for shutdown of the optimize thread */
-static os_event_t fts_opt_shutdown_event = NULL;
+/** Condition variable for shutting down the optimize thread.
+Protected by fts_optimize_wq->mutex. */
+static pthread_cond_t fts_opt_shutdown_cond;
/** Initial size of nodes in fts_word_t. */
static const ulint FTS_WORD_NODES_INIT_SIZE = 64;
@@ -82,9 +83,8 @@ enum fts_msg_type_t {
FTS_MSG_ADD_TABLE, /*!< Add table to the optimize thread's
work queue */
- FTS_MSG_DEL_TABLE, /*!< Remove a table from the optimize
+ FTS_MSG_DEL_TABLE /*!< Remove a table from the optimize
threads work queue */
- FTS_MSG_SYNC_TABLE /*!< Sync fts cache of a table */
};
/** Compressed list of words that have been read from FTS INDEX
@@ -203,12 +203,12 @@ struct fts_slot_t {
};
/** A table remove message for the FTS optimize thread. */
-struct fts_msg_del_t {
- dict_table_t* table; /*!< The table to remove */
-
- os_event_t event; /*!< Event to synchronize acknowledgement
- of receipt and processing of the
- this message by the consumer */
+struct fts_msg_del_t
+{
+ /** the table to remove */
+ dict_table_t *table;
+ /** condition variable to signal message consumption */
+ pthread_cond_t *cond;
};
/** The FTS optimize message work queue message type. */
@@ -899,7 +899,7 @@ fts_index_fetch_words(
}
}
- fts_que_graph_free(graph);
+ que_graph_free(graph);
/* Check if max word to fetch is exceeded */
if (optim->zip->n_words >= n_words) {
@@ -1012,10 +1012,7 @@ fts_table_fetch_doc_ids(
error = fts_eval_sql(trx, graph);
fts_sql_commit(trx);
-
- mutex_enter(&dict_sys.mutex);
que_graph_free(graph);
- mutex_exit(&dict_sys.mutex);
if (error == DB_SUCCESS) {
ib_vector_sort(doc_ids->doc_ids, fts_doc_id_cmp);
@@ -1470,7 +1467,7 @@ fts_optimize_write_word(
" when deleting a word from the FTS index.";
}
- fts_que_graph_free(graph);
+ que_graph_free(graph);
graph = NULL;
/* Even if the operation needs to be rolled back and redone,
@@ -1502,7 +1499,7 @@ fts_optimize_write_word(
}
if (graph != NULL) {
- fts_que_graph_free(graph);
+ que_graph_free(graph);
}
return(error);
@@ -1614,8 +1611,21 @@ fts_optimize_create(
optim->fts_index_table.table = table;
/* The common prefix for all this parent table's aux tables. */
- optim->name_prefix = fts_get_table_name_prefix(
- &optim->fts_common_table);
+ char table_id[FTS_AUX_MIN_TABLE_ID_LENGTH];
+ const size_t table_id_len = 1
+ + size_t(fts_get_table_id(&optim->fts_common_table, table_id));
+ dict_sys.freeze(SRW_LOCK_CALL);
+ /* Include the separator as well. */
+ const size_t dbname_len = table->name.dblen() + 1;
+ ut_ad(dbname_len > 1);
+ const size_t prefix_name_len = dbname_len + 4 + table_id_len;
+ char* prefix_name = static_cast<char*>(
+ ut_malloc_nokey(prefix_name_len));
+ memcpy(prefix_name, table->name.m_name, dbname_len);
+ dict_sys.unfreeze();
+ memcpy(prefix_name + dbname_len, "FTS_", 4);
+ memcpy(prefix_name + dbname_len + 4, table_id, table_id_len);
+ optim->name_prefix =prefix_name;
return(optim);
}
@@ -1827,7 +1837,7 @@ fts_optimize_words(
charset, word->f_str,
word->f_len)
&& graph) {
- fts_que_graph_free(graph);
+ que_graph_free(graph);
graph = NULL;
}
}
@@ -1846,7 +1856,7 @@ fts_optimize_words(
}
if (graph != NULL) {
- fts_que_graph_free(graph);
+ que_graph_free(graph);
}
}
@@ -2079,7 +2089,7 @@ fts_optimize_purge_deleted_doc_ids(
}
}
- fts_que_graph_free(graph);
+ que_graph_free(graph);
return(error);
}
@@ -2116,7 +2126,7 @@ fts_optimize_purge_deleted_doc_id_snapshot(
graph = fts_parse_sql(NULL, info, fts_end_delete_sql);
error = fts_eval_sql(optim->trx, graph);
- fts_que_graph_free(graph);
+ que_graph_free(graph);
return(error);
}
@@ -2184,7 +2194,7 @@ fts_optimize_create_deleted_doc_id_snapshot(
error = fts_eval_sql(optim->trx, graph);
- fts_que_graph_free(graph);
+ que_graph_free(graph);
if (error != DB_SUCCESS) {
fts_sql_rollback(optim->trx);
@@ -2537,9 +2547,9 @@ fts_optimize_create_msg(
}
/** Add message to wqueue, signal thread pool*/
-static void add_msg(fts_msg_t *msg, bool wq_locked= false)
+static void add_msg(fts_msg_t *msg)
{
- ib_wqueue_add(fts_optimize_wq, msg, msg->heap, wq_locked);
+ ib_wqueue_add(fts_optimize_wq, msg, msg->heap, true);
srv_thread_pool->submit_task(&task);
}
@@ -2563,17 +2573,17 @@ void fts_optimize_add_table(dict_table_t* table)
}
/* Make sure table with FTS index cannot be evicted */
- dict_table_prevent_eviction(table);
+ dict_sys.prevent_eviction(table);
msg = fts_optimize_create_msg(FTS_MSG_ADD_TABLE, table);
- mutex_enter(&fts_optimize_wq->mutex);
+ mysql_mutex_lock(&fts_optimize_wq->mutex);
- add_msg(msg, true);
+ add_msg(msg);
table->fts->in_queue = true;
- mutex_exit(&fts_optimize_wq->mutex);
+ mysql_mutex_unlock(&fts_optimize_wq->mutex);
}
/**********************************************************************//**
@@ -2584,102 +2594,34 @@ fts_optimize_remove_table(
/*======================*/
dict_table_t* table) /*!< in: table to remove */
{
- fts_msg_t* msg;
- os_event_t event;
- fts_msg_del_t* remove;
-
- /* if the optimize system not yet initialized, return */
- if (!fts_optimize_wq) {
- return;
- }
-
- /* FTS optimizer thread is already exited */
- if (fts_opt_start_shutdown) {
- ib::info() << "Try to remove table " << table->name
- << " after FTS optimize thread exiting.";
- /* If the table can't be removed then wait till
- fts optimize thread shuts down */
- while (fts_optimize_wq) {
- os_thread_sleep(10000);
- }
- return;
- }
-
- mutex_enter(&fts_optimize_wq->mutex);
-
- if (!table->fts->in_queue) {
- mutex_exit(&fts_optimize_wq->mutex);
- return;
- }
-
- msg = fts_optimize_create_msg(FTS_MSG_DEL_TABLE, NULL);
-
- /* We will wait on this event until signalled by the consumer. */
- event = os_event_create(0);
-
- remove = static_cast<fts_msg_del_t*>(
- mem_heap_alloc(msg->heap, sizeof(*remove)));
-
- remove->table = table;
- remove->event = event;
- msg->ptr = remove;
-
- ut_ad(!mutex_own(&dict_sys.mutex));
-
- add_msg(msg, true);
-
- mutex_exit(&fts_optimize_wq->mutex);
-
- os_event_wait(event);
-
- os_event_destroy(event);
-
-#ifdef UNIV_DEBUG
- if (!fts_opt_start_shutdown) {
- mutex_enter(&fts_optimize_wq->mutex);
- ut_ad(!table->fts->in_queue);
- mutex_exit(&fts_optimize_wq->mutex);
- }
-#endif /* UNIV_DEBUG */
-}
-
-/** Send sync fts cache for the table.
-@param[in] table table to sync */
-void
-fts_optimize_request_sync_table(
- dict_table_t* table)
-{
- /* if the optimize system not yet initialized, return */
- if (!fts_optimize_wq) {
- return;
- }
-
- /* FTS optimizer thread is already exited */
- if (fts_opt_start_shutdown) {
- ib::info() << "Try to sync table " << table->name
- << " after FTS optimize thread exiting.";
- return;
- }
-
- mutex_enter(&fts_optimize_wq->mutex);
-
- if (table->fts->sync_message) {
- /* If the table already has SYNC message in
- fts_optimize_wq queue then ignore it */
- mutex_exit(&fts_optimize_wq->mutex);
- return;
- }
-
- fts_msg_t* msg = fts_optimize_create_msg(FTS_MSG_SYNC_TABLE, table);
+ if (!fts_optimize_wq)
+ return;
- add_msg(msg, true);
+ if (fts_opt_start_shutdown)
+ {
+ ib::info() << "Try to remove table " << table->name
+ << " after FTS optimize thread exiting.";
+ while (fts_optimize_wq)
+ std::this_thread::sleep_for(std::chrono::milliseconds(10));
+ return;
+ }
- table->fts->sync_message = true;
+ mysql_mutex_lock(&fts_optimize_wq->mutex);
- DBUG_EXECUTE_IF("fts_optimize_wq_count_check",
- DBUG_ASSERT(fts_optimize_wq->length <= 1000););
+ if (table->fts->in_queue)
+ {
+ fts_msg_t *msg= fts_optimize_create_msg(FTS_MSG_DEL_TABLE, nullptr);
+ pthread_cond_t cond;
+ pthread_cond_init(&cond, nullptr);
+ msg->ptr= new(mem_heap_alloc(msg->heap, sizeof(fts_msg_del_t)))
+ fts_msg_del_t{table, &cond};
+ add_msg(msg);
+ my_cond_wait(&cond, &fts_optimize_wq->mutex.m_mutex);
+ pthread_cond_destroy(&cond);
+ ut_ad(!table->fts->in_queue);
+ }
- mutex_exit(&fts_optimize_wq->mutex);
+ mysql_mutex_unlock(&fts_optimize_wq->mutex);
}
/** Add a table to fts_slots if it doesn't already exist. */
@@ -2714,9 +2656,10 @@ static bool fts_optimize_new_table(dict_table_t* table)
}
/** Remove a table from fts_slots if it exists.
-@param[in,out] table table to be removed from fts_slots */
-static bool fts_optimize_del_table(const dict_table_t* table)
+@param remove table to be removed from fts_slots */
+static bool fts_optimize_del_table(fts_msg_del_t *remove)
{
+ const dict_table_t* table = remove->table;
ut_ad(table);
for (ulint i = 0; i < ib_vector_size(fts_slots); ++i) {
fts_slot_t* slot;
@@ -2729,14 +2672,18 @@ static bool fts_optimize_del_table(const dict_table_t* table)
<< table->name;
}
- mutex_enter(&fts_optimize_wq->mutex);
- slot->table->fts->in_queue = false;
- mutex_exit(&fts_optimize_wq->mutex);
+ mysql_mutex_lock(&fts_optimize_wq->mutex);
+ table->fts->in_queue = false;
+ pthread_cond_signal(remove->cond);
+ mysql_mutex_unlock(&fts_optimize_wq->mutex);
slot->table = NULL;
return true;
}
}
+ mysql_mutex_lock(&fts_optimize_wq->mutex);
+ pthread_cond_signal(remove->cond);
+ mysql_mutex_unlock(&fts_optimize_wq->mutex);
return false;
}
@@ -2818,19 +2765,21 @@ static void fts_optimize_sync_table(dict_table_t *table,
if (sync_table->fts && sync_table->fts->cache && sync_table->is_accessible())
{
- fts_sync_table(sync_table, false);
+ fts_sync_table(sync_table);
+
if (process_message)
{
- mutex_enter(&fts_optimize_wq->mutex);
+ mysql_mutex_lock(&fts_optimize_wq->mutex);
sync_table->fts->sync_message = false;
- mutex_exit(&fts_optimize_wq->mutex);
+ mysql_mutex_unlock(&fts_optimize_wq->mutex);
}
}
- DBUG_EXECUTE_IF("ib_optimize_wq_hang", os_thread_sleep(6000000););
+ DBUG_EXECUTE_IF("ib_optimize_wq_hang",
+ std::this_thread::sleep_for(std::chrono::seconds(6)););
if (mdl_ticket)
- dict_table_close(sync_table, false, false, fts_opt_thd, mdl_ticket);
+ dict_table_close(sync_table, false, fts_opt_thd, mdl_ticket);
}
/**********************************************************************//**
@@ -2840,15 +2789,16 @@ static void fts_optimize_callback(void *)
{
ut_ad(!srv_read_only_mode);
- if (!fts_optimize_wq) {
+ static ulint current;
+ static bool done;
+ static ulint n_optimize;
+
+ if (!fts_optimize_wq || done) {
/* Possibly timer initiated callback, can come after FTS_MSG_STOP.*/
return;
}
- static ulint current = 0;
- static ibool done = FALSE;
static ulint n_tables = ib_vector_size(fts_slots);
- static ulint n_optimize = 0;
while (!done && srv_shutdown_state <= SRV_SHUTDOWN_INITIATED) {
/* If there is no message in the queue and we have tables
@@ -2897,7 +2847,7 @@ retry_later:
switch (msg->type) {
case FTS_MSG_STOP:
- done = TRUE;
+ done = true;
break;
case FTS_MSG_ADD_TABLE:
@@ -2912,31 +2862,10 @@ retry_later:
case FTS_MSG_DEL_TABLE:
if (fts_optimize_del_table(
static_cast<fts_msg_del_t*>(
- msg->ptr)->table)) {
+ msg->ptr))) {
--n_tables;
}
-
- /* Signal the producer that we have
- removed the table. */
- os_event_set(
- ((fts_msg_del_t*) msg->ptr)->event);
- break;
-
- case FTS_MSG_SYNC_TABLE:
- if (UNIV_UNLIKELY(wsrep_sst_disable_writes)) {
- add_msg(msg);
- goto retry_later;
- }
-
- DBUG_EXECUTE_IF(
- "fts_instrument_msg_sync_sleep",
- os_thread_sleep(300000););
-
- fts_optimize_sync_table(
- static_cast<dict_table_t*>(msg->ptr),
- true);
break;
-
default:
ut_error;
}
@@ -2960,15 +2889,12 @@ retry_later:
}
ib_vector_free(fts_slots);
+ mysql_mutex_lock(&fts_optimize_wq->mutex);
fts_slots = NULL;
+ pthread_cond_broadcast(&fts_opt_shutdown_cond);
+ mysql_mutex_unlock(&fts_optimize_wq->mutex);
- ib_wqueue_free(fts_optimize_wq);
- fts_optimize_wq = NULL;
-
- destroy_background_thd(fts_opt_thd);
ib::info() << "FTS optimize thread exiting.";
-
- os_event_set(fts_opt_shutdown_event);
}
/**********************************************************************//**
@@ -2998,7 +2924,7 @@ fts_optimize_init(void)
/* Add fts tables to fts_slots which could be skipped
during dict_load_table_one() because fts_optimize_thread
wasn't even started. */
- mutex_enter(&dict_sys.mutex);
+ dict_sys.freeze(SRW_LOCK_CALL);
for (dict_table_t* table = UT_LIST_GET_FIRST(dict_sys.table_LRU);
table != NULL;
table = UT_LIST_GET_NEXT(table_LRU, table)) {
@@ -3013,9 +2939,9 @@ fts_optimize_init(void)
fts_optimize_new_table(table);
table->fts->in_queue = true;
}
- mutex_exit(&dict_sys.mutex);
+ dict_sys.unfreeze();
- fts_opt_shutdown_event = os_event_create(0);
+ pthread_cond_init(&fts_opt_shutdown_cond, nullptr);
last_check_sync_time = time(NULL);
}
@@ -3025,17 +2951,15 @@ fts_optimize_shutdown()
{
ut_ad(!srv_read_only_mode);
- fts_msg_t* msg;
-
/* If there is an ongoing activity on dictionary, such as
srv_master_evict_from_table_cache(), wait for it */
- dict_mutex_enter_for_mysql();
-
+ dict_sys.freeze(SRW_LOCK_CALL);
+ mysql_mutex_lock(&fts_optimize_wq->mutex);
/* Tells FTS optimizer system that we are exiting from
optimizer thread, message send their after will not be
processed */
fts_opt_start_shutdown = true;
- dict_mutex_exit_for_mysql();
+ dict_sys.unfreeze();
/* We tell the OPTIMIZE thread to switch to state done, we
can't delete the work queue here because the add thread needs
@@ -3043,14 +2967,21 @@ fts_optimize_shutdown()
timer->disarm();
task_group.cancel_pending(&task);
- msg = fts_optimize_create_msg(FTS_MSG_STOP, NULL);
+ add_msg(fts_optimize_create_msg(FTS_MSG_STOP, nullptr));
- add_msg(msg);
-
- os_event_wait(fts_opt_shutdown_event);
+ while (fts_slots) {
+ my_cond_wait(&fts_opt_shutdown_cond,
+ &fts_optimize_wq->mutex.m_mutex);
+ }
- os_event_destroy(fts_opt_shutdown_event);
+ destroy_background_thd(fts_opt_thd);
fts_opt_thd = NULL;
+ pthread_cond_destroy(&fts_opt_shutdown_cond);
+ mysql_mutex_unlock(&fts_optimize_wq->mutex);
+
+ ib_wqueue_free(fts_optimize_wq);
+ fts_optimize_wq = NULL;
+
delete timer;
timer = NULL;
}
@@ -3061,17 +2992,15 @@ void fts_sync_during_ddl(dict_table_t* table)
{
if (!fts_optimize_wq)
return;
- mutex_enter(&fts_optimize_wq->mutex);
- if (!table->fts->sync_message)
- {
- mutex_exit(&fts_optimize_wq->mutex);
+ mysql_mutex_lock(&fts_optimize_wq->mutex);
+ const auto sync_message= table->fts->sync_message;
+ mysql_mutex_unlock(&fts_optimize_wq->mutex);
+ if (!sync_message)
return;
- }
- mutex_exit(&fts_optimize_wq->mutex);
- fts_sync_table(table, false);
+ fts_sync_table(table);
- mutex_enter(&fts_optimize_wq->mutex);
+ mysql_mutex_lock(&fts_optimize_wq->mutex);
table->fts->sync_message = false;
- mutex_exit(&fts_optimize_wq->mutex);
+ mysql_mutex_unlock(&fts_optimize_wq->mutex);
}
diff --git a/storage/innobase/fts/fts0que.cc b/storage/innobase/fts/fts0que.cc
index 47fd2a92331..4407224a24d 100644
--- a/storage/innobase/fts/fts0que.cc
+++ b/storage/innobase/fts/fts0que.cc
@@ -1146,7 +1146,7 @@ fts_query_difference(
fts_cache_t* cache = table->fts->cache;
dberr_t error;
- rw_lock_x_lock(&cache->lock);
+ mysql_mutex_lock(&cache->lock);
index_cache = fts_find_index_cache(cache, query->index);
@@ -1172,7 +1172,7 @@ fts_query_difference(
}
}
- rw_lock_x_unlock(&cache->lock);
+ mysql_mutex_unlock(&cache->lock);
/* error is passed by 'query->error' */
if (query->error != DB_SUCCESS) {
@@ -1194,7 +1194,7 @@ fts_query_difference(
query->error = error;
}
- fts_que_graph_free(graph);
+ que_graph_free(graph);
}
/* The size can't increase. */
@@ -1279,7 +1279,7 @@ fts_query_intersect(
/* Search the cache for a matching word first. */
- rw_lock_x_lock(&cache->lock);
+ mysql_mutex_lock(&cache->lock);
/* Search for the index specific cache. */
index_cache = fts_find_index_cache(cache, query->index);
@@ -1304,7 +1304,7 @@ fts_query_intersect(
}
}
- rw_lock_x_unlock(&cache->lock);
+ mysql_mutex_unlock(&cache->lock);
/* error is passed by 'query->error' */
if (query->error != DB_SUCCESS) {
@@ -1327,7 +1327,7 @@ fts_query_intersect(
query->error = error;
}
- fts_que_graph_free(graph);
+ que_graph_free(graph);
if (query->error == DB_SUCCESS) {
/* Make the intesection (rb tree) the current doc id
@@ -1361,7 +1361,7 @@ fts_query_cache(
fts_cache_t* cache = table->fts->cache;
/* Search the cache for a matching word first. */
- rw_lock_x_lock(&cache->lock);
+ mysql_mutex_lock(&cache->lock);
/* Search for the index specific cache. */
index_cache = fts_find_index_cache(cache, query->index);
@@ -1391,7 +1391,7 @@ fts_query_cache(
}
}
- rw_lock_x_unlock(&cache->lock);
+ mysql_mutex_unlock(&cache->lock);
return(query->error);
}
@@ -1449,7 +1449,7 @@ fts_query_union(
query->error = error;
}
- fts_que_graph_free(graph);
+ que_graph_free(graph);
if (query->error == DB_SUCCESS) {
@@ -2347,7 +2347,7 @@ fts_query_total_docs_containing_term(
}
}
- fts_que_graph_free(graph);
+ que_graph_free(graph);
return(error);
}
@@ -2429,7 +2429,7 @@ fts_query_terms_in_document(
}
}
- fts_que_graph_free(graph);
+ que_graph_free(graph);
return(error);
}
@@ -2496,9 +2496,9 @@ fts_query_is_in_proximity_range(
memset(&get_doc, 0x0, sizeof(get_doc));
- rw_lock_x_lock(&cache->lock);
+ mysql_mutex_lock(&cache->lock);
get_doc.index_cache = fts_find_index_cache(cache, query->index);
- rw_lock_x_unlock(&cache->lock);
+ mysql_mutex_unlock(&cache->lock);
ut_a(get_doc.index_cache != NULL);
fts_phrase_t phrase(get_doc.index_cache->index->table);
@@ -2520,7 +2520,7 @@ fts_query_is_in_proximity_range(
/* Free the prepared statement. */
if (get_doc.get_document_graph) {
- fts_que_graph_free(get_doc.get_document_graph);
+ que_graph_free(get_doc.get_document_graph);
get_doc.get_document_graph = NULL;
}
@@ -2556,14 +2556,14 @@ fts_query_search_phrase(
/* Setup the doc retrieval infrastructure. */
memset(&get_doc, 0x0, sizeof(get_doc));
- rw_lock_x_lock(&cache->lock);
+ mysql_mutex_lock(&cache->lock);
get_doc.index_cache = fts_find_index_cache(cache, query->index);
/* Must find the index cache */
ut_a(get_doc.index_cache != NULL);
- rw_lock_x_unlock(&cache->lock);
+ mysql_mutex_unlock(&cache->lock);
#ifdef FTS_INTERNAL_DIAG_PRINT
ib::info() << "Start phrase search";
@@ -2610,7 +2610,7 @@ fts_query_search_phrase(
func_exit:
/* Free the prepared statement. */
if (get_doc.get_document_graph) {
- fts_que_graph_free(get_doc.get_document_graph);
+ que_graph_free(get_doc.get_document_graph);
get_doc.get_document_graph = NULL;
}
@@ -2825,7 +2825,7 @@ fts_query_phrase_search(
query->error = error;
}
- fts_que_graph_free(graph);
+ que_graph_free(graph);
graph = NULL;
fts_query_cache(query, token);
@@ -3802,7 +3802,7 @@ fts_query_free(
{
if (query->read_nodes_graph) {
- fts_que_graph_free(query->read_nodes_graph);
+ que_graph_free(query->read_nodes_graph);
}
if (query->root) {
@@ -4271,9 +4271,9 @@ fts_expand_query(
/* Init "result_doc", to hold words from the first search pass */
fts_doc_init(&result_doc);
- rw_lock_x_lock(&index->table->fts->cache->lock);
+ mysql_mutex_lock(&index->table->fts->cache->lock);
index_cache = fts_find_index_cache(index->table->fts->cache, index);
- rw_lock_x_unlock(&index->table->fts->cache->lock);
+ mysql_mutex_unlock(&index->table->fts->cache->lock);
ut_a(index_cache);
diff --git a/storage/innobase/fts/fts0sql.cc b/storage/innobase/fts/fts0sql.cc
index a4234f7b376..1970f6f584f 100644
--- a/storage/innobase/fts/fts0sql.cc
+++ b/storage/innobase/fts/fts0sql.cc
@@ -86,44 +86,21 @@ fts_get_table_id(
/** Construct the name of an internal FTS table for the given table.
@param[in] fts_table metadata on fulltext-indexed table
-@param[in] dict_locked whether dict_sys.mutex is being held
-@return the prefix, must be freed with ut_free() */
-char* fts_get_table_name_prefix(const fts_table_t* fts_table)
-{
- char table_id[FTS_AUX_MIN_TABLE_ID_LENGTH];
- const size_t table_id_len = size_t(fts_get_table_id(fts_table,
- table_id)) + 1;
- mutex_enter(&dict_sys.mutex);
- /* Include the separator as well. */
- const size_t dbname_len = fts_table->table->name.dblen() + 1;
- ut_ad(dbname_len > 1);
- const size_t prefix_name_len = dbname_len + 4 + table_id_len;
- char* prefix_name = static_cast<char*>(
- ut_malloc_nokey(prefix_name_len));
- memcpy(prefix_name, fts_table->table->name.m_name, dbname_len);
- mutex_exit(&dict_sys.mutex);
- memcpy(prefix_name + dbname_len, "FTS_", 4);
- memcpy(prefix_name + dbname_len + 4, table_id, table_id_len);
- return prefix_name;
-}
-
-/** Construct the name of an internal FTS table for the given table.
-@param[in] fts_table metadata on fulltext-indexed table
@param[out] table_name a name up to MAX_FULL_NAME_LEN
-@param[in] dict_locked whether dict_sys.mutex is being held */
+@param[in] dict_locked whether dict_sys.latch is being held */
void fts_get_table_name(const fts_table_t* fts_table, char* table_name,
bool dict_locked)
{
if (!dict_locked) {
- mutex_enter(&dict_sys.mutex);
+ dict_sys.freeze(SRW_LOCK_CALL);
}
- ut_ad(mutex_own(&dict_sys.mutex));
+ ut_ad(dict_sys.frozen());
/* Include the separator as well. */
const size_t dbname_len = fts_table->table->name.dblen() + 1;
ut_ad(dbname_len > 1);
memcpy(table_name, fts_table->table->name.m_name, dbname_len);
if (!dict_locked) {
- mutex_exit(&dict_sys.mutex);
+ dict_sys.unfreeze();
}
memcpy(table_name += dbname_len, "FTS_", 4);
table_name += 4;
@@ -152,17 +129,15 @@ fts_parse_sql(
&& fts_table->table->fts->dict_locked);
if (!dict_locked) {
- ut_ad(!mutex_own(&dict_sys.mutex));
-
/* The InnoDB SQL parser is not re-entrant. */
- mutex_enter(&dict_sys.mutex);
+ dict_sys.lock(SRW_LOCK_CALL);
}
graph = pars_sql(info, str);
ut_a(graph);
if (!dict_locked) {
- mutex_exit(&dict_sys.mutex);
+ dict_sys.unlock();
}
ut_free(str);
@@ -171,30 +146,6 @@ fts_parse_sql(
}
/******************************************************************//**
-Parse an SQL string.
-@return query graph */
-que_t*
-fts_parse_sql_no_dict_lock(
-/*=======================*/
- pars_info_t* info, /*!< in: info struct, or NULL */
- const char* sql) /*!< in: SQL string to evaluate */
-{
- char* str;
- que_t* graph;
-
- ut_ad(mutex_own(&dict_sys.mutex));
-
- str = ut_str3cat(fts_sql_begin, sql, fts_sql_end);
-
- graph = pars_sql(info, str);
- ut_a(graph);
-
- ut_free(str);
-
- return(graph);
-}
-
-/******************************************************************//**
Evaluate an SQL query graph.
@return DB_SUCCESS or error code */
dberr_t
@@ -206,7 +157,6 @@ fts_eval_sql(
que_thr_t* thr;
graph->trx = trx;
- graph->fork_type = QUE_FORK_MYSQL_INTERFACE;
ut_a(thr = que_fork_start_command(graph));
diff --git a/storage/innobase/fut/fut0lst.cc b/storage/innobase/fut/fut0lst.cc
index e084f0b7935..a52027f28bc 100644
--- a/storage/innobase/fut/fut0lst.cc
+++ b/storage/innobase/fut/fut0lst.cc
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2019, 2020, MariaDB Corporation.
+Copyright (c) 2019, 2022, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -38,9 +38,8 @@ Created 11/28/1995 Heikki Tuuri
static void flst_write_addr(const buf_block_t& block, byte *faddr,
uint32_t page, uint16_t boffset, mtr_t* mtr)
{
- ut_ad(mtr->memo_contains_page_flagged(faddr,
- MTR_MEMO_PAGE_X_FIX
- | MTR_MEMO_PAGE_SX_FIX));
+ ut_ad(mtr->memo_contains_page_flagged(faddr, MTR_MEMO_PAGE_X_FIX |
+ MTR_MEMO_PAGE_SX_FIX));
ut_a(page == FIL_NULL || boffset >= FIL_PAGE_DATA);
ut_a(ut_align_offset(faddr, srv_page_size) >= FIL_PAGE_DATA);
@@ -69,12 +68,12 @@ static void flst_write_addr(const buf_block_t& block, byte *faddr,
/** Write 2 null file addresses.
@param[in] b file page
-@param[in,out] addr file address to be zeroed out
+@param[in,out] addr file address to be zeroed out
@param[in,out] mtr mini-transaction */
static void flst_zero_both(const buf_block_t& b, byte *addr, mtr_t *mtr)
{
if (mach_read_from_4(addr + FIL_ADDR_PAGE) != FIL_NULL)
- mtr->memset(&b, ulint(addr - b.frame) + FIL_ADDR_PAGE, 4, 0xff);
+ mtr->memset(&b, ulint(addr - b.page.frame) + FIL_ADDR_PAGE, 4, 0xff);
mtr->write<2,mtr_t::MAYBE_NOP>(b, addr + FIL_ADDR_BYTE, 0U);
/* Initialize the other address by (MEMMOVE|0x80,offset,FIL_ADDR_SIZE,source)
which is 4 bytes, or less than FIL_ADDR_SIZE. */
@@ -95,12 +94,13 @@ static void flst_add_to_empty(buf_block_t *base, uint16_t boffset,
ut_ad(mtr->memo_contains_flagged(add, MTR_MEMO_PAGE_X_FIX |
MTR_MEMO_PAGE_SX_FIX));
- ut_ad(!mach_read_from_4(base->frame + boffset + FLST_LEN));
- mtr->write<1>(*base, base->frame + boffset + (FLST_LEN + 3), 1U);
+ ut_ad(!mach_read_from_4(base->page.frame + boffset + FLST_LEN));
+ mtr->write<1>(*base, base->page.frame + boffset + (FLST_LEN + 3), 1U);
/* Update first and last fields of base node */
- flst_write_addr(*base, base->frame + boffset + FLST_FIRST,
+ flst_write_addr(*base, base->page.frame + boffset + FLST_FIRST,
add->page.id().page_no(), aoffset, mtr);
- memcpy(base->frame + boffset + FLST_LAST, base->frame + boffset + FLST_FIRST,
+ memcpy(base->page.frame + boffset + FLST_LAST,
+ base->page.frame + boffset + FLST_FIRST,
FIL_ADDR_SIZE);
/* Initialize FLST_LAST by (MEMMOVE|0x80,offset,FIL_ADDR_SIZE,source)
which is 4 bytes, or less than FIL_ADDR_SIZE. */
@@ -109,7 +109,7 @@ static void flst_add_to_empty(buf_block_t *base, uint16_t boffset,
/* Set prev and next fields of node to add */
static_assert(FLST_NEXT == FLST_PREV + FIL_ADDR_SIZE, "compatibility");
- flst_zero_both(*add, add->frame + aoffset + FLST_PREV, mtr);
+ flst_zero_both(*add, add->page.frame + aoffset + FLST_PREV, mtr);
}
/** Insert a node after another one.
@@ -119,10 +119,11 @@ static void flst_add_to_empty(buf_block_t *base, uint16_t boffset,
@param[in] coffset byte offset of the insert position
@param[in,out] add block to be added
@param[in] aoffset byte offset of the block to be added
-@param[in,outr] mtr mini-transaction */
-static void flst_insert_after(buf_block_t *base, uint16_t boffset,
- buf_block_t *cur, uint16_t coffset,
- buf_block_t *add, uint16_t aoffset, mtr_t *mtr)
+@param[in,out] mtr mini-transaction */
+static dberr_t flst_insert_after(buf_block_t *base, uint16_t boffset,
+ buf_block_t *cur, uint16_t coffset,
+ buf_block_t *add, uint16_t aoffset,
+ mtr_t *mtr)
{
ut_ad(base != cur || boffset != coffset);
ut_ad(base != add || boffset != aoffset);
@@ -137,30 +138,32 @@ static void flst_insert_after(buf_block_t *base, uint16_t boffset,
ut_ad(mtr->memo_contains_flagged(add, MTR_MEMO_PAGE_X_FIX |
MTR_MEMO_PAGE_SX_FIX));
- fil_addr_t next_addr= flst_get_next_addr(cur->frame + coffset);
+ fil_addr_t next_addr= flst_get_next_addr(cur->page.frame + coffset);
- flst_write_addr(*add, add->frame + aoffset + FLST_PREV,
+ flst_write_addr(*add, add->page.frame + aoffset + FLST_PREV,
cur->page.id().page_no(), coffset, mtr);
- flst_write_addr(*add, add->frame + aoffset + FLST_NEXT,
+ flst_write_addr(*add, add->page.frame + aoffset + FLST_NEXT,
next_addr.page, next_addr.boffset, mtr);
+ dberr_t err= DB_SUCCESS;
+
if (next_addr.page == FIL_NULL)
- flst_write_addr(*base, base->frame + boffset + FLST_LAST,
+ flst_write_addr(*base, base->page.frame + boffset + FLST_LAST,
add->page.id().page_no(), aoffset, mtr);
- else
- {
- buf_block_t *block;
- flst_node_t *next= fut_get_ptr(add->page.id().space(), add->zip_size(),
- next_addr, RW_SX_LATCH, mtr, &block);
- flst_write_addr(*block, next + FLST_PREV,
+ else if (buf_block_t *block=
+ buf_page_get_gen(page_id_t{add->page.id().space(), next_addr.page},
+ add->zip_size(), RW_SX_LATCH, nullptr,
+ BUF_GET_POSSIBLY_FREED, mtr, &err))
+ flst_write_addr(*block, block->page.frame +
+ next_addr.boffset + FLST_PREV,
add->page.id().page_no(), aoffset, mtr);
- }
- flst_write_addr(*cur, cur->frame + coffset + FLST_NEXT,
+ flst_write_addr(*cur, cur->page.frame + coffset + FLST_NEXT,
add->page.id().page_no(), aoffset, mtr);
- byte *len= &base->frame[boffset + FLST_LEN];
+ byte *len= &base->page.frame[boffset + FLST_LEN];
mtr->write<4>(*base, len, mach_read_from_4(len) + 1);
+ return err;
}
/** Insert a node before another one.
@@ -170,10 +173,12 @@ static void flst_insert_after(buf_block_t *base, uint16_t boffset,
@param[in] coffset byte offset of the insert position
@param[in,out] add block to be added
@param[in] aoffset byte offset of the block to be added
-@param[in,outr] mtr mini-transaction */
-static void flst_insert_before(buf_block_t *base, uint16_t boffset,
- buf_block_t *cur, uint16_t coffset,
- buf_block_t *add, uint16_t aoffset, mtr_t *mtr)
+@param[in,out] mtr mini-transaction
+@return error code */
+static dberr_t flst_insert_before(buf_block_t *base, uint16_t boffset,
+ buf_block_t *cur, uint16_t coffset,
+ buf_block_t *add, uint16_t aoffset,
+ mtr_t *mtr)
{
ut_ad(base != cur || boffset != coffset);
ut_ad(base != add || boffset != aoffset);
@@ -188,30 +193,32 @@ static void flst_insert_before(buf_block_t *base, uint16_t boffset,
ut_ad(mtr->memo_contains_flagged(add, MTR_MEMO_PAGE_X_FIX |
MTR_MEMO_PAGE_SX_FIX));
- fil_addr_t prev_addr= flst_get_prev_addr(cur->frame + coffset);
+ fil_addr_t prev_addr= flst_get_prev_addr(cur->page.frame + coffset);
- flst_write_addr(*add, add->frame + aoffset + FLST_PREV,
+ flst_write_addr(*add, add->page.frame + aoffset + FLST_PREV,
prev_addr.page, prev_addr.boffset, mtr);
- flst_write_addr(*add, add->frame + aoffset + FLST_NEXT,
- cur->page.id().page_no(), coffset, mtr);
+ flst_write_addr(*add, add->page.frame + aoffset + FLST_NEXT,
+ cur->page.id().page_no(), coffset, mtr);
+
+ dberr_t err= DB_SUCCESS;
if (prev_addr.page == FIL_NULL)
- flst_write_addr(*base, base->frame + boffset + FLST_FIRST,
+ flst_write_addr(*base, base->page.frame + boffset + FLST_FIRST,
add->page.id().page_no(), aoffset, mtr);
- else
- {
- buf_block_t *block;
- flst_node_t *prev= fut_get_ptr(add->page.id().space(), add->zip_size(),
- prev_addr, RW_SX_LATCH, mtr, &block);
- flst_write_addr(*block, prev + FLST_NEXT,
+ else if (buf_block_t *block=
+ buf_page_get_gen(page_id_t{add->page.id().space(), prev_addr.page},
+ add->zip_size(), RW_SX_LATCH, nullptr,
+ BUF_GET_POSSIBLY_FREED, mtr, &err))
+ flst_write_addr(*block, block->page.frame +
+ prev_addr.boffset + FLST_NEXT,
add->page.id().page_no(), aoffset, mtr);
- }
- flst_write_addr(*cur, cur->frame + coffset + FLST_PREV,
+ flst_write_addr(*cur, cur->page.frame + coffset + FLST_PREV,
add->page.id().page_no(), aoffset, mtr);
- byte *len= &base->frame[boffset + FLST_LEN];
+ byte *len= &base->page.frame[boffset + FLST_LEN];
mtr->write<4>(*base, len, mach_read_from_4(len) + 1);
+ return err;
}
/** Initialize a list base node.
@@ -233,8 +240,8 @@ void flst_init(const buf_block_t& block, byte *base, mtr_t *mtr)
@param[in,out] add block to be added
@param[in] aoffset byte offset of the node to be added
@param[in,outr] mtr mini-transaction */
-void flst_add_last(buf_block_t *base, uint16_t boffset,
- buf_block_t *add, uint16_t aoffset, mtr_t *mtr)
+dberr_t flst_add_last(buf_block_t *base, uint16_t boffset,
+ buf_block_t *add, uint16_t aoffset, mtr_t *mtr)
{
ut_ad(base != add || boffset != aoffset);
ut_ad(boffset < base->physical_size());
@@ -243,20 +250,23 @@ void flst_add_last(buf_block_t *base, uint16_t boffset,
MTR_MEMO_PAGE_SX_FIX));
ut_ad(mtr->memo_contains_flagged(add, MTR_MEMO_PAGE_X_FIX |
MTR_MEMO_PAGE_SX_FIX));
-
- if (!flst_get_len(base->frame + boffset))
+ if (!flst_get_len(base->page.frame + boffset))
+ {
flst_add_to_empty(base, boffset, add, aoffset, mtr);
+ return DB_SUCCESS;
+ }
else
{
- fil_addr_t addr= flst_get_last(base->frame + boffset);
+ fil_addr_t addr= flst_get_last(base->page.frame + boffset);
buf_block_t *cur= add;
- const flst_node_t *c= addr.page == add->page.id().page_no()
- ? add->frame + addr.boffset
- : fut_get_ptr(add->page.id().space(), add->zip_size(), addr,
- RW_SX_LATCH, mtr, &cur);
- flst_insert_after(base, boffset, cur,
- static_cast<uint16_t>(c - cur->frame),
- add, aoffset, mtr);
+ dberr_t err;
+ if (addr.page != add->page.id().page_no() &&
+ !(cur= buf_page_get_gen(page_id_t{add->page.id().space(), addr.page},
+ add->zip_size(), RW_SX_LATCH, nullptr,
+ BUF_GET_POSSIBLY_FREED, mtr, &err)))
+ return err;
+ return flst_insert_after(base, boffset, cur, addr.boffset,
+ add, aoffset, mtr);
}
}
@@ -265,9 +275,10 @@ void flst_add_last(buf_block_t *base, uint16_t boffset,
@param[in] boffset byte offset of the base node
@param[in,out] add block to be added
@param[in] aoffset byte offset of the node to be added
-@param[in,outr] mtr mini-transaction */
-void flst_add_first(buf_block_t *base, uint16_t boffset,
- buf_block_t *add, uint16_t aoffset, mtr_t *mtr)
+@param[in,out] mtr mini-transaction
+@return error code */
+dberr_t flst_add_first(buf_block_t *base, uint16_t boffset,
+ buf_block_t *add, uint16_t aoffset, mtr_t *mtr)
{
ut_ad(base != add || boffset != aoffset);
ut_ad(boffset < base->physical_size());
@@ -277,19 +288,23 @@ void flst_add_first(buf_block_t *base, uint16_t boffset,
ut_ad(mtr->memo_contains_flagged(add, MTR_MEMO_PAGE_X_FIX |
MTR_MEMO_PAGE_SX_FIX));
- if (!flst_get_len(base->frame + boffset))
+ if (!flst_get_len(base->page.frame + boffset))
+ {
flst_add_to_empty(base, boffset, add, aoffset, mtr);
+ return DB_SUCCESS;
+ }
else
{
- fil_addr_t addr= flst_get_first(base->frame + boffset);
+ fil_addr_t addr= flst_get_first(base->page.frame + boffset);
buf_block_t *cur= add;
- const flst_node_t *c= addr.page == add->page.id().page_no()
- ? add->frame + addr.boffset
- : fut_get_ptr(add->page.id().space(), add->zip_size(), addr,
- RW_SX_LATCH, mtr, &cur);
- flst_insert_before(base, boffset, cur,
- static_cast<uint16_t>(c - cur->frame),
- add, aoffset, mtr);
+ dberr_t err;
+ if (addr.page != add->page.id().page_no() &&
+ !(cur= buf_page_get_gen(page_id_t{add->page.id().space(), addr.page},
+ add->zip_size(), RW_SX_LATCH, nullptr,
+ BUF_GET_POSSIBLY_FREED, mtr, &err)))
+ return err;
+ return flst_insert_before(base, boffset, cur, addr.boffset,
+ add, aoffset, mtr);
}
}
@@ -298,9 +313,10 @@ void flst_add_first(buf_block_t *base, uint16_t boffset,
@param[in] boffset byte offset of the base node
@param[in,out] cur block to be removed
@param[in] coffset byte offset of the current record to be removed
-@param[in,outr] mtr mini-transaction */
-void flst_remove(buf_block_t *base, uint16_t boffset,
- buf_block_t *cur, uint16_t coffset, mtr_t *mtr)
+@param[in,out] mtr mini-transaction
+@return error code */
+dberr_t flst_remove(buf_block_t *base, uint16_t boffset,
+ buf_block_t *cur, uint16_t coffset, mtr_t *mtr)
{
ut_ad(boffset < base->physical_size());
ut_ad(coffset < cur->physical_size());
@@ -309,40 +325,46 @@ void flst_remove(buf_block_t *base, uint16_t boffset,
ut_ad(mtr->memo_contains_flagged(cur, MTR_MEMO_PAGE_X_FIX |
MTR_MEMO_PAGE_SX_FIX));
- const fil_addr_t prev_addr= flst_get_prev_addr(cur->frame + coffset);
- const fil_addr_t next_addr= flst_get_next_addr(cur->frame + coffset);
+ const fil_addr_t prev_addr= flst_get_prev_addr(cur->page.frame + coffset);
+ const fil_addr_t next_addr= flst_get_next_addr(cur->page.frame + coffset);
+ dberr_t err= DB_SUCCESS;
if (prev_addr.page == FIL_NULL)
- flst_write_addr(*base, base->frame + boffset + FLST_FIRST,
+ flst_write_addr(*base, base->page.frame + boffset + FLST_FIRST,
next_addr.page, next_addr.boffset, mtr);
else
{
- buf_block_t *block= cur;
- flst_node_t *prev= prev_addr.page == cur->page.id().page_no()
- ? cur->frame + prev_addr.boffset
- : fut_get_ptr(cur->page.id().space(), cur->zip_size(), prev_addr,
- RW_SX_LATCH, mtr, &block);
- flst_write_addr(*block, prev + FLST_NEXT,
- next_addr.page, next_addr.boffset, mtr);
+ buf_block_t *b= cur;
+ if (prev_addr.page == b->page.id().page_no() ||
+ (b= buf_page_get_gen(page_id_t(b->page.id().space(), prev_addr.page),
+ b->zip_size(), RW_SX_LATCH, nullptr,
+ BUF_GET_POSSIBLY_FREED, mtr, &err)))
+ flst_write_addr(*b, b->page.frame + prev_addr.boffset + FLST_NEXT,
+ next_addr.page, next_addr.boffset, mtr);
}
if (next_addr.page == FIL_NULL)
- flst_write_addr(*base, base->frame + boffset + FLST_LAST,
+ flst_write_addr(*base, base->page.frame + boffset + FLST_LAST,
prev_addr.page, prev_addr.boffset, mtr);
else
{
- buf_block_t *block= cur;
- flst_node_t *next= next_addr.page == cur->page.id().page_no()
- ? cur->frame + next_addr.boffset
- : fut_get_ptr(cur->page.id().space(), cur->zip_size(), next_addr,
- RW_SX_LATCH, mtr, &block);
- flst_write_addr(*block, next + FLST_PREV,
- prev_addr.page, prev_addr.boffset, mtr);
+ dberr_t err2;
+ if (next_addr.page == cur->page.id().page_no() ||
+ (cur= buf_page_get_gen(page_id_t(cur->page.id().space(),
+ next_addr.page),
+ cur->zip_size(), RW_SX_LATCH, nullptr,
+ BUF_GET_POSSIBLY_FREED, mtr, &err2)))
+ flst_write_addr(*cur, cur->page.frame + next_addr.boffset + FLST_PREV,
+ prev_addr.page, prev_addr.boffset, mtr);
+ else if (err == DB_SUCCESS)
+ err= err2;
}
- byte *len= &base->frame[boffset + FLST_LEN];
- ut_ad(mach_read_from_4(len) > 0);
+ byte *len= &base->page.frame[boffset + FLST_LEN];
+ if (UNIV_UNLIKELY(!mach_read_from_4(len)))
+ return DB_CORRUPTION;
mtr->write<4>(*base, len, mach_read_from_4(len) - 1);
+ return err;
}
#ifdef UNIV_DEBUG
@@ -360,30 +382,32 @@ void flst_validate(const buf_block_t *base, uint16_t boffset, mtr_t *mtr)
the x-locked pages could fill the buffer, resulting in a deadlock. */
mtr_t mtr2;
- const uint32_t len= flst_get_len(base->frame + boffset);
- fil_addr_t addr= flst_get_first(base->frame + boffset);
+ const uint32_t len= flst_get_len(base->page.frame + boffset);
+ fil_addr_t addr= flst_get_first(base->page.frame + boffset);
for (uint32_t i= len; i--; )
{
mtr2.start();
- const flst_node_t *node= fut_get_ptr(base->page.id().space(),
- base->zip_size(), addr,
- RW_SX_LATCH, &mtr2);
- addr= flst_get_next_addr(node);
+ const buf_block_t *b=
+ buf_page_get_gen(page_id_t(base->page.id().space(), addr.page),
+ base->zip_size(), RW_SX_LATCH, nullptr, BUF_GET, mtr);
+ ut_ad(b);
+ addr= flst_get_next_addr(b->page.frame + addr.boffset);
mtr2.commit();
}
ut_ad(addr.page == FIL_NULL);
- addr= flst_get_last(base->frame + boffset);
+ addr= flst_get_last(base->page.frame + boffset);
for (uint32_t i= len; i--; )
{
mtr2.start();
- const flst_node_t *node= fut_get_ptr(base->page.id().space(),
- base->zip_size(), addr,
- RW_SX_LATCH, &mtr2);
- addr= flst_get_prev_addr(node);
+ const buf_block_t *b=
+ buf_page_get_gen(page_id_t(base->page.id().space(), addr.page),
+ base->zip_size(), RW_SX_LATCH, nullptr, BUF_GET, mtr);
+ ut_ad(b);
+ addr= flst_get_prev_addr(b->page.frame + addr.boffset);
mtr2.commit();
}
diff --git a/storage/innobase/gis/gis0rtree.cc b/storage/innobase/gis/gis0rtree.cc
index a4fbf70e7a1..83afd732b21 100644
--- a/storage/innobase/gis/gis0rtree.cc
+++ b/storage/innobase/gis/gis0rtree.cc
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 2016, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2019, 2022, MariaDB Corporation.
+Copyright (c) 2018, 2022, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -70,7 +70,7 @@ rtr_page_split_initialize_nodes(
block = btr_cur_get_block(cursor);
page = buf_block_get_frame(block);
- n_uniq = dict_index_get_n_unique_in_tree(cursor->index);
+ n_uniq = dict_index_get_n_unique_in_tree(cursor->index());
n_recs = ulint(page_get_n_recs(page)) + 1;
@@ -88,8 +88,8 @@ rtr_page_split_initialize_nodes(
rec = page_rec_get_next(page_get_infimum_rec(page));
const ulint n_core = page_is_leaf(page)
- ? cursor->index->n_core_fields : 0;
- *offsets = rec_get_offsets(rec, cursor->index, *offsets, n_core,
+ ? cursor->index()->n_core_fields : 0;
+ *offsets = rec_get_offsets(rec, cursor->index(), *offsets, n_core,
n_uniq, &heap);
source_cur = rec_get_nth_field(rec, *offsets, 0, &len);
@@ -101,7 +101,7 @@ rtr_page_split_initialize_nodes(
memcpy(cur->coords, source_cur, DATA_MBR_LEN);
rec = page_rec_get_next(rec);
- *offsets = rec_get_offsets(rec, cursor->index, *offsets,
+ *offsets = rec_get_offsets(rec, cursor->index(), *offsets,
n_core, n_uniq, &heap);
source_cur = rec_get_nth_field(rec, *offsets, 0, &len);
}
@@ -111,9 +111,9 @@ rtr_page_split_initialize_nodes(
dtuple_get_nth_field(tuple, 0)));
cur->coords = reserve_coords(buf_pos, SPDIMS);
rec = (byte*) mem_heap_alloc(
- heap, rec_get_converted_size(cursor->index, tuple, 0));
+ heap, rec_get_converted_size(cursor->index(), tuple, 0));
- rec = rec_convert_dtuple_to_rec(rec, cursor->index, tuple, 0);
+ rec = rec_convert_dtuple_to_rec(rec, cursor->index(), tuple, 0);
cur->key = rec;
memcpy(cur->coords, source_cur, DATA_MBR_LEN);
@@ -200,7 +200,7 @@ rtr_update_mbr_field(
rec_t* new_rec, /*!< in: rec to use */
mtr_t* mtr) /*!< in: mtr */
{
- dict_index_t* index = cursor->index;
+ dict_index_t* index = cursor->index();
mem_heap_t* heap;
page_t* page;
rec_t* rec;
@@ -230,7 +230,7 @@ rtr_update_mbr_field(
ut_ad(page == buf_block_get_frame(block));
child = btr_node_ptr_get_child_page_no(rec, offsets);
- const ulint n_core = page_is_leaf(block->frame)
+ const ulint n_core = page_is_leaf(block->page.frame)
? index->n_core_fields : 0;
if (new_rec) {
@@ -245,6 +245,7 @@ rtr_update_mbr_field(
/* We need to remember the child page no of cursor2, since page could be
reorganized or insert a new rec before it. */
if (cursor2) {
+ ut_ad(cursor2->index() == index);
rec_t* del_rec = btr_cur_get_rec(cursor2);
offsets2 = rec_get_offsets(btr_cur_get_rec(cursor2),
index, NULL, 0,
@@ -268,7 +269,7 @@ rtr_update_mbr_field(
if (!btr_cur_update_alloc_zip(
page_zip,
btr_cur_get_page_cur(cursor),
- index, offsets,
+ offsets,
rec_offs_size(offsets),
false, mtr)) {
@@ -321,7 +322,7 @@ rtr_update_mbr_field(
offsets2));
page_cur_delete_rec(btr_cur_get_page_cur(cursor2),
- index, offsets2, mtr);
+ offsets2, mtr);
}
} else if (page_get_n_recs(page) == 1) {
/* When there's only one rec in the page, we do insert/delete to
@@ -352,9 +353,10 @@ rtr_update_mbr_field(
ut_ad(old_rec != insert_rec);
page_cur_position(old_rec, block, &page_cur);
+ page_cur.index = index;
offsets2 = rec_get_offsets(old_rec, index, NULL, n_core,
ULINT_UNDEFINED, &heap);
- page_cur_delete_rec(&page_cur, index, offsets2, mtr);
+ page_cur_delete_rec(&page_cur, offsets2, mtr);
} else {
update_mbr:
@@ -366,8 +368,7 @@ update_mbr:
/* Delete the rec which cursor point to. */
next_rec = page_rec_get_next(rec);
- page_cur_delete_rec(btr_cur_get_page_cur(cursor),
- index, offsets, mtr);
+ page_cur_delete_rec(&cursor->page_cur, offsets, mtr);
if (!ins_suc) {
ut_ad(rec_info & REC_INFO_MIN_REC_FLAG);
@@ -400,40 +401,40 @@ update_mbr:
== btr_node_ptr_get_child_page_no(cur2_rec,
offsets2));
page_cur_delete_rec(btr_cur_get_page_cur(cursor2),
- index, offsets2, mtr);
+ offsets2, mtr);
cursor2 = NULL;
}
/* Insert the new rec. */
- page_cur_search_with_match(block, index, node_ptr,
- PAGE_CUR_LE , &up_match, &low_match,
- btr_cur_get_page_cur(cursor), NULL);
+ if (page_cur_search_with_match(node_ptr, PAGE_CUR_LE,
+ &up_match, &low_match,
+ btr_cur_get_page_cur(cursor),
+ NULL)) {
+ goto err_exit;
+ }
err = btr_cur_optimistic_insert(flags, cursor, &insert_offsets,
&heap, node_ptr, &insert_rec,
&dummy_big_rec, 0, NULL, mtr);
- if (!ins_suc && err == DB_SUCCESS) {
- ins_suc = true;
- }
-
/* If optimistic insert fail, try reorganize the page
and insert again. */
- if (err != DB_SUCCESS && ins_suc) {
- btr_page_reorganize(btr_cur_get_page_cur(cursor),
- index, mtr);
-
- err = btr_cur_optimistic_insert(flags,
- cursor,
- &insert_offsets,
- &heap,
- node_ptr,
- &insert_rec,
- &dummy_big_rec,
- 0, NULL, mtr);
+ if (err == DB_SUCCESS) {
+ ins_suc = true;
+ } else if (ins_suc) {
+ ut_ad(err == DB_FAIL);
+ err = btr_page_reorganize(btr_cur_get_page_cur(cursor),
+ mtr);
+ if (err == DB_SUCCESS) {
+ err = btr_cur_optimistic_insert(
+ flags, cursor, &insert_offsets, &heap,
+ node_ptr, &insert_rec, &dummy_big_rec,
+ 0, NULL, mtr);
+ }
/* Will do pessimistic insert */
if (err != DB_SUCCESS) {
+ ut_ad(err == DB_FAIL);
ins_suc = false;
}
}
@@ -465,10 +466,14 @@ update_mbr:
cur2_pno = btr_node_ptr_get_child_page_no(cur2_rec, offsets2);
if ((del_page_no != cur2_pno)
|| (cur2_rec == insert_rec)) {
- cur2_rec = page_rec_get_next(
- page_get_infimum_rec(page));
+ cur2_rec = page_get_infimum_rec(page);
+
+ while ((cur2_rec
+ = page_rec_get_next(cur2_rec))) {
+ if (page_rec_is_supremum(cur2_rec)) {
+ break;
+ }
- while (!page_rec_is_supremum(cur2_rec)) {
offsets2 = rec_get_offsets(cur2_rec, index,
NULL,
n_core,
@@ -483,10 +488,7 @@ update_mbr:
break;
}
}
- cur2_rec = page_rec_get_next(cur2_rec);
}
-
- ut_ad(!page_rec_is_supremum(cur2_rec));
}
rec_info = rec_get_info_bits(cur2_rec,
@@ -503,7 +505,7 @@ update_mbr:
ut_ad(cur2_pno == del_page_no && cur2_rec != insert_rec);
page_cur_delete_rec(btr_cur_get_page_cur(cursor2),
- index, offsets2, mtr);
+ offsets2, mtr);
}
if (!ins_suc) {
@@ -532,14 +534,15 @@ update_mbr:
|| (REC_INFO_MIN_REC_FLAG & rec_get_info_bits(
page_rec_get_next(page_get_infimum_rec(page)),
page_is_comp(page))));
-
+err_exit:
mem_heap_free(heap);
}
+MY_ATTRIBUTE((nonnull, warn_unused_result))
/**************************************************************//**
Update parent page's MBR and Predicate lock information during a split */
-static MY_ATTRIBUTE((nonnull))
-void
+static
+dberr_t
rtr_adjust_upper_level(
/*===================*/
btr_cur_t* sea_cur, /*!< in: search cursor */
@@ -553,24 +556,23 @@ rtr_adjust_upper_level(
{
ulint page_no;
ulint new_page_no;
- dict_index_t* index = sea_cur->index;
btr_cur_t cursor;
rec_offs* offsets;
mem_heap_t* heap;
ulint level;
- dtuple_t* node_ptr_upper;
+ dtuple_t* node_ptr_upper = nullptr;
page_cur_t* page_cursor;
lock_prdt_t prdt;
lock_prdt_t new_prdt;
- dberr_t err;
big_rec_t* dummy_big_rec;
rec_t* rec;
/* Create a memory heap where the data tuple is stored */
heap = mem_heap_create(1024);
- cursor.init();
cursor.thr = sea_cur->thr;
+ cursor.page_cur.index = sea_cur->index();
+ cursor.page_cur.block = block;
/* Get the level of the split pages */
level = btr_page_get_level(buf_block_get_frame(block));
@@ -582,13 +584,12 @@ rtr_adjust_upper_level(
/* Set new mbr for the old page on the upper level. */
/* Look up the index for the node pointer to page */
- offsets = rtr_page_get_father_block(
- NULL, heap, index, block, mtr, sea_cur, &cursor);
+ offsets = rtr_page_get_father_block(NULL, heap, mtr, sea_cur, &cursor);
page_cursor = btr_cur_get_page_cur(&cursor);
- rtr_update_mbr_field(&cursor, offsets, NULL, block->frame, mbr, NULL,
- mtr);
+ rtr_update_mbr_field(&cursor, offsets, nullptr, block->page.frame, mbr,
+ nullptr, mtr);
/* Already updated parent MBR, reset in our path */
if (sea_cur->rtr_info) {
@@ -599,29 +600,31 @@ rtr_adjust_upper_level(
}
}
- /* Insert the node for the new page. */
- node_ptr_upper = rtr_index_build_node_ptr(
- index, new_mbr,
- page_rec_get_next(page_get_infimum_rec(new_block->frame)),
- new_page_no, heap);
-
- ulint up_match = 0;
- ulint low_match = 0;
-
- buf_block_t* father_block = btr_cur_get_block(&cursor);
-
- page_cur_search_with_match(
- father_block, index, node_ptr_upper,
- PAGE_CUR_LE , &up_match, &low_match,
- btr_cur_get_page_cur(&cursor), NULL);
-
- err = btr_cur_optimistic_insert(
- flags
- | BTR_NO_LOCKING_FLAG
- | BTR_KEEP_SYS_FLAG
- | BTR_NO_UNDO_LOG_FLAG,
- &cursor, &offsets, &heap,
- node_ptr_upper, &rec, &dummy_big_rec, 0, NULL, mtr);
+ dberr_t err;
+
+ if (const rec_t* first = page_rec_get_next_const(
+ page_get_infimum_rec(new_block->page.frame))) {
+ /* Insert the node for the new page. */
+ node_ptr_upper = rtr_index_build_node_ptr(
+ sea_cur->index(), new_mbr, first, new_page_no, heap);
+ ulint up_match = 0, low_match = 0;
+ err = page_cur_search_with_match(node_ptr_upper,
+ PAGE_CUR_LE,
+ &up_match, &low_match,
+ btr_cur_get_page_cur(&cursor),
+ NULL)
+ ? DB_CORRUPTION
+ : btr_cur_optimistic_insert(flags
+ | BTR_NO_LOCKING_FLAG
+ | BTR_KEEP_SYS_FLAG
+ | BTR_NO_UNDO_LOG_FLAG,
+ &cursor, &offsets, &heap,
+ node_ptr_upper, &rec,
+ &dummy_big_rec, 0, NULL,
+ mtr);
+ } else {
+ err = DB_CORRUPTION;
+ }
if (err == DB_FAIL) {
cursor.rtr_info = sea_cur->rtr_info;
@@ -640,42 +643,49 @@ rtr_adjust_upper_level(
node_ptr_upper, &rec,
&dummy_big_rec, 0, NULL, mtr);
cursor.rtr_info = NULL;
- ut_a(err == DB_SUCCESS);
-
mem_heap_free(new_heap);
}
- prdt.data = static_cast<void*>(mbr);
- prdt.op = 0;
- new_prdt.data = static_cast<void*>(new_mbr);
- new_prdt.op = 0;
+ if (err == DB_SUCCESS) {
+ prdt.data = static_cast<void*>(mbr);
+ prdt.op = 0;
+ new_prdt.data = static_cast<void*>(new_mbr);
+ new_prdt.op = 0;
- lock_prdt_update_parent(block, new_block, &prdt, &new_prdt,
- page_cursor->block->page.id());
+ lock_prdt_update_parent(block, new_block, &prdt, &new_prdt,
+ page_cursor->block->page.id());
+ }
mem_heap_free(heap);
- ut_ad(block->zip_size() == index->table->space->zip_size());
+ ut_ad(block->zip_size() == sea_cur->index()->table->space->zip_size());
- const uint32_t next_page_no = btr_page_get_next(block->frame);
-
- if (next_page_no != FIL_NULL) {
- buf_block_t* next_block = btr_block_get(
- *index, next_page_no, RW_X_LATCH, false, mtr);
-#ifdef UNIV_BTR_DEBUG
- ut_a(page_is_comp(next_block->frame)
- == page_is_comp(block->frame));
- ut_a(btr_page_get_prev(next_block->frame)
- == block->page.id().page_no());
-#endif /* UNIV_BTR_DEBUG */
+ if (err != DB_SUCCESS) {
+ return err;
+ }
+ const uint32_t next_page_no = btr_page_get_next(block->page.frame);
+
+ if (next_page_no == FIL_NULL) {
+ } else if (buf_block_t* next_block =
+ btr_block_get(*sea_cur->index(), next_page_no, RW_X_LATCH,
+ false, mtr, &err)) {
+ if (UNIV_UNLIKELY(memcmp_aligned<4>(next_block->page.frame
+ + FIL_PAGE_PREV,
+ block->page.frame
+ + FIL_PAGE_OFFSET, 4))) {
+ return DB_CORRUPTION;
+ }
btr_page_set_prev(next_block, new_page_no, mtr);
+ } else {
+ return err;
}
btr_page_set_next(block, new_page_no, mtr);
btr_page_set_prev(new_block, page_no, mtr);
btr_page_set_next(new_block, next_page_no, mtr);
+ return DB_SUCCESS;
}
/*************************************************************//**
@@ -686,9 +696,10 @@ if new_block is a compressed leaf page in a secondary index.
This has to be done either within the same mini-transaction,
or by invoking ibuf_reset_free_bits() before mtr_commit().
-@return TRUE on success; FALSE on compression failure */
+@return error code
+@retval DB_FAIL on ROW_FORMAT=COMPRESSED compression failure */
static
-ibool
+dberr_t
rtr_split_page_move_rec_list(
/*=========================*/
rtr_split_node_t* node_array, /*!< in: split node array. */
@@ -716,7 +727,6 @@ rtr_split_page_move_rec_list(
page_zip_des_t* new_page_zip
= buf_block_get_page_zip(new_block);
rec_t* rec;
- rec_t* ret;
ulint moved = 0;
ulint max_to_move = 0;
rtr_rec_move_t* rec_move = NULL;
@@ -728,10 +738,10 @@ rtr_split_page_move_rec_list(
page_cur_set_before_first(block, &page_cursor);
page_cur_set_before_first(new_block, &new_page_cursor);
+ page_cursor.index = new_page_cursor.index = index;
page = buf_block_get_frame(block);
new_page = buf_block_get_frame(new_block);
- ret = page_rec_get_prev(page_get_supremum_rec(new_page));
end_split_node = node_array + page_get_n_recs(page);
@@ -741,8 +751,7 @@ rtr_split_page_move_rec_list(
log_mode = mtr_set_log_mode(mtr, MTR_LOG_NONE);
}
- max_to_move = page_get_n_recs(
- buf_block_get_frame(block));
+ max_to_move = page_get_n_recs(buf_block_get_frame(block));
rec_move = static_cast<rtr_rec_move_t*>(mem_heap_alloc(
heap,
sizeof (*rec_move) * max_to_move));
@@ -764,14 +773,16 @@ rtr_split_page_move_rec_list(
rec = page_cur_insert_rec_low(
&new_page_cursor,
- index, cur_split_node->key, offsets, mtr);
+ cur_split_node->key, offsets, mtr);
- ut_a(rec);
+ if (UNIV_UNLIKELY
+ (!rec
+ || !page_cur_move_to_next(&new_page_cursor))) {
+ return DB_CORRUPTION;
+ }
lock_rec_restore_from_page_infimum(
- new_block, rec, block);
-
- page_cur_move_to_next(&new_page_cursor);
+ *new_block, rec, block->page.id());
rec_move[moved].new_rec = rec;
rec_move[moved].old_rec = cur_split_node->key;
@@ -803,35 +814,16 @@ rtr_split_page_move_rec_list(
if (!page_zip_compress(new_block, index,
page_zip_level, mtr)) {
- ulint ret_pos;
-
- /* Before trying to reorganize the page,
- store the number of preceding records on the page. */
- ret_pos = page_rec_get_n_recs_before(ret);
- /* Before copying, "ret" was the predecessor
- of the predefined supremum record. If it was
- the predefined infimum record, then it would
- still be the infimum, and we would have
- ret_pos == 0. */
-
- if (UNIV_UNLIKELY
- (!page_zip_reorganize(new_block, index,
- page_zip_level, mtr))) {
-
- if (UNIV_UNLIKELY
- (!page_zip_decompress(new_page_zip,
- new_page, FALSE))) {
- ut_error;
+ if (dberr_t err =
+ page_zip_reorganize(new_block, index,
+ page_zip_level, mtr)) {
+ if (err == DB_FAIL) {
+ ut_a(page_zip_decompress(new_page_zip,
+ new_page,
+ FALSE));
}
-#ifdef UNIV_GIS_DEBUG
- ut_ad(page_validate(new_page, index));
-#endif
-
- return(false);
+ return err;
}
-
- /* The page was reorganized: Seek to ret_pos. */
- ret = page_rec_get_nth(new_page, ret_pos);
}
}
@@ -848,12 +840,11 @@ rtr_split_page_move_rec_list(
page_cur_get_rec(&page_cursor), index,
offsets, n_core, ULINT_UNDEFINED,
&heap);
- page_cur_delete_rec(&page_cursor,
- index, offsets, mtr);
+ page_cur_delete_rec(&page_cursor, offsets, mtr);
}
}
- return(true);
+ return DB_SUCCESS;
}
/*************************************************************//**
@@ -875,7 +866,8 @@ rtr_page_split_and_insert(
mem_heap_t** heap, /*!< in/out: pointer to memory heap, or NULL */
const dtuple_t* tuple, /*!< in: tuple to insert */
ulint n_ext, /*!< in: number of externally stored columns */
- mtr_t* mtr) /*!< in: mtr */
+ mtr_t* mtr, /*!< in: mtr */
+ dberr_t* err) /*!< out: error code */
{
buf_block_t* block;
page_t* page;
@@ -883,7 +875,6 @@ rtr_page_split_and_insert(
buf_block_t* new_block;
page_zip_des_t* page_zip;
page_zip_des_t* new_page_zip;
- buf_block_t* insert_block;
page_cur_t* page_cursor;
rec_t* rec = 0;
ulint n_recs;
@@ -912,13 +903,10 @@ func_start:
mem_heap_empty(*heap);
*offsets = NULL;
- ut_ad(mtr->memo_contains_flagged(&cursor->index->lock, MTR_MEMO_X_LOCK
- | MTR_MEMO_SX_LOCK));
- ut_ad(!dict_index_is_online_ddl(cursor->index)
- || (flags & BTR_CREATE_FLAG)
- || dict_index_is_clust(cursor->index));
- ut_ad(rw_lock_own_flagged(dict_index_get_lock(cursor->index),
- RW_LOCK_FLAG_X | RW_LOCK_FLAG_SX));
+ ut_ad(mtr->memo_contains_flagged(&cursor->index()->lock,
+ MTR_MEMO_X_LOCK | MTR_MEMO_SX_LOCK));
+ ut_ad(!dict_index_is_online_ddl(cursor->index()));
+ ut_ad(cursor->index()->lock.have_u_or_x());
block = btr_cur_get_block(cursor);
page = buf_block_get_frame(block);
@@ -933,6 +921,11 @@ func_start:
if (!page_has_prev(page) && !page_is_leaf(page)) {
first_rec = page_rec_get_next(
page_get_infimum_rec(buf_block_get_frame(block)));
+ if (UNIV_UNLIKELY(!first_rec)) {
+corrupted:
+ *err = DB_CORRUPTION;
+ return nullptr;
+ }
}
/* Initial split nodes array. */
@@ -956,7 +949,7 @@ func_start:
}
#endif
- insert_size = rec_get_converted_size(cursor->index, tuple, n_ext);
+ insert_size = rec_get_converted_size(cursor->index(), tuple, n_ext);
total_data = page_get_data_size(page) + insert_size;
first_rec_group = split_rtree_node(rtr_split_node_array,
static_cast<int>(n_recs),
@@ -967,19 +960,19 @@ func_start:
/* Allocate a new page to the index */
const uint16_t page_level = btr_page_get_level(page);
- new_block = btr_page_alloc(cursor->index, page_id.page_no() + 1,
- FSP_UP, page_level, mtr, mtr);
- if (!new_block) {
- return NULL;
+ new_block = btr_page_alloc(cursor->index(), page_id.page_no() + 1,
+ FSP_UP, page_level, mtr, mtr, err);
+ if (UNIV_UNLIKELY(!new_block)) {
+ return nullptr;
}
new_page_zip = buf_block_get_page_zip(new_block);
if (page_level && UNIV_LIKELY_NULL(new_page_zip)) {
/* ROW_FORMAT=COMPRESSED non-leaf pages are not expected
to contain FIL_NULL in FIL_PAGE_PREV at this stage. */
- memset_aligned<4>(new_block->frame + FIL_PAGE_PREV, 0, 4);
+ memset_aligned<4>(new_block->page.frame + FIL_PAGE_PREV, 0, 4);
}
- btr_page_create(new_block, new_page_zip, cursor->index,
+ btr_page_create(new_block, new_page_zip, cursor->index(),
page_level, mtr);
new_page = buf_block_get_frame(new_block);
@@ -987,7 +980,7 @@ func_start:
/* Set new ssn to the new page and page. */
page_set_ssn_id(new_block, new_page_zip, current_ssn, mtr);
- next_ssn = rtr_get_new_ssn_id(cursor->index);
+ next_ssn = rtr_get_new_ssn_id(cursor->index());
page_set_ssn_id(block, page_zip, next_ssn, mtr);
@@ -997,10 +990,17 @@ func_start:
#ifdef UNIV_ZIP_COPY
|| page_zip
#endif
- || !rtr_split_page_move_rec_list(rtr_split_node_array,
- first_rec_group,
- new_block, block, first_rec,
- cursor->index, *heap, mtr)) {
+ || (*err = rtr_split_page_move_rec_list(rtr_split_node_array,
+ first_rec_group,
+ new_block, block,
+ first_rec, cursor->index(),
+ *heap, mtr))) {
+ if (*err != DB_FAIL) {
+ return nullptr;
+ }
+
+ *err = DB_SUCCESS;
+
ulint n = 0;
rec_t* rec;
ulint moved = 0;
@@ -1016,7 +1016,7 @@ func_start:
ut_a(new_page_zip);
page_zip_copy_recs(new_block,
- page_zip, page, cursor->index, mtr);
+ page_zip, page, cursor->index(), mtr);
page_cursor = btr_cur_get_page_cur(cursor);
@@ -1051,7 +1051,7 @@ func_start:
lock_rtr_move_rec_list(new_block, block, rec_move, moved);
const ulint n_core = page_level
- ? 0 : cursor->index->n_core_fields;
+ ? 0 : cursor->index()->n_core_fields;
/* Delete recs in first group from the new page. */
for (cur_split_node = rtr_split_node_array;
@@ -1071,11 +1071,11 @@ func_start:
*offsets = rec_get_offsets(
page_cur_get_rec(page_cursor),
- cursor->index, *offsets, n_core,
+ cursor->index(), *offsets, n_core,
ULINT_UNDEFINED, heap);
page_cur_delete_rec(page_cursor,
- cursor->index, *offsets, mtr);
+ *offsets, mtr);
n++;
}
}
@@ -1088,32 +1088,34 @@ func_start:
block, page_cursor);
*offsets = rec_get_offsets(
page_cur_get_rec(page_cursor),
- cursor->index, *offsets, n_core,
+ page_cursor->index, *offsets, n_core,
ULINT_UNDEFINED, heap);
- page_cur_delete_rec(page_cursor,
- cursor->index, *offsets, mtr);
+ page_cur_delete_rec(page_cursor, *offsets,
+ mtr);
}
}
#ifdef UNIV_GIS_DEBUG
- ut_ad(page_validate(new_page, cursor->index));
- ut_ad(page_validate(page, cursor->index));
+ ut_ad(page_validate(new_page, cursor->index()));
+ ut_ad(page_validate(page, cursor->index()));
#endif
}
/* Insert the new rec to the proper page. */
cur_split_node = end_split_node - 1;
- if (cur_split_node->n_node != first_rec_group) {
- insert_block = new_block;
- } else {
- insert_block = block;
- }
/* Reposition the cursor for insert and try insertion */
page_cursor = btr_cur_get_page_cur(cursor);
+ page_cursor->block = cur_split_node->n_node != first_rec_group
+ ? new_block : block;
- page_cur_search(insert_block, cursor->index, tuple,
- PAGE_CUR_LE, page_cursor);
+ ulint up_match = 0, low_match = 0;
+
+ if (page_cur_search_with_match(tuple,
+ PAGE_CUR_LE, &up_match, &low_match,
+ page_cursor, nullptr)) {
+ goto corrupted;
+ }
/* It's possible that the new record is too big to be inserted into
the page, and it'll need the second round split in this case.
@@ -1124,7 +1126,7 @@ func_start:
goto after_insert; }
);
- rec = page_cur_tuple_insert(page_cursor, tuple, cursor->index,
+ rec = page_cur_tuple_insert(page_cursor, tuple,
offsets, heap, n_ext, mtr);
/* If insert did not fit, try page reorganization.
@@ -1132,14 +1134,13 @@ func_start:
attempted this already. */
if (rec == NULL) {
if (!is_page_cur_get_page_zip(page_cursor)
- && btr_page_reorganize(page_cursor, cursor->index, mtr)) {
+ && btr_page_reorganize(page_cursor, mtr)) {
rec = page_cur_tuple_insert(page_cursor, tuple,
- cursor->index, offsets,
+ offsets,
heap, n_ext, mtr);
}
- /* If insert fail, we will try to split the insert_block
- again. */
+ /* If insert fail, we will try to split the block again. */
}
#ifdef UNIV_DEBUG
@@ -1147,8 +1148,8 @@ after_insert:
#endif
/* Calculate the mbr on the upper half-page, and the mbr on
original page. */
- rtr_page_cal_mbr(cursor->index, block, &mbr, *heap);
- rtr_page_cal_mbr(cursor->index, new_block, &new_mbr, *heap);
+ rtr_page_cal_mbr(cursor->index(), block, &mbr, *heap);
+ rtr_page_cal_mbr(cursor->index(), new_block, &new_mbr, *heap);
prdt.data = &mbr;
new_prdt.data = &new_mbr;
@@ -1157,13 +1158,20 @@ after_insert:
lock_prdt_update_split(new_block, &prdt, &new_prdt, page_id);
/* Adjust the upper level. */
- rtr_adjust_upper_level(cursor, flags, block, new_block,
- &mbr, &new_mbr, mtr);
+ *err = rtr_adjust_upper_level(cursor, flags, block, new_block,
+ &mbr, &new_mbr, mtr);
+ if (UNIV_UNLIKELY(*err != DB_SUCCESS)) {
+ return nullptr;
+ }
/* Save the new ssn to the root page, since we need to reinit
the first ssn value from it after restart server. */
- root_block = btr_root_block_get(cursor->index, RW_SX_LATCH, mtr);
+ root_block = btr_root_block_get(cursor->index(), RW_SX_LATCH,
+ mtr, err);
+ if (UNIV_UNLIKELY(!root_block)) {
+ return nullptr;
+ }
page_zip = buf_block_get_page_zip(root_block);
page_set_ssn_id(root_block, page_zip, next_ssn, mtr);
@@ -1172,8 +1180,8 @@ after_insert:
again. */
if (!rec) {
/* We play safe and reset the free bits for new_page */
- if (!dict_index_is_clust(cursor->index)
- && !cursor->index->table->is_temporary()) {
+ if (!dict_index_is_clust(cursor->index())
+ && !cursor->index()->table->is_temporary()) {
ibuf_reset_free_bits(new_block);
ibuf_reset_free_bits(block);
}
@@ -1187,19 +1195,20 @@ after_insert:
rec_t* i_rec = page_rec_get_next(page_get_infimum_rec(
buf_block_get_frame(block)));
- btr_cur_position(cursor->index, i_rec, block, cursor);
+ if (UNIV_UNLIKELY(!i_rec)) {
+ goto corrupted;
+ }
+ btr_cur_position(cursor->index(), i_rec, block, cursor);
goto func_start;
}
#ifdef UNIV_GIS_DEBUG
- ut_ad(page_validate(buf_block_get_frame(block), cursor->index));
- ut_ad(page_validate(buf_block_get_frame(new_block), cursor->index));
+ ut_ad(page_validate(buf_block_get_frame(block), cursor->index()));
+ ut_ad(page_validate(buf_block_get_frame(new_block), cursor->index()));
- ut_ad(!rec || rec_offs_validate(rec, cursor->index, *offsets));
+ ut_ad(!rec || rec_offs_validate(rec, cursor->index(), *offsets));
#endif
- MONITOR_INC(MONITOR_INDEX_SPLIT);
-
return(rec);
}
@@ -1216,14 +1225,13 @@ rtr_ins_enlarge_mbr(
rtr_mbr_t new_mbr;
buf_block_t* block;
mem_heap_t* heap;
- dict_index_t* index = btr_cur->index;
page_cur_t* page_cursor;
rec_offs* offsets;
node_visit_t* node_visit;
btr_cur_t cursor;
page_t* page;
- ut_ad(dict_index_is_spatial(index));
+ ut_ad(btr_cur->index()->is_spatial());
/* If no rtr_info or rtree is one level tree, return. */
if (!btr_cur->rtr_info || btr_cur->tree_height == 1) {
@@ -1251,20 +1259,20 @@ rtr_ins_enlarge_mbr(
}
/* Calculate the mbr of the child page. */
- rtr_page_cal_mbr(index, block, &new_mbr, heap);
+ rtr_page_cal_mbr(page_cursor->index, block, &new_mbr, heap);
/* Get father block. */
- cursor.init();
+ cursor.page_cur.index = page_cursor->index;
+ cursor.page_cur.block = block;
offsets = rtr_page_get_father_block(
- NULL, heap, index, block, mtr, btr_cur, &cursor);
+ NULL, heap, mtr, btr_cur, &cursor);
page = buf_block_get_frame(block);
/* Update the mbr field of the rec. */
rtr_update_mbr_field(&cursor, offsets, NULL, page,
&new_mbr, NULL, mtr);
- page_cursor = btr_cur_get_page_cur(&cursor);
- block = page_cur_get_block(page_cursor);
+ block = btr_cur_get_block(&cursor);
}
mem_heap_free(heap);
@@ -1274,14 +1282,9 @@ rtr_ins_enlarge_mbr(
/*************************************************************//**
Copy recs from a page to new_block of rtree.
-Differs from page_copy_rec_list_end, because this function does not
-touch the lock table and max trx id on page or compress the page.
-IMPORTANT: The caller will have to update IBUF_BITMAP_FREE
-if new_block is a compressed leaf page in a secondary index.
-This has to be done either within the same mini-transaction,
-or by invoking ibuf_reset_free_bits() before mtr_commit(). */
-void
+@return error code */
+dberr_t
rtr_page_copy_rec_list_end_no_locks(
/*================================*/
buf_block_t* new_block, /*!< in: index page to copy to */
@@ -1311,18 +1314,21 @@ rtr_page_copy_rec_list_end_no_locks(
page_cur_position(rec, block, &cur1);
- if (page_cur_is_before_first(&cur1)) {
- page_cur_move_to_next(&cur1);
+ if (page_cur_is_before_first(&cur1) && !page_cur_move_to_next(&cur1)) {
+ return DB_CORRUPTION;
}
- btr_assert_not_corrupted(new_block, index);
ut_a(page_is_comp(new_page) == page_rec_is_comp(rec));
ut_a(mach_read_from_2(new_page + srv_page_size - 10) == (ulint)
(page_is_comp(new_page) ? PAGE_NEW_INFIMUM : PAGE_OLD_INFIMUM));
cur_rec = page_rec_get_next(
page_get_infimum_rec(buf_block_get_frame(new_block)));
+ if (UNIV_UNLIKELY(!cur_rec)) {
+ return DB_CORRUPTION;
+ }
page_cur_position(cur_rec, new_block, &page_cur);
+ page_cur.index = index;
/* Copy records from the original page to the new page */
while (!page_cur_is_after_last(&cur1)) {
@@ -1331,6 +1337,9 @@ rtr_page_copy_rec_list_end_no_locks(
if (page_rec_is_infimum(cur_rec)) {
cur_rec = page_rec_get_next(cur_rec);
+ if (UNIV_UNLIKELY(!cur_rec)) {
+ return DB_CORRUPTION;
+ }
}
offsets1 = rec_get_offsets(cur1_rec, index, offsets1, n_core,
@@ -1346,12 +1355,10 @@ rtr_page_copy_rec_list_end_no_locks(
offsets1, offsets2, index, false,
&cur_matched_fields);
if (cmp < 0) {
- page_cur_move_to_prev(&page_cur);
- break;
+ goto move_to_prev;
} else if (cmp > 0) {
/* Skip small recs. */
- page_cur_move_to_next(&page_cur);
- cur_rec = page_cur_get_rec(&page_cur);
+ cur_rec = page_cur_move_to_next(&page_cur);
} else if (n_core) {
if (rec_get_deleted_flag(cur1_rec,
dict_table_is_comp(index->table))) {
@@ -1370,26 +1377,23 @@ rtr_page_copy_rec_list_end_no_locks(
/* If position is on suprenum rec, need to move to
previous rec. */
if (page_rec_is_supremum(cur_rec)) {
- page_cur_move_to_prev(&page_cur);
+move_to_prev:
+ cur_rec = page_cur_move_to_prev(&page_cur);
+ } else {
+ cur_rec = page_cur_get_rec(&page_cur);
}
- cur_rec = page_cur_get_rec(&page_cur);
+ if (UNIV_UNLIKELY(!cur_rec)) {
+ return DB_CORRUPTION;
+ }
offsets1 = rec_get_offsets(cur1_rec, index, offsets1, n_core,
ULINT_UNDEFINED, &heap);
- ins_rec = page_cur_insert_rec_low(&page_cur, index,
+ ins_rec = page_cur_insert_rec_low(&page_cur,
cur1_rec, offsets1, mtr);
- if (UNIV_UNLIKELY(!ins_rec)) {
- fprintf(stderr, "page number %u and %u\n",
- new_block->page.id().page_no(),
- block->page.id().page_no());
-
- ib::fatal() << "rec offset " << page_offset(rec)
- << ", cur1 offset "
- << page_offset(page_cur_get_rec(&cur1))
- << ", cur_rec offset "
- << page_offset(cur_rec);
+ if (UNIV_UNLIKELY(!ins_rec || moved >= max_move)) {
+ return DB_CORRUPTION;
}
rec_move[moved].new_rec = ins_rec;
@@ -1397,20 +1401,20 @@ rtr_page_copy_rec_list_end_no_locks(
rec_move[moved].moved = false;
moved++;
next:
- if (moved > max_move) {
- ut_ad(0);
- break;
+ if (UNIV_UNLIKELY(!page_cur_move_to_next(&cur1))) {
+ return DB_CORRUPTION;
}
-
- page_cur_move_to_next(&cur1);
}
*num_moved = moved;
+ return DB_SUCCESS;
}
/*************************************************************//**
-Copy recs till a specified rec from a page to new_block of rtree. */
-void
+Copy recs till a specified rec from a page to new_block of rtree.
+
+@return error code */
+dberr_t
rtr_page_copy_rec_list_start_no_locks(
/*==================================*/
buf_block_t* new_block, /*!< in: index page to copy to */
@@ -1438,11 +1442,17 @@ rtr_page_copy_rec_list_start_no_locks(
rec_offs_init(offsets_2);
page_cur_set_before_first(block, &cur1);
- page_cur_move_to_next(&cur1);
+ if (UNIV_UNLIKELY(!page_cur_move_to_next(&cur1))) {
+ return DB_CORRUPTION;
+ }
cur_rec = page_rec_get_next(
page_get_infimum_rec(buf_block_get_frame(new_block)));
+ if (UNIV_UNLIKELY(!cur_rec)) {
+ return DB_CORRUPTION;
+ }
page_cur_position(cur_rec, new_block, &page_cur);
+ page_cur.index = index;
while (page_cur_get_rec(&cur1) != rec) {
rec_t* cur1_rec = page_cur_get_rec(&cur1);
@@ -1450,6 +1460,9 @@ rtr_page_copy_rec_list_start_no_locks(
if (page_rec_is_infimum(cur_rec)) {
cur_rec = page_rec_get_next(cur_rec);
+ if (UNIV_UNLIKELY(!cur_rec)) {
+ return DB_CORRUPTION;
+ }
}
offsets1 = rec_get_offsets(cur1_rec, index, offsets1, n_core,
@@ -1465,13 +1478,10 @@ rtr_page_copy_rec_list_start_no_locks(
offsets1, offsets2, index, false,
&cur_matched_fields);
if (cmp < 0) {
- page_cur_move_to_prev(&page_cur);
- cur_rec = page_cur_get_rec(&page_cur);
- break;
+ goto move_to_prev;
} else if (cmp > 0) {
/* Skip small recs. */
- page_cur_move_to_next(&page_cur);
- cur_rec = page_cur_get_rec(&page_cur);
+ cur_rec = page_cur_move_to_next(&page_cur);
} else if (n_core) {
if (rec_get_deleted_flag(
cur1_rec,
@@ -1491,23 +1501,23 @@ rtr_page_copy_rec_list_start_no_locks(
/* If position is on suprenum rec, need to move to
previous rec. */
if (page_rec_is_supremum(cur_rec)) {
- page_cur_move_to_prev(&page_cur);
+move_to_prev:
+ cur_rec = page_cur_move_to_prev(&page_cur);
+ } else {
+ cur_rec = page_cur_get_rec(&page_cur);
}
- cur_rec = page_cur_get_rec(&page_cur);
+ if (UNIV_UNLIKELY(!cur_rec)) {
+ return DB_CORRUPTION;
+ }
offsets1 = rec_get_offsets(cur1_rec, index, offsets1, n_core,
ULINT_UNDEFINED, &heap);
- ins_rec = page_cur_insert_rec_low(&page_cur, index,
+ ins_rec = page_cur_insert_rec_low(&page_cur,
cur1_rec, offsets1, mtr);
- if (UNIV_UNLIKELY(!ins_rec)) {
- ib::fatal() << new_block->page.id()
- << "rec offset " << page_offset(rec)
- << ", cur1 offset "
- << page_offset(page_cur_get_rec(&cur1))
- << ", cur_rec offset "
- << page_offset(cur_rec);
+ if (UNIV_UNLIKELY(!ins_rec || moved >= max_move)) {
+ return DB_CORRUPTION;
}
rec_move[moved].new_rec = ins_rec;
@@ -1515,15 +1525,13 @@ rtr_page_copy_rec_list_start_no_locks(
rec_move[moved].moved = false;
moved++;
next:
- if (moved > max_move) {
- ut_ad(0);
- break;
+ if (UNIV_UNLIKELY(!page_cur_move_to_next(&cur1))) {
+ return DB_CORRUPTION;
}
-
- page_cur_move_to_next(&cur1);
}
*num_moved = moved;
+ return DB_SUCCESS;
}
/****************************************************************//**
@@ -1544,7 +1552,7 @@ rtr_merge_mbr_changed(
ulint len;
bool changed = false;
- ut_ad(dict_index_is_spatial(cursor->index));
+ ut_ad(cursor->index()->is_spatial());
rec = btr_cur_get_rec(cursor);
@@ -1585,7 +1593,7 @@ rtr_merge_and_update_mbr(
rtr_mbr_t new_mbr;
if (rtr_merge_mbr_changed(cursor, cursor2, offsets, offsets2,
- &new_mbr)) {
+ &new_mbr)) {
rtr_update_mbr_field(cursor, offsets, cursor2, child_page,
&new_mbr, NULL, mtr);
} else {
@@ -1624,16 +1632,15 @@ rtr_check_same_block(
btr_cur_t* cursor, /*!< in/out: position at the parent entry
pointing to the child if successful */
buf_block_t* parentb,/*!< in: parent page to check */
- buf_block_t* childb, /*!< in: child Page */
mem_heap_t* heap) /*!< in: memory heap */
{
- ulint page_no = childb->page.id().page_no();
+ const uint32_t page_no =
+ btr_cur_get_block(cursor)->page.id().page_no();
rec_offs* offsets;
- rec_t* rec = page_rec_get_next(page_get_infimum_rec(
- buf_block_get_frame(parentb)));
+ rec_t* rec = page_get_infimum_rec(parentb->page.frame);
- while (!page_rec_is_supremum(rec)) {
+ while ((rec = page_rec_get_next(rec)) && !page_rec_is_supremum(rec)) {
offsets = rec_get_offsets(
rec, index, NULL, 0, ULINT_UNDEFINED, &heap);
@@ -1641,8 +1648,6 @@ rtr_check_same_block(
btr_cur_position(index, rec, parentb, cursor);
return(true);
}
-
- rec = page_rec_get_next(rec);
}
return(false);
@@ -1836,7 +1841,8 @@ rtr_estimate_n_rows_in_range(
index->set_modified(mtr);
mtr_s_lock_index(index, &mtr);
- buf_block_t* block = btr_root_block_get(index, RW_S_LATCH, &mtr);
+ dberr_t err;
+ buf_block_t* block = btr_root_block_get(index, RW_S_LATCH, &mtr, &err);
if (!block) {
err_exit:
mtr.commit();
@@ -1851,9 +1857,9 @@ err_exit:
/* Scan records in root page and calculate area. */
double area = 0;
- for (const rec_t* rec = page_rec_get_next(
- page_get_infimum_rec(block->frame));
- !page_rec_is_supremum(rec);
+ for (const rec_t* rec = page_rec_get_next_const(
+ page_get_infimum_rec(block->page.frame));
+ rec && !page_rec_is_supremum(rec);
rec = page_rec_get_next_const(rec)) {
rtr_mbr_t mbr;
double rec_area;
diff --git a/storage/innobase/gis/gis0sea.cc b/storage/innobase/gis/gis0sea.cc
index 7432aab1a29..8ca8681bce9 100644
--- a/storage/innobase/gis/gis0sea.cc
+++ b/storage/innobase/gis/gis0sea.cc
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 2016, 2018, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2017, 2021, MariaDB Corporation.
+Copyright (c) 2017, 2023, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -44,7 +44,6 @@ Created 2014/01/16 Jimmy Yang
static
bool
rtr_cur_restore_position(
- ulint latch_mode, /*!< in: BTR_SEARCH_LEAF, ... */
btr_cur_t* cursor, /*!< in: detached persistent cursor */
ulint level, /*!< in: index level */
mtr_t* mtr); /*!< in: mtr */
@@ -74,10 +73,75 @@ rtr_adjust_parent_path(
}
}
+/** Latches the leaf page or pages requested.
+@param[in] block_savepoint leaf page where the search converged
+@param[in] latch_mode BTR_SEARCH_LEAF, ...
+@param[in] cursor cursor
+@param[in] mtr mini-transaction */
+static void
+rtr_latch_leaves(
+ ulint block_savepoint,
+ btr_latch_mode latch_mode,
+ btr_cur_t* cursor,
+ mtr_t* mtr)
+{
+ compile_time_assert(int(MTR_MEMO_PAGE_S_FIX) == int(RW_S_LATCH));
+ compile_time_assert(int(MTR_MEMO_PAGE_X_FIX) == int(RW_X_LATCH));
+ compile_time_assert(int(MTR_MEMO_PAGE_SX_FIX) == int(RW_SX_LATCH));
+
+ buf_block_t* block = mtr->at_savepoint(block_savepoint);
+
+ ut_ad(block->page.id().space() == cursor->index()->table->space->id);
+ ut_ad(block->page.in_file());
+ ut_ad(mtr->memo_contains_flagged(&cursor->index()->lock,
+ MTR_MEMO_S_LOCK
+ | MTR_MEMO_X_LOCK
+ | MTR_MEMO_SX_LOCK));
+
+ switch (latch_mode) {
+ uint32_t left_page_no;
+ uint32_t right_page_no;
+ default:
+ ut_ad(latch_mode == BTR_CONT_MODIFY_TREE);
+ break;
+ case BTR_MODIFY_TREE:
+ /* It is exclusive for other operations which calls
+ btr_page_set_prev() */
+ ut_ad(mtr->memo_contains_flagged(&cursor->index()->lock,
+ MTR_MEMO_X_LOCK
+ | MTR_MEMO_SX_LOCK));
+ /* x-latch also siblings from left to right */
+ left_page_no = btr_page_get_prev(block->page.frame);
+
+ if (left_page_no != FIL_NULL) {
+ btr_block_get(*cursor->index(), left_page_no, RW_X_LATCH,
+ true, mtr);
+ }
+
+ mtr->upgrade_buffer_fix(block_savepoint, RW_X_LATCH);
+
+ right_page_no = btr_page_get_next(block->page.frame);
+
+ if (right_page_no != FIL_NULL) {
+ btr_block_get(*cursor->index(), right_page_no,
+ RW_X_LATCH, true, mtr);
+ }
+ break;
+ case BTR_SEARCH_LEAF:
+ case BTR_MODIFY_LEAF:
+ rw_lock_type_t mode =
+ rw_lock_type_t(latch_mode & (RW_X_LATCH | RW_S_LATCH));
+ static_assert(int{RW_S_LATCH} == int{BTR_SEARCH_LEAF}, "");
+ static_assert(int{RW_X_LATCH} == int{BTR_MODIFY_LEAF}, "");
+ mtr->upgrade_buffer_fix(block_savepoint, mode);
+ }
+}
+
/*************************************************************//**
Find the next matching record. This function is used by search
or record locating during index delete/update.
@return true if there is suitable record found, otherwise false */
+TRANSACTIONAL_TARGET
static
bool
rtr_pcur_getnext_from_path(
@@ -94,17 +158,15 @@ rtr_pcur_getnext_from_path(
/*!< in: index tree locked */
mtr_t* mtr) /*!< in: mtr */
{
- dict_index_t* index = btr_cur->index;
+ dict_index_t* index = btr_cur->index();
bool found = false;
page_cur_t* page_cursor;
ulint level = 0;
node_visit_t next_rec;
rtr_info_t* rtr_info = btr_cur->rtr_info;
node_seq_t page_ssn;
- ulint my_latch_mode;
ulint skip_parent = false;
bool new_split = false;
- bool need_parent;
bool for_delete = false;
bool for_undo_ins = false;
@@ -115,7 +177,7 @@ rtr_pcur_getnext_from_path(
ut_ad(dtuple_get_n_fields_cmp(tuple));
- my_latch_mode = BTR_LATCH_MODE_WITHOUT_FLAGS(latch_mode);
+ const auto my_latch_mode = BTR_LATCH_MODE_WITHOUT_FLAGS(latch_mode);
for_delete = latch_mode & BTR_RTREE_DELETE_MARK;
for_undo_ins = latch_mode & BTR_RTREE_UNDO_INS;
@@ -130,13 +192,13 @@ rtr_pcur_getnext_from_path(
/* Whether need to track parent information. Only need so
when we do tree altering operations (such as index page merge) */
- need_parent = ((my_latch_mode == BTR_MODIFY_TREE
- || my_latch_mode == BTR_CONT_MODIFY_TREE)
- && mode == PAGE_CUR_RTREE_LOCATE);
+ static_assert(BTR_CONT_MODIFY_TREE == (4 | BTR_MODIFY_TREE), "");
+
+ const bool need_parent = mode == PAGE_CUR_RTREE_LOCATE
+ && (my_latch_mode | 4) == BTR_CONT_MODIFY_TREE;
if (!index_locked) {
- ut_ad(latch_mode & BTR_SEARCH_LEAF
- || latch_mode & BTR_MODIFY_LEAF);
+ ut_ad(mtr->is_empty());
mtr_s_lock_index(index, mtr);
} else {
ut_ad(mtr->memo_contains_flagged(&index->lock,
@@ -155,15 +217,13 @@ rtr_pcur_getnext_from_path(
buf_block_t* block;
node_seq_t path_ssn;
const page_t* page;
- ulint rw_latch = RW_X_LATCH;
- ulint tree_idx;
+ rw_lock_type_t rw_latch;
- mutex_enter(&rtr_info->rtr_path_mutex);
+ mysql_mutex_lock(&rtr_info->rtr_path_mutex);
next_rec = rtr_info->path->back();
rtr_info->path->pop_back();
level = next_rec.level;
path_ssn = next_rec.seq_no;
- tree_idx = btr_cur->tree_height - level - 1;
/* Maintain the parent path info as well, if needed */
if (need_parent && !skip_parent && !new_split) {
@@ -201,7 +261,7 @@ rtr_pcur_getnext_from_path(
== rtr_info->parent_path->back().child_no);
}
- mutex_exit(&rtr_info->rtr_path_mutex);
+ mysql_mutex_unlock(&rtr_info->rtr_path_mutex);
skip_parent = false;
new_split = false;
@@ -214,73 +274,36 @@ rtr_pcur_getnext_from_path(
One reason for pre-latch is that we might need to position
some parent position (requires latch) during search */
if (level == 0) {
- /* S latched for SEARCH_LEAF, and X latched
- for MODIFY_LEAF */
- if (my_latch_mode <= BTR_MODIFY_LEAF) {
- rw_latch = my_latch_mode;
- }
-
- if (my_latch_mode == BTR_CONT_MODIFY_TREE
- || my_latch_mode == BTR_MODIFY_TREE) {
- rw_latch = RW_NO_LATCH;
- }
-
- } else if (level == target_level) {
+ static_assert(ulint{BTR_SEARCH_LEAF} ==
+ ulint{RW_S_LATCH}, "");
+ static_assert(ulint{BTR_MODIFY_LEAF} ==
+ ulint{RW_X_LATCH}, "");
+ rw_latch = (my_latch_mode | 4) == BTR_CONT_MODIFY_TREE
+ ? RW_NO_LATCH
+ : rw_lock_type_t(my_latch_mode);
+ } else {
rw_latch = RW_X_LATCH;
}
- /* Release previous locked blocks */
- if (my_latch_mode != BTR_SEARCH_LEAF) {
- for (ulint idx = 0; idx < btr_cur->tree_height;
- idx++) {
- if (rtr_info->tree_blocks[idx]) {
- mtr_release_block_at_savepoint(
- mtr,
- rtr_info->tree_savepoints[idx],
- rtr_info->tree_blocks[idx]);
- rtr_info->tree_blocks[idx] = NULL;
- }
- }
- for (ulint idx = RTR_MAX_LEVELS; idx < RTR_MAX_LEVELS + 3;
- idx++) {
- if (rtr_info->tree_blocks[idx]) {
- mtr_release_block_at_savepoint(
- mtr,
- rtr_info->tree_savepoints[idx],
- rtr_info->tree_blocks[idx]);
- rtr_info->tree_blocks[idx] = NULL;
- }
- }
+ if (my_latch_mode == BTR_MODIFY_LEAF) {
+ mtr->rollback_to_savepoint(1);
}
- /* set up savepoint to record any locks to be taken */
- rtr_info->tree_savepoints[tree_idx] = mtr_set_savepoint(mtr);
-
-#ifdef UNIV_RTR_DEBUG
- ut_ad(!(rw_lock_own_flagged(&btr_cur->page_cur.block->lock,
- RW_LOCK_FLAG_X | RW_LOCK_FLAG_S))
- || my_latch_mode == BTR_MODIFY_TREE
- || my_latch_mode == BTR_CONT_MODIFY_TREE
- || !page_is_leaf(buf_block_get_frame(
- btr_cur->page_cur.block)));
-#endif /* UNIV_RTR_DEBUG */
-
- dberr_t err = DB_SUCCESS;
+ ut_ad((my_latch_mode | 4) == BTR_CONT_MODIFY_TREE
+ || !page_is_leaf(btr_cur_get_page(btr_cur))
+ || !btr_cur->page_cur.block->page.lock.have_any());
+ const auto block_savepoint = mtr->get_savepoint();
block = buf_page_get_gen(
page_id_t(index->table->space_id,
next_rec.page_no), zip_size,
- rw_latch, NULL, BUF_GET, __FILE__, __LINE__, mtr, &err);
+ rw_latch, NULL, BUF_GET, mtr);
- if (block == NULL) {
- continue;
- } else if (rw_latch != RW_NO_LATCH) {
- ut_ad(!dict_index_is_ibuf(index));
- buf_block_dbg_add_level(block, SYNC_TREE_NODE);
+ if (!block) {
+ found = false;
+ break;
}
- rtr_info->tree_blocks[tree_idx] = block;
-
page = buf_block_get_frame(block);
page_ssn = page_get_ssn_id(page);
@@ -313,20 +336,20 @@ rtr_pcur_getnext_from_path(
page_cursor = btr_cur_get_page_cur(btr_cur);
page_cursor->rec = NULL;
+ page_cursor->block = block;
if (mode == PAGE_CUR_RTREE_LOCATE) {
- if (level == target_level && level == 0) {
- ulint low_match;
+ if (target_level == 0 && level == 0) {
+ ulint low_match = 0, up_match = 0;
found = false;
- low_match = page_cur_search(
- block, index, tuple,
- PAGE_CUR_LE,
- btr_cur_get_page_cur(btr_cur));
-
- if (low_match == dtuple_get_n_fields_cmp(
- tuple)) {
+ if (!page_cur_search_with_match(
+ tuple, PAGE_CUR_LE,
+ &up_match, &low_match,
+ btr_cur_get_page_cur(btr_cur), nullptr)
+ && low_match
+ == dtuple_get_n_fields_cmp(tuple)) {
rec_t* rec = btr_cur_get_rec(btr_cur);
if (!rec_get_deleted_flag(rec,
@@ -366,17 +389,12 @@ rtr_pcur_getnext_from_path(
BTR_PCUR_IS_POSITIONED;
r_cursor->latch_mode = my_latch_mode;
btr_pcur_store_position(r_cursor, mtr);
-#ifdef UNIV_DEBUG
- ulint num_stored =
- rtr_store_parent_path(
- block, btr_cur,
- rw_latch, level, mtr);
- ut_ad(num_stored > 0);
-#else
+ ut_d(ulint num_stored =)
rtr_store_parent_path(
- block, btr_cur, rw_latch,
+ block, btr_cur,
+ btr_latch_mode(rw_latch),
level, mtr);
-#endif /* UNIV_DEBUG */
+ ut_ad(num_stored > 0);
}
}
} else {
@@ -395,44 +413,44 @@ rtr_pcur_getnext_from_path(
trx_t* trx = thr_get_trx(
btr_cur->rtr_info->thr);
- lock_mutex_enter();
- lock_init_prdt_from_mbr(
- &prdt, &btr_cur->rtr_info->mbr,
- mode, trx->lock.lock_heap);
- lock_mutex_exit();
+ {
+ TMLockTrxGuard g{TMLockTrxArgs(*trx)};
+ lock_init_prdt_from_mbr(
+ &prdt, &btr_cur->rtr_info->mbr,
+ mode, trx->lock.lock_heap);
+ }
if (rw_latch == RW_NO_LATCH) {
- rw_lock_s_lock(&(block->lock));
+ block->page.lock.s_lock();
}
lock_prdt_lock(block, &prdt, index, LOCK_S,
LOCK_PREDICATE, btr_cur->rtr_info->thr);
if (rw_latch == RW_NO_LATCH) {
- rw_lock_s_unlock(&(block->lock));
+ block->page.lock.s_unlock();
}
}
if (found) {
if (level == target_level) {
- page_cur_t* r_cur;;
+ ut_ad(block
+ == mtr->at_savepoint(block_savepoint));
if (my_latch_mode == BTR_MODIFY_TREE
&& level == 0) {
ut_ad(rw_latch == RW_NO_LATCH);
- btr_cur_latch_leaves(
- block,
+ rtr_latch_leaves(
+ block_savepoint,
BTR_MODIFY_TREE,
btr_cur, mtr);
}
- r_cur = btr_cur_get_page_cur(btr_cur);
-
page_cur_position(
page_cur_get_rec(page_cursor),
page_cur_get_block(page_cursor),
- r_cur);
+ btr_cur_get_page_cur(btr_cur));
btr_cur->low_match = level != 0 ?
DICT_INDEX_SPATIAL_NODEPTR_SIZE + 1
@@ -444,25 +462,18 @@ rtr_pcur_getnext_from_path(
last node just located */
skip_parent = true;
} else {
- /* Release latch on the current page */
- ut_ad(rtr_info->tree_blocks[tree_idx]);
-
- mtr_release_block_at_savepoint(
- mtr, rtr_info->tree_savepoints[tree_idx],
- rtr_info->tree_blocks[tree_idx]);
- rtr_info->tree_blocks[tree_idx] = NULL;
+ mtr->release_last_page();
}
} while (!rtr_info->path->empty());
const rec_t* rec = btr_cur_get_rec(btr_cur);
- if (page_rec_is_infimum(rec) || page_rec_is_supremum(rec)) {
- mtr_commit(mtr);
- mtr_start(mtr);
+ if (!page_rec_is_user_rec(rec)) {
+ mtr->commit();
+ mtr->start();
} else if (!index_locked) {
- mtr_memo_release(mtr, dict_index_get_lock(index),
- MTR_MEMO_X_LOCK);
+ mtr->release(index->lock);
}
return(found);
@@ -489,13 +500,13 @@ rtr_pcur_move_to_next(
ut_a(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
- mutex_enter(&rtr_info->matches->rtr_match_mutex);
+ mysql_mutex_lock(&rtr_info->matches->rtr_match_mutex);
/* First retrieve the next record on the current page */
if (!rtr_info->matches->matched_recs->empty()) {
rtr_rec_t rec;
rec = rtr_info->matches->matched_recs->back();
rtr_info->matches->matched_recs->pop_back();
- mutex_exit(&rtr_info->matches->rtr_match_mutex);
+ mysql_mutex_unlock(&rtr_info->matches->rtr_match_mutex);
cursor->btr_cur.page_cur.rec = rec.r_rec;
cursor->btr_cur.page_cur.block = &rtr_info->matches->block;
@@ -504,7 +515,7 @@ rtr_pcur_move_to_next(
return(true);
}
- mutex_exit(&rtr_info->matches->rtr_match_mutex);
+ mysql_mutex_unlock(&rtr_info->matches->rtr_match_mutex);
/* Fetch the next page */
return(rtr_pcur_getnext_from_path(tuple, mode, &cursor->btr_cur,
@@ -512,181 +523,601 @@ rtr_pcur_move_to_next(
false, mtr));
}
+#ifdef UNIV_DEBUG
/*************************************************************//**
Check if the cursor holds record pointing to the specified child page
@return true if it is (pointing to the child page) false otherwise */
-static
-bool
-rtr_compare_cursor_rec(
-/*===================*/
- dict_index_t* index, /*!< in: index */
- btr_cur_t* cursor, /*!< in: Cursor to check */
- ulint page_no, /*!< in: desired child page number */
- mem_heap_t** heap) /*!< in: memory heap */
+static void rtr_compare_cursor_rec(const rec_t *rec, dict_index_t *index,
+ ulint page_no)
{
- const rec_t* rec;
- rec_offs* offsets;
+ if (!rec)
+ return;
+ mem_heap_t *heap= nullptr;
+ rec_offs *offsets= rec_get_offsets(rec, index, nullptr, 0,
+ ULINT_UNDEFINED, &heap);
+ ut_ad(btr_node_ptr_get_child_page_no(rec, offsets) == page_no);
+ mem_heap_free(heap);
+}
+#endif
+
+TRANSACTIONAL_TARGET
+dberr_t rtr_search_to_nth_level(ulint level, const dtuple_t *tuple,
+ page_cur_mode_t mode,
+ btr_latch_mode latch_mode,
+ btr_cur_t *cur, mtr_t *mtr)
+{
+ page_cur_mode_t page_mode;
+ page_cur_mode_t search_mode= PAGE_CUR_UNSUPP;
+
+ bool mbr_adj= false;
+ bool found= false;
+ dict_index_t *const index= cur->index();
+
+ mem_heap_t *heap= nullptr;
+ rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
+ rec_offs *offsets= offsets_;
+ rec_offs_init(offsets_);
+ ut_ad(level == 0 || mode == PAGE_CUR_LE || RTREE_SEARCH_MODE(mode));
+ ut_ad(dict_index_check_search_tuple(index, tuple));
+ ut_ad(dtuple_check_typed(tuple));
+ ut_ad(index->is_spatial());
+ ut_ad(index->page != FIL_NULL);
+
+ MEM_UNDEFINED(&cur->up_match, sizeof cur->up_match);
+ MEM_UNDEFINED(&cur->up_bytes, sizeof cur->up_bytes);
+ MEM_UNDEFINED(&cur->low_match, sizeof cur->low_match);
+ MEM_UNDEFINED(&cur->low_bytes, sizeof cur->low_bytes);
+ ut_d(cur->up_match= ULINT_UNDEFINED);
+ ut_d(cur->low_match= ULINT_UNDEFINED);
+
+ const bool latch_by_caller= latch_mode & BTR_ALREADY_S_LATCHED;
+
+ ut_ad(!latch_by_caller
+ || mtr->memo_contains_flagged(&index->lock, MTR_MEMO_S_LOCK
+ | MTR_MEMO_SX_LOCK));
+ latch_mode= BTR_LATCH_MODE_WITHOUT_FLAGS(latch_mode);
+
+ ut_ad(!latch_by_caller || latch_mode == BTR_SEARCH_LEAF ||
+ latch_mode == BTR_MODIFY_LEAF);
+
+ cur->flag= BTR_CUR_BINARY;
+
+#ifndef BTR_CUR_ADAPT
+ buf_block_t *guess= nullptr;
+#else
+ btr_search_t *const info= btr_search_get_info(index);
+ buf_block_t *guess= info->root_guess;
+#endif
+
+ /* Store the position of the tree latch we push to mtr so that we
+ know how to release it when we have latched leaf node(s) */
+
+ const ulint savepoint= mtr->get_savepoint();
+
+ rw_lock_type_t upper_rw_latch, root_leaf_rw_latch= RW_NO_LATCH;
+
+ switch (latch_mode) {
+ case BTR_MODIFY_TREE:
+ mtr_x_lock_index(index, mtr);
+ upper_rw_latch= root_leaf_rw_latch= RW_X_LATCH;
+ break;
+ case BTR_CONT_MODIFY_TREE:
+ ut_ad(mtr->memo_contains_flagged(&index->lock, MTR_MEMO_X_LOCK |
+ MTR_MEMO_SX_LOCK));
+ upper_rw_latch= RW_X_LATCH;
+ break;
+ default:
+ ut_ad(latch_mode != BTR_MODIFY_PREV);
+ ut_ad(latch_mode != BTR_SEARCH_PREV);
+ if (!latch_by_caller)
+ mtr_s_lock_index(index, mtr);
+ upper_rw_latch= root_leaf_rw_latch= RW_S_LATCH;
+ if (latch_mode == BTR_MODIFY_LEAF)
+ root_leaf_rw_latch= RW_X_LATCH;
+ }
+
+ auto root_savepoint= mtr->get_savepoint();
+ const ulint zip_size= index->table->space->zip_size();
+
+ /* Start with the root page. */
+ page_id_t page_id(index->table->space_id, index->page);
+
+ ulint up_match= 0, up_bytes= 0, low_match= 0, low_bytes= 0;
+ ulint height= ULINT_UNDEFINED;
+
+ /* We use these modified search modes on non-leaf levels of the
+ B-tree. These let us end up in the right B-tree leaf. In that leaf
+ we use the original search mode. */
+
+ switch (mode) {
+ case PAGE_CUR_GE:
+ page_mode= PAGE_CUR_L;
+ break;
+ case PAGE_CUR_G:
+ page_mode= PAGE_CUR_LE;
+ break;
+ default:
+#ifdef PAGE_CUR_LE_OR_EXTENDS
+ ut_ad(mode == PAGE_CUR_L || mode == PAGE_CUR_LE
+ || RTREE_SEARCH_MODE(mode)
+ || mode == PAGE_CUR_LE_OR_EXTENDS);
+#else /* PAGE_CUR_LE_OR_EXTENDS */
+ ut_ad(mode == PAGE_CUR_L || mode == PAGE_CUR_LE
+ || RTREE_SEARCH_MODE(mode));
+#endif /* PAGE_CUR_LE_OR_EXTENDS */
+ page_mode= mode;
+ break;
+ }
+
+ search_loop:
+ auto buf_mode= BUF_GET;
+ ulint rw_latch= RW_NO_LATCH;
+
+ if (height)
+ {
+ /* We are about to fetch the root or a non-leaf page. */
+ if (latch_mode != BTR_MODIFY_TREE || height == level)
+ /* If doesn't have SX or X latch of index,
+ each page should be latched before reading. */
+ rw_latch= upper_rw_latch;
+ }
+ else if (latch_mode <= BTR_MODIFY_LEAF)
+ rw_latch= latch_mode;
+
+ dberr_t err;
+ auto block_savepoint= mtr->get_savepoint();
+ buf_block_t *block= buf_page_get_gen(page_id, zip_size, rw_latch, guess,
+ buf_mode, mtr, &err, false);
+ if (!block)
+ {
+ if (err == DB_DECRYPTION_FAILED)
+ btr_decryption_failed(*index);
+ func_exit:
+ if (UNIV_LIKELY_NULL(heap))
+ mem_heap_free(heap);
+
+ if (mbr_adj)
+ /* remember that we will need to adjust parent MBR */
+ cur->rtr_info->mbr_adj= true;
+
+ return err;
+ }
+
+ const page_t *page= buf_block_get_frame(block);
+#ifdef UNIV_ZIP_DEBUG
+ if (rw_latch != RW_NO_LATCH) {
+ const page_zip_des_t *page_zip= buf_block_get_page_zip(block);
+ ut_a(!page_zip || page_zip_validate(page_zip, page, index));
+ }
+#endif /* UNIV_ZIP_DEBUG */
+
+ ut_ad(fil_page_index_page_check(page));
+ ut_ad(index->id == btr_page_get_index_id(page));
+
+ if (height != ULINT_UNDEFINED);
+ else if (page_is_leaf(page) &&
+ rw_latch != RW_NO_LATCH && rw_latch != root_leaf_rw_latch)
+ {
+ /* The root page is also a leaf page (root_leaf).
+ We should reacquire the page, because the root page
+ is latched differently from leaf pages. */
+ ut_ad(root_leaf_rw_latch != RW_NO_LATCH);
+ ut_ad(rw_latch == RW_S_LATCH || rw_latch == RW_SX_LATCH);
+
+ ut_ad(block == mtr->at_savepoint(block_savepoint));
+ mtr->rollback_to_savepoint(block_savepoint);
+
+ upper_rw_latch= root_leaf_rw_latch;
+ goto search_loop;
+ }
+ else
+ {
+ /* We are in the root node */
+
+ height= btr_page_get_level(page);
+ cur->tree_height= height + 1;
+
+ ut_ad(cur->rtr_info);
+
+ /* If SSN in memory is not initialized, fetch it from root page */
+ if (!rtr_get_current_ssn_id(index))
+ /* FIXME: do this in dict_load_table_one() */
+ index->set_ssn(page_get_ssn_id(page) + 1);
+
+ /* Save the MBR */
+ cur->rtr_info->thr= cur->thr;
+ rtr_get_mbr_from_tuple(tuple, &cur->rtr_info->mbr);
+
+#ifdef BTR_CUR_ADAPT
+ info->root_guess= block;
+#endif
+ }
+
+ if (height == 0) {
+ if (rw_latch == RW_NO_LATCH)
+ {
+ ut_ad(block == mtr->at_savepoint(block_savepoint));
+ rtr_latch_leaves(block_savepoint, latch_mode, cur, mtr);
+ }
+
+ switch (latch_mode) {
+ case BTR_MODIFY_TREE:
+ case BTR_CONT_MODIFY_TREE:
+ break;
+ default:
+ if (!latch_by_caller)
+ {
+ /* Release the tree s-latch */
+ mtr->rollback_to_savepoint(savepoint,
+ savepoint + 1);
+ block_savepoint--;
+ root_savepoint--;
+ }
+ /* release upper blocks */
+ if (savepoint < block_savepoint)
+ mtr->rollback_to_savepoint(savepoint, block_savepoint);
+ }
+
+ page_mode= mode;
+ }
+
+ /* Remember the page search mode */
+ search_mode= page_mode;
+
+ /* Some adjustment on search mode, when the page search mode is
+ PAGE_CUR_RTREE_LOCATE or PAGE_CUR_RTREE_INSERT, as we are searching
+ with MBRs. When it is not the target level, we should search all
+ sub-trees that "CONTAIN" the search range/MBR. When it is at the
+ target level, the search becomes PAGE_CUR_LE */
+
+ if (page_mode == PAGE_CUR_RTREE_INSERT)
+ {
+ page_mode= (level == height)
+ ? PAGE_CUR_LE
+ : PAGE_CUR_RTREE_INSERT;
+
+ ut_ad(!page_is_leaf(page) || page_mode == PAGE_CUR_LE);
+ }
+ else if (page_mode == PAGE_CUR_RTREE_LOCATE && level == height)
+ page_mode= level == 0 ? PAGE_CUR_LE : PAGE_CUR_RTREE_GET_FATHER;
+
+ up_match= 0;
+ low_match= 0;
+
+ if (latch_mode == BTR_MODIFY_TREE || latch_mode == BTR_CONT_MODIFY_TREE)
+ /* Tree are locked, no need for Page Lock to protect the "path" */
+ cur->rtr_info->need_page_lock= false;
+
+ cur->page_cur.block= block;
+
+ if (page_mode >= PAGE_CUR_CONTAIN)
+ {
+ found= rtr_cur_search_with_match(block, index, tuple, page_mode,
+ &cur->page_cur, cur->rtr_info);
+
+ /* Need to use BTR_MODIFY_TREE to do the MBR adjustment */
+ if (search_mode == PAGE_CUR_RTREE_INSERT && cur->rtr_info->mbr_adj) {
+ static_assert(BTR_MODIFY_TREE == (8 | BTR_MODIFY_LEAF), "");
+
+ if (!(latch_mode & 8))
+ /* Parent MBR needs updated, should retry with BTR_MODIFY_TREE */
+ goto func_exit;
+
+ cur->rtr_info->mbr_adj= false;
+ mbr_adj= true;
+ }
+
+ if (found && page_mode == PAGE_CUR_RTREE_GET_FATHER)
+ cur->low_match= DICT_INDEX_SPATIAL_NODEPTR_SIZE + 1;
+ }
+ else
+ {
+ /* Search for complete index fields. */
+ up_bytes= low_bytes= 0;
+ if (page_cur_search_with_match(tuple, page_mode, &up_match,
+ &low_match, &cur->page_cur, nullptr)) {
+ err= DB_CORRUPTION;
+ goto func_exit;
+ }
+ }
+
+ /* If this is the desired level, leave the loop */
+
+ ut_ad(height == btr_page_get_level(btr_cur_get_page(cur)));
+
+ /* Add Predicate lock if it is serializable isolation
+ and only if it is in the search case */
+ if (mode >= PAGE_CUR_CONTAIN && mode != PAGE_CUR_RTREE_INSERT &&
+ mode != PAGE_CUR_RTREE_LOCATE && cur->rtr_info->need_prdt_lock)
+ {
+ lock_prdt_t prdt;
+
+ {
+ trx_t* trx= thr_get_trx(cur->thr);
+ TMLockTrxGuard g{TMLockTrxArgs(*trx)};
+ lock_init_prdt_from_mbr(&prdt, &cur->rtr_info->mbr, mode,
+ trx->lock.lock_heap);
+ }
- rec = btr_cur_get_rec(cursor);
+ if (rw_latch == RW_NO_LATCH && height != 0)
+ block->page.lock.s_lock();
- offsets = rec_get_offsets(rec, index, NULL, 0, ULINT_UNDEFINED, heap);
+ lock_prdt_lock(block, &prdt, index, LOCK_S, LOCK_PREDICATE, cur->thr);
+
+ if (rw_latch == RW_NO_LATCH && height != 0)
+ block->page.lock.s_unlock();
+ }
- return(btr_node_ptr_get_child_page_no(rec, offsets) == page_no);
+ if (level != height)
+ {
+ ut_ad(height > 0);
+
+ height--;
+ guess= nullptr;
+
+ const rec_t *node_ptr= btr_cur_get_rec(cur);
+
+ offsets= rec_get_offsets(node_ptr, index, offsets, 0,
+ ULINT_UNDEFINED, &heap);
+
+ if (page_rec_is_supremum(node_ptr))
+ {
+ cur->low_match= 0;
+ cur->up_match= 0;
+ goto func_exit;
+ }
+
+ /* If we are doing insertion or record locating,
+ remember the tree nodes we visited */
+ if (page_mode == PAGE_CUR_RTREE_INSERT ||
+ (search_mode == PAGE_CUR_RTREE_LOCATE &&
+ latch_mode != BTR_MODIFY_LEAF))
+ {
+ const bool add_latch= latch_mode == BTR_MODIFY_TREE &&
+ rw_latch == RW_NO_LATCH;
+
+ if (add_latch)
+ {
+ ut_ad(mtr->memo_contains_flagged(&index->lock, MTR_MEMO_X_LOCK |
+ MTR_MEMO_SX_LOCK));
+ block->page.lock.s_lock();
+ }
+
+ /* Store the parent cursor location */
+ ut_d(auto num_stored=)
+ rtr_store_parent_path(block, cur, latch_mode, height + 1, mtr);
+
+ if (page_mode == PAGE_CUR_RTREE_INSERT)
+ {
+ btr_pcur_t *r_cursor= rtr_get_parent_cursor(cur, height + 1, true);
+ /* If it is insertion, there should be only one parent for
+ each level traverse */
+ ut_ad(num_stored == 1);
+ node_ptr= btr_pcur_get_rec(r_cursor);
+ }
+
+ if (add_latch)
+ block->page.lock.s_unlock();
+
+ ut_ad(!page_rec_is_supremum(node_ptr));
+ }
+
+ ut_ad(page_mode == search_mode ||
+ (page_mode == PAGE_CUR_WITHIN &&
+ search_mode == PAGE_CUR_RTREE_LOCATE));
+ page_mode= search_mode;
+
+ if (height == level && latch_mode == BTR_MODIFY_TREE)
+ {
+ ut_ad(upper_rw_latch == RW_X_LATCH);
+ for (auto i= root_savepoint, n= mtr->get_savepoint(); i < n; i++)
+ mtr->upgrade_buffer_fix(i, RW_X_LATCH);
+ }
+
+ /* Go to the child node */
+ page_id.set_page_no(btr_node_ptr_get_child_page_no(node_ptr, offsets));
+
+ if (page_mode >= PAGE_CUR_CONTAIN && page_mode != PAGE_CUR_RTREE_INSERT)
+ {
+ rtr_node_path_t *path= cur->rtr_info->path;
+
+ if (found && !path->empty())
+ {
+ ut_ad(path->back().page_no == page_id.page_no());
+ path->pop_back();
+#ifdef UNIV_DEBUG
+ if (page_mode == PAGE_CUR_RTREE_LOCATE &&
+ latch_mode != BTR_MODIFY_LEAF)
+ {
+ btr_pcur_t* pcur= cur->rtr_info->parent_path->back().cursor;
+ rec_t *my_node_ptr= btr_pcur_get_rec(pcur);
+
+ offsets= rec_get_offsets(my_node_ptr, index, offsets,
+ 0, ULINT_UNDEFINED, &heap);
+
+ ut_ad(page_id.page_no() ==
+ btr_node_ptr_get_child_page_no(my_node_ptr, offsets));
+ }
+#endif
+ }
+ }
+
+ goto search_loop;
+ }
+
+ if (level)
+ {
+ if (upper_rw_latch == RW_NO_LATCH)
+ {
+ ut_ad(latch_mode == BTR_CONT_MODIFY_TREE);
+ btr_block_get(*index, page_id.page_no(), RW_X_LATCH, false, mtr, &err);
+ }
+ else
+ {
+ ut_ad(mtr->memo_contains_flagged(block, upper_rw_latch));
+ ut_ad(!latch_by_caller);
+ }
+
+ if (page_mode <= PAGE_CUR_LE)
+ {
+ cur->low_match= low_match;
+ cur->up_match= up_match;
+ }
+ }
+ else
+ {
+ cur->low_match= low_match;
+ cur->low_bytes= low_bytes;
+ cur->up_match= up_match;
+ cur->up_bytes= up_bytes;
+
+ ut_ad(up_match != ULINT_UNDEFINED || mode != PAGE_CUR_GE);
+ ut_ad(up_match != ULINT_UNDEFINED || mode != PAGE_CUR_LE);
+ ut_ad(low_match != ULINT_UNDEFINED || mode != PAGE_CUR_LE);
+ }
+
+ goto func_exit;
+}
+
+dberr_t rtr_search_leaf(btr_cur_t *cur, const dtuple_t *tuple,
+ btr_latch_mode latch_mode,
+ mtr_t *mtr, page_cur_mode_t mode)
+{
+ return rtr_search_to_nth_level(0, tuple, mode, latch_mode, cur, mtr);
+}
+
+/** Search for a spatial index leaf page record.
+@param pcur cursor
+@param tuple search tuple
+@param mode search mode
+@param mtr mini-transaction */
+dberr_t rtr_search_leaf(btr_pcur_t *pcur, const dtuple_t *tuple,
+ page_cur_mode_t mode, mtr_t *mtr)
+{
+#ifdef UNIV_DEBUG
+ switch (mode) {
+ case PAGE_CUR_CONTAIN:
+ case PAGE_CUR_INTERSECT:
+ case PAGE_CUR_WITHIN:
+ case PAGE_CUR_DISJOINT:
+ case PAGE_CUR_MBR_EQUAL:
+ break;
+ default:
+ ut_ad("invalid mode" == 0);
+ }
+#endif
+ pcur->latch_mode= BTR_SEARCH_LEAF;
+ pcur->search_mode= mode;
+ pcur->pos_state= BTR_PCUR_IS_POSITIONED;
+ pcur->trx_if_known= nullptr;
+ return rtr_search_leaf(&pcur->btr_cur, tuple, BTR_SEARCH_LEAF, mtr, mode);
}
/**************************************************************//**
Initializes and opens a persistent cursor to an index tree. It should be
-closed with btr_pcur_close. Mainly called by row_search_index_entry() */
-void
-rtr_pcur_open_low(
-/*==============*/
- dict_index_t* index, /*!< in: index */
- ulint level, /*!< in: level in the rtree */
+closed with btr_pcur_close. */
+bool rtr_search(
const dtuple_t* tuple, /*!< in: tuple on which search done */
- page_cur_mode_t mode, /*!< in: PAGE_CUR_RTREE_LOCATE, ... */
- ulint latch_mode,/*!< in: BTR_SEARCH_LEAF, ... */
+ btr_latch_mode latch_mode,/*!< in: BTR_MODIFY_LEAF, ... */
btr_pcur_t* cursor, /*!< in: memory buffer for persistent cursor */
- const char* file, /*!< in: file name */
- unsigned line, /*!< in: line where called */
mtr_t* mtr) /*!< in: mtr */
{
- btr_cur_t* btr_cursor;
- ulint n_fields;
- ulint low_match;
- rec_t* rec;
- bool tree_latched = false;
- bool for_delete = false;
- bool for_undo_ins = false;
-
- ut_ad(level == 0);
-
- ut_ad(latch_mode & BTR_MODIFY_LEAF || latch_mode & BTR_MODIFY_TREE);
- ut_ad(mode == PAGE_CUR_RTREE_LOCATE);
+ static_assert(BTR_MODIFY_TREE == (8 | BTR_MODIFY_LEAF), "");
+ ut_ad(latch_mode & BTR_MODIFY_LEAF);
+ ut_ad(!(latch_mode & BTR_ALREADY_S_LATCHED));
+ ut_ad(mtr->is_empty());
/* Initialize the cursor */
btr_pcur_init(cursor);
- for_delete = latch_mode & BTR_RTREE_DELETE_MARK;
- for_undo_ins = latch_mode & BTR_RTREE_UNDO_INS;
-
cursor->latch_mode = BTR_LATCH_MODE_WITHOUT_FLAGS(latch_mode);
- cursor->search_mode = mode;
+ cursor->search_mode = PAGE_CUR_RTREE_LOCATE;
+ cursor->trx_if_known = nullptr;
+
+ if (latch_mode & 8) {
+ mtr_x_lock_index(cursor->index(), mtr);
+ } else {
+ latch_mode
+ = btr_latch_mode(latch_mode | BTR_ALREADY_S_LATCHED);
+ mtr_sx_lock_index(cursor->index(), mtr);
+ }
/* Search with the tree cursor */
- btr_cursor = btr_pcur_get_btr_cur(cursor);
+ btr_cur_t* btr_cursor = btr_pcur_get_btr_cur(cursor);
- btr_cursor->rtr_info = rtr_create_rtr_info(false, false,
- btr_cursor, index);
+ btr_cursor->rtr_info
+ = rtr_create_rtr_info(false, false,
+ btr_cursor, cursor->index());
- /* Purge will SX lock the tree instead of take Page Locks */
if (btr_cursor->thr) {
btr_cursor->rtr_info->need_page_lock = true;
btr_cursor->rtr_info->thr = btr_cursor->thr;
}
- btr_cur_search_to_nth_level(index, level, tuple, mode, latch_mode,
- btr_cursor, file, line, mtr);
- cursor->pos_state = BTR_PCUR_IS_POSITIONED;
-
- cursor->trx_if_known = NULL;
-
- low_match = btr_pcur_get_low_match(cursor);
-
- rec = btr_pcur_get_rec(cursor);
+ if (rtr_search_leaf(btr_cursor, tuple, latch_mode, mtr)
+ != DB_SUCCESS) {
+ return true;
+ }
- n_fields = dtuple_get_n_fields(tuple);
+ cursor->pos_state = BTR_PCUR_IS_POSITIONED;
- if (latch_mode & BTR_ALREADY_S_LATCHED) {
- ut_ad(mtr->memo_contains(index->lock, MTR_MEMO_S_LOCK));
- tree_latched = true;
- }
+ const rec_t* rec = btr_pcur_get_rec(cursor);
- if (latch_mode & BTR_MODIFY_TREE) {
- ut_ad(mtr->memo_contains_flagged(&index->lock,
- MTR_MEMO_X_LOCK
- | MTR_MEMO_SX_LOCK));
- tree_latched = true;
- }
+ const bool d= rec_get_deleted_flag(
+ rec, cursor->index()->table->not_redundant());
- if (page_rec_is_infimum(rec) || low_match != n_fields
- || (rec_get_deleted_flag(rec, dict_table_is_comp(index->table))
- && (for_delete || for_undo_ins))) {
+ if (page_rec_is_infimum(rec)
+ || btr_pcur_get_low_match(cursor) != dtuple_get_n_fields(tuple)
+ || (d && latch_mode
+ & (BTR_RTREE_DELETE_MARK | BTR_RTREE_UNDO_INS))) {
- if (rec_get_deleted_flag(rec, dict_table_is_comp(index->table))
- && for_delete) {
+ if (d && latch_mode & BTR_RTREE_DELETE_MARK) {
btr_cursor->rtr_info->fd_del = true;
btr_cursor->low_match = 0;
}
- /* Did not find matched row in first dive. Release
- latched block if any before search more pages */
- if (latch_mode & BTR_MODIFY_LEAF) {
- ulint tree_idx = btr_cursor->tree_height - 1;
- rtr_info_t* rtr_info = btr_cursor->rtr_info;
-
- ut_ad(level == 0);
-
- if (rtr_info->tree_blocks[tree_idx]) {
- mtr_release_block_at_savepoint(
- mtr,
- rtr_info->tree_savepoints[tree_idx],
- rtr_info->tree_blocks[tree_idx]);
- rtr_info->tree_blocks[tree_idx] = NULL;
- }
- }
- bool ret = rtr_pcur_getnext_from_path(
- tuple, mode, btr_cursor, level, latch_mode,
- tree_latched, mtr);
+ mtr->rollback_to_savepoint(1);
- if (ret) {
- low_match = btr_pcur_get_low_match(cursor);
- ut_ad(low_match == n_fields);
+ if (!rtr_pcur_getnext_from_path(tuple, PAGE_CUR_RTREE_LOCATE,
+ btr_cursor, 0, latch_mode,
+ true, mtr)) {
+ return true;
}
+
+ ut_ad(btr_pcur_get_low_match(cursor)
+ == dtuple_get_n_fields(tuple));
}
+
+ if (!(latch_mode & 8)) {
+ mtr->rollback_to_savepoint(0, 1);
+ }
+
+ return false;
}
/* Get the rtree page father.
-@param[in] index rtree index
-@param[in] block child page in the index
-@param[in] mtr mtr
+@param[in,out] mtr mtr
@param[in] sea_cur search cursor, contains information
about parent nodes in search
-@param[in] cursor cursor on node pointer record,
- its page x-latched */
-void
-rtr_page_get_father(
- dict_index_t* index,
- buf_block_t* block,
- mtr_t* mtr,
- btr_cur_t* sea_cur,
- btr_cur_t* cursor)
+@param[out] cursor cursor on node pointer record,
+ its page x-latched
+@return whether the cursor was successfully positioned */
+bool rtr_page_get_father(mtr_t *mtr, btr_cur_t *sea_cur, btr_cur_t *cursor)
{
- mem_heap_t* heap = mem_heap_create(100);
-#ifdef UNIV_DEBUG
- rec_offs* offsets;
-
- offsets = rtr_page_get_father_block(
- NULL, heap, index, block, mtr, sea_cur, cursor);
-
- ulint page_no = btr_node_ptr_get_child_page_no(cursor->page_cur.rec,
- offsets);
-
- ut_ad(page_no == block->page.id().page_no());
-#else
- rtr_page_get_father_block(
- NULL, heap, index, block, mtr, sea_cur, cursor);
-#endif
-
- mem_heap_free(heap);
+ mem_heap_t *heap = mem_heap_create(100);
+ rec_offs *offsets= rtr_page_get_father_block(nullptr, heap,
+ mtr, sea_cur, cursor);
+ mem_heap_free(heap);
+ return offsets != nullptr;
}
+MY_ATTRIBUTE((warn_unused_result))
/********************************************************************//**
Returns the upper level node pointer to a R-Tree page. It is assumed
that mtr holds an x-latch on the tree. */
-static void rtr_get_father_node(
- dict_index_t* index, /*!< in: index */
+static const rec_t* rtr_get_father_node(
ulint level, /*!< in: the tree level of search */
const dtuple_t* tuple, /*!< in: data tuple; NOTE: n_fields_cmp in
tuple must be set so that it cannot get
@@ -697,28 +1128,18 @@ static void rtr_get_father_node(
ulint page_no,/*!< Current page no */
mtr_t* mtr) /*!< in: mtr */
{
- mem_heap_t* heap = NULL;
- bool ret = false;
- const rec_t* rec;
- ulint n_fields;
- bool new_rtr = false;
+ const rec_t* rec = nullptr;
+ auto had_rtr = btr_cur->rtr_info;
+ dict_index_t* const index = btr_cur->index();
/* Try to optimally locate the parent node. Level should always
less than sea_cur->tree_height unless the root is splitting */
if (sea_cur && sea_cur->tree_height > level) {
ut_ad(mtr->memo_contains_flagged(&index->lock, MTR_MEMO_X_LOCK
| MTR_MEMO_SX_LOCK));
- ret = rtr_cur_restore_position(
- BTR_CONT_MODIFY_TREE, sea_cur, level, mtr);
-
- /* Once we block shrink tree nodes while there are
- active search on it, this optimal locating should always
- succeeds */
- ut_ad(ret);
-
- if (ret) {
+ if (rtr_cur_restore_position(sea_cur, level, mtr)) {
btr_pcur_t* r_cursor = rtr_get_parent_cursor(
- sea_cur, level, false);
+ sea_cur, level, false);
rec = btr_pcur_get_rec(r_cursor);
@@ -726,70 +1147,57 @@ static void rtr_get_father_node(
page_cur_position(rec,
btr_pcur_get_block(r_cursor),
btr_cur_get_page_cur(btr_cur));
- btr_cur->rtr_info = sea_cur->rtr_info;
+ had_rtr = btr_cur->rtr_info = sea_cur->rtr_info;
btr_cur->tree_height = sea_cur->tree_height;
- ut_ad(rtr_compare_cursor_rec(
- index, btr_cur, page_no, &heap));
- goto func_exit;
}
+ goto func_exit;
}
/* We arrive here in one of two scenario
1) check table and btr_valide
2) index root page being raised */
- ut_ad(!sea_cur || sea_cur->tree_height == level);
if (btr_cur->rtr_info) {
rtr_clean_rtr_info(btr_cur->rtr_info, true);
- } else {
- new_rtr = true;
}
btr_cur->rtr_info = rtr_create_rtr_info(false, false, btr_cur, index);
- if (sea_cur && sea_cur->tree_height == level) {
- /* root split, and search the new root */
- btr_cur_search_to_nth_level(
- index, level, tuple, PAGE_CUR_RTREE_LOCATE,
- BTR_CONT_MODIFY_TREE, btr_cur, __FILE__, __LINE__, mtr);
-
+ if (rtr_search_to_nth_level(level, tuple, PAGE_CUR_RTREE_LOCATE,
+ BTR_CONT_MODIFY_TREE, btr_cur, mtr)
+ != DB_SUCCESS) {
+ } else if (sea_cur && sea_cur->tree_height == level) {
+ rec = btr_cur_get_rec(btr_cur);
} else {
/* btr_validate */
ut_ad(level >= 1);
ut_ad(!sea_cur);
- btr_cur_search_to_nth_level(
- index, level, tuple, PAGE_CUR_RTREE_LOCATE,
- BTR_CONT_MODIFY_TREE, btr_cur, __FILE__, __LINE__, mtr);
-
rec = btr_cur_get_rec(btr_cur);
- n_fields = dtuple_get_n_fields_cmp(tuple);
+ const ulint n_fields = dtuple_get_n_fields_cmp(tuple);
if (page_rec_is_infimum(rec)
|| (btr_cur->low_match != n_fields)) {
- ret = rtr_pcur_getnext_from_path(
- tuple, PAGE_CUR_RTREE_LOCATE, btr_cur,
- level, BTR_CONT_MODIFY_TREE,
- true, mtr);
-
- ut_ad(ret && btr_cur->low_match == n_fields);
+ if (!rtr_pcur_getnext_from_path(
+ tuple, PAGE_CUR_RTREE_LOCATE, btr_cur,
+ level, BTR_CONT_MODIFY_TREE, true, mtr)) {
+ rec = nullptr;
+ } else {
+ ut_ad(btr_cur->low_match == n_fields);
+ rec = btr_cur_get_rec(btr_cur);
+ }
}
}
- ret = rtr_compare_cursor_rec(
- index, btr_cur, page_no, &heap);
-
- ut_ad(ret);
-
func_exit:
- if (heap) {
- mem_heap_free(heap);
- }
+ ut_d(rtr_compare_cursor_rec(rec, index, page_no));
- if (new_rtr && btr_cur->rtr_info) {
+ if (!had_rtr && btr_cur->rtr_info) {
rtr_clean_rtr_info(btr_cur->rtr_info, true);
btr_cur->rtr_info = NULL;
}
+
+ return rec;
}
/** Returns the upper level node pointer to a R-Tree page. It is assumed
@@ -807,8 +1215,6 @@ rtr_page_get_father_node_ptr(
mtr_t* mtr) /*!< in: mtr */
{
dtuple_t* tuple;
- rec_t* user_rec;
- rec_t* node_ptr;
ulint level;
ulint page_no;
dict_index_t* index;
@@ -817,15 +1223,14 @@ rtr_page_get_father_node_ptr(
page_no = btr_cur_get_block(cursor)->page.id().page_no();
index = btr_cur_get_index(cursor);
- ut_ad(srv_read_only_mode
- || mtr->memo_contains_flagged(&index->lock, MTR_MEMO_X_LOCK
- | MTR_MEMO_SX_LOCK));
+ ut_ad(mtr->memo_contains_flagged(&index->lock, MTR_MEMO_X_LOCK
+ | MTR_MEMO_SX_LOCK));
ut_ad(dict_index_get_page(index) != page_no);
level = btr_page_get_level(btr_cur_get_page(cursor));
- user_rec = btr_cur_get_rec(cursor);
+ const rec_t* user_rec = btr_cur_get_rec(cursor);
ut_a(page_rec_is_user_rec(user_rec));
offsets = rec_get_offsets(user_rec, index, offsets,
@@ -840,50 +1245,20 @@ rtr_page_get_father_node_ptr(
sea_cur = NULL;
}
- rtr_get_father_node(index, level + 1, tuple, sea_cur, cursor,
- page_no, mtr);
+ const rec_t* node_ptr = rtr_get_father_node(level + 1, tuple,
+ sea_cur, cursor,
+ page_no, mtr);
+ if (!node_ptr) {
+ return nullptr;
+ }
- node_ptr = btr_cur_get_rec(cursor);
ut_ad(!page_rec_is_comp(node_ptr)
|| rec_get_status(node_ptr) == REC_STATUS_NODE_PTR);
offsets = rec_get_offsets(node_ptr, index, offsets, 0,
ULINT_UNDEFINED, &heap);
- ulint child_page = btr_node_ptr_get_child_page_no(node_ptr, offsets);
-
- if (child_page != page_no) {
- const rec_t* print_rec;
-
- ib::fatal error;
-
- error << "Corruption of index " << index->name
- << " of table " << index->table->name
- << " parent page " << page_no
- << " child page " << child_page;
-
- print_rec = page_rec_get_next(
- page_get_infimum_rec(page_align(user_rec)));
- offsets = rec_get_offsets(print_rec, index, offsets,
- page_rec_is_leaf(user_rec)
- ? index->n_fields : 0,
- ULINT_UNDEFINED, &heap);
- error << "; child ";
- rec_print(error.m_oss, print_rec,
- rec_get_info_bits(print_rec, rec_offs_comp(offsets)),
- offsets);
- offsets = rec_get_offsets(node_ptr, index, offsets, 0,
- ULINT_UNDEFINED, &heap);
- error << "; parent ";
- rec_print(error.m_oss, print_rec,
- rec_get_info_bits(print_rec, rec_offs_comp(offsets)),
- offsets);
-
- error << ". You should dump + drop + reimport the table to"
- " fix the corruption. If the crash happens at"
- " database startup, see "
- "https://mariadb.com/kb/en/library/innodb-recovery-modes/"
- " about forcing"
- " recovery. Then dump + drop + reimport.";
+ if (btr_node_ptr_get_child_page_no(node_ptr, offsets) != page_no) {
+ offsets = nullptr;
}
return(offsets);
@@ -898,20 +1273,18 @@ rtr_page_get_father_block(
/*======================*/
rec_offs* offsets,/*!< in: work area for the return value */
mem_heap_t* heap, /*!< in: memory heap to use */
- dict_index_t* index, /*!< in: b-tree index */
- buf_block_t* block, /*!< in: child page in the index */
mtr_t* mtr, /*!< in: mtr */
btr_cur_t* sea_cur,/*!< in: search cursor, contains information
about parent nodes in search */
btr_cur_t* cursor) /*!< out: cursor on node pointer record,
its page x-latched */
{
- rec_t* rec = page_rec_get_next(
- page_get_infimum_rec(buf_block_get_frame(block)));
- btr_cur_position(index, rec, block, cursor);
-
- return(rtr_page_get_father_node_ptr(offsets, heap, sea_cur,
- cursor, mtr));
+ rec_t *rec=
+ page_rec_get_next(page_get_infimum_rec(cursor->block()->page.frame));
+ if (!rec)
+ return nullptr;
+ cursor->page_cur.rec= rec;
+ return rtr_page_get_father_node_ptr(offsets, heap, sea_cur, cursor, mtr);
}
/*******************************************************************//**
@@ -929,7 +1302,7 @@ rtr_create_rtr_info(
{
rtr_info_t* rtr_info;
- index = index ? index : cursor->index;
+ index = index ? index : cursor->index();
ut_ad(index);
rtr_info = static_cast<rtr_info_t*>(ut_zalloc_nokey(sizeof(*rtr_info)));
@@ -950,22 +1323,21 @@ rtr_create_rtr_info(
rtr_info->matches->bufp = page_align(rtr_info->matches->rec_buf
+ UNIV_PAGE_SIZE_MAX + 1);
- mutex_create(LATCH_ID_RTR_MATCH_MUTEX,
- &rtr_info->matches->rtr_match_mutex);
- rw_lock_create(PFS_NOT_INSTRUMENTED,
- &(rtr_info->matches->block.lock),
- SYNC_LEVEL_VARYING);
+ mysql_mutex_init(rtr_match_mutex_key,
+ &rtr_info->matches->rtr_match_mutex,
+ nullptr);
+ rtr_info->matches->block.page.lock.init();
}
rtr_info->path = UT_NEW_NOKEY(rtr_node_path_t());
rtr_info->parent_path = UT_NEW_NOKEY(rtr_node_path_t());
rtr_info->need_prdt_lock = need_prdt;
- mutex_create(LATCH_ID_RTR_PATH_MUTEX,
- &rtr_info->rtr_path_mutex);
+ mysql_mutex_init(rtr_path_mutex_key, &rtr_info->rtr_path_mutex,
+ nullptr);
- mutex_enter(&index->rtr_track->rtr_active_mutex);
+ mysql_mutex_lock(&index->rtr_track->rtr_active_mutex);
index->rtr_track->rtr_active.push_front(rtr_info);
- mutex_exit(&index->rtr_track->rtr_active_mutex);
+ mysql_mutex_unlock(&index->rtr_track->rtr_active_mutex);
return(rtr_info);
}
@@ -1000,32 +1372,10 @@ rtr_init_rtr_info(
if (!reinit) {
/* Reset all members. */
- rtr_info->path = NULL;
- rtr_info->parent_path = NULL;
- rtr_info->matches = NULL;
-
- mutex_create(LATCH_ID_RTR_PATH_MUTEX,
- &rtr_info->rtr_path_mutex);
-
- memset(rtr_info->tree_blocks, 0x0,
- sizeof(rtr_info->tree_blocks));
- memset(rtr_info->tree_savepoints, 0x0,
- sizeof(rtr_info->tree_savepoints));
- rtr_info->mbr.xmin = 0.0;
- rtr_info->mbr.xmax = 0.0;
- rtr_info->mbr.ymin = 0.0;
- rtr_info->mbr.ymax = 0.0;
- rtr_info->thr = NULL;
- rtr_info->heap = NULL;
- rtr_info->cursor = NULL;
- rtr_info->index = NULL;
- rtr_info->need_prdt_lock = false;
- rtr_info->need_page_lock = false;
- rtr_info->allocated = false;
- rtr_info->mbr_adj = false;
- rtr_info->fd_del = false;
- rtr_info->search_tuple = NULL;
- rtr_info->search_mode = PAGE_CUR_UNSUPP;
+ memset(rtr_info, 0, sizeof *rtr_info);
+ static_assert(PAGE_CUR_UNSUPP == 0, "compatibility");
+ mysql_mutex_init(rtr_path_mutex_key, &rtr_info->rtr_path_mutex,
+ nullptr);
}
ut_ad(!rtr_info->matches || rtr_info->matches->matched_recs->empty());
@@ -1036,9 +1386,9 @@ rtr_init_rtr_info(
rtr_info->cursor = cursor;
rtr_info->index = index;
- mutex_enter(&index->rtr_track->rtr_active_mutex);
+ mysql_mutex_lock(&index->rtr_track->rtr_active_mutex);
index->rtr_track->rtr_active.push_front(rtr_info);
- mutex_exit(&index->rtr_track->rtr_active_mutex);
+ mysql_mutex_unlock(&index->rtr_track->rtr_active_mutex);
}
/**************************************************************//**
@@ -1059,7 +1409,7 @@ rtr_clean_rtr_info(
index = rtr_info->index;
if (index) {
- mutex_enter(&index->rtr_track->rtr_active_mutex);
+ mysql_mutex_lock(&index->rtr_track->rtr_active_mutex);
}
while (rtr_info->parent_path && !rtr_info->parent_path->empty()) {
@@ -1090,7 +1440,7 @@ rtr_clean_rtr_info(
if (index) {
index->rtr_track->rtr_active.remove(rtr_info);
- mutex_exit(&index->rtr_track->rtr_active_mutex);
+ mysql_mutex_unlock(&index->rtr_track->rtr_active_mutex);
}
if (free_all) {
@@ -1099,9 +1449,10 @@ rtr_clean_rtr_info(
UT_DELETE(rtr_info->matches->matched_recs);
}
- rw_lock_free(&(rtr_info->matches->block.lock));
+ rtr_info->matches->block.page.lock.free();
- mutex_destroy(&rtr_info->matches->rtr_match_mutex);
+ mysql_mutex_destroy(
+ &rtr_info->matches->rtr_match_mutex);
}
if (rtr_info->heap) {
@@ -1109,7 +1460,7 @@ rtr_clean_rtr_info(
}
if (initialized) {
- mutex_destroy(&rtr_info->rtr_path_mutex);
+ mysql_mutex_destroy(&rtr_info->rtr_path_mutex);
}
if (rtr_info->allocated) {
@@ -1194,46 +1545,39 @@ rtr_check_discard_page(
the root page */
buf_block_t* block) /*!< in: block of page to be discarded */
{
- const ulint pageno = block->page.id().page_no();
+ const page_id_t id{block->page.id()};
- mutex_enter(&index->rtr_track->rtr_active_mutex);
+ mysql_mutex_lock(&index->rtr_track->rtr_active_mutex);
for (const auto& rtr_info : index->rtr_track->rtr_active) {
if (cursor && rtr_info == cursor->rtr_info) {
continue;
}
- mutex_enter(&rtr_info->rtr_path_mutex);
+ mysql_mutex_lock(&rtr_info->rtr_path_mutex);
for (const node_visit_t& node : *rtr_info->path) {
- if (node.page_no == pageno) {
- rtr_rebuild_path(rtr_info, pageno);
+ if (node.page_no == id.page_no()) {
+ rtr_rebuild_path(rtr_info, node.page_no);
break;
}
}
- mutex_exit(&rtr_info->rtr_path_mutex);
+ mysql_mutex_unlock(&rtr_info->rtr_path_mutex);
- if (rtr_info->matches) {
- mutex_enter(&rtr_info->matches->rtr_match_mutex);
+ if (auto matches = rtr_info->matches) {
+ mysql_mutex_lock(&matches->rtr_match_mutex);
- if ((&rtr_info->matches->block)->page.id().page_no()
- == pageno) {
- if (!rtr_info->matches->matched_recs->empty()) {
- rtr_info->matches->matched_recs->clear();
- }
- ut_ad(rtr_info->matches->matched_recs->empty());
- rtr_info->matches->valid = false;
+ if (matches->block.page.id() == id) {
+ matches->matched_recs->clear();
+ matches->valid = false;
}
- mutex_exit(&rtr_info->matches->rtr_match_mutex);
+ mysql_mutex_unlock(&matches->rtr_match_mutex);
}
}
- mutex_exit(&index->rtr_track->rtr_active_mutex);
+ mysql_mutex_unlock(&index->rtr_track->rtr_active_mutex);
- lock_mutex_enter();
- lock_prdt_page_free_from_discard(block, &lock_sys.prdt_hash);
- lock_prdt_page_free_from_discard(block, &lock_sys.prdt_page_hash);
- lock_mutex_exit();
+ lock_sys.prdt_page_free_from_discard(id, true);
}
/** Structure acts as functor to get the optimistic access of the page.
@@ -1249,8 +1593,7 @@ struct optimistic_get
bool operator()(buf_block_t *hint) const
{
return hint && buf_page_optimistic_get(
- RW_X_LATCH, hint, r_cursor->modify_clock, __FILE__,
- __LINE__, mtr);
+ RW_X_LATCH, hint, r_cursor->modify_clock, mtr);
}
};
@@ -1258,7 +1601,6 @@ struct optimistic_get
static
bool
rtr_cur_restore_position(
- ulint latch_mode, /*!< in: BTR_SEARCH_LEAF, ... */
btr_cur_t* btr_cur, /*!< in: detached persistent cursor */
ulint level, /*!< in: index level */
mtr_t* mtr) /*!< in: mtr */
@@ -1274,6 +1616,7 @@ rtr_cur_restore_position(
ut_ad(mtr->is_active());
index = btr_cur_get_index(btr_cur);
+ ut_ad(r_cursor->index() == btr_cur->index());
if (r_cursor->rel_pos == BTR_PCUR_AFTER_LAST_IN_TREE
|| r_cursor->rel_pos == BTR_PCUR_BEFORE_FIRST_IN_TREE) {
@@ -1285,8 +1628,6 @@ rtr_cur_restore_position(
r_cursor->modify_clock = 100;
);
- ut_ad(latch_mode == BTR_CONT_MODIFY_TREE);
-
if (r_cursor->block_when_stored.run_with_hint(
optimistic_get(r_cursor, mtr))) {
ut_ad(r_cursor->pos_state == BTR_PCUR_IS_POSITIONED);
@@ -1333,7 +1674,6 @@ rtr_cur_restore_position(
/* Page has changed, for R-Tree, the page cannot be shrunk away,
so we search the page and its right siblings */
- buf_block_t* block;
node_seq_t page_ssn;
const page_t* page;
page_cur_t* page_cursor;
@@ -1351,21 +1691,27 @@ rtr_cur_restore_position(
ut_ad(r_cursor == node->cursor);
search_again:
- dberr_t err = DB_SUCCESS;
+ ulint up_match = 0, low_match = 0;
- block = buf_page_get_gen(
+ page_cursor->block = buf_page_get_gen(
page_id_t(index->table->space_id, page_no),
- zip_size, RW_X_LATCH, NULL,
- BUF_GET, __FILE__, __LINE__, mtr, &err);
+ zip_size, RW_X_LATCH, NULL, BUF_GET, mtr);
- ut_ad(block);
+ if (!page_cursor->block) {
+corrupted:
+ ret = false;
+ goto func_exit;
+ }
/* Get the page SSN */
- page = buf_block_get_frame(block);
+ page = buf_block_get_frame(page_cursor->block);
page_ssn = page_get_ssn_id(page);
- ulint low_match = page_cur_search(
- block, index, tuple, PAGE_CUR_LE, page_cursor);
+ if (page_cur_search_with_match(tuple, PAGE_CUR_LE,
+ &up_match, &low_match, page_cursor,
+ nullptr)) {
+ goto corrupted;
+ }
if (low_match == r_cursor->old_n_fields) {
const rec_t* rec;
@@ -1403,6 +1749,7 @@ search_again:
goto search_again;
}
+func_exit:
mem_heap_free(heap);
return(ret);
@@ -1425,7 +1772,7 @@ rtr_leaf_push_match_rec(
ulint data_len;
rtr_rec_t rtr_rec;
- buf = match_rec->block.frame + match_rec->used;
+ buf = match_rec->block.page.frame + match_rec->used;
ut_ad(page_rec_is_leaf(rec));
copy = rec_copy(buf, rec, offsets);
@@ -1456,7 +1803,7 @@ rtr_store_parent_path(
/*==================*/
const buf_block_t* block, /*!< in: block of the page */
btr_cur_t* btr_cur,/*!< in/out: persistent cursor */
- ulint latch_mode,
+ btr_latch_mode latch_mode,
/*!< in: latch_mode */
ulint level, /*!< in: index level */
mtr_t* mtr) /*!< in: mtr */
@@ -1515,14 +1862,14 @@ rtr_non_leaf_insert_stack_push(
page_cur_position(rec, block, btr_pcur_get_page_cur(my_cursor));
- (btr_pcur_get_btr_cur(my_cursor))->index = index;
+ btr_pcur_get_page_cur(my_cursor)->index = index;
new_seq = rtr_get_current_ssn_id(index);
rtr_non_leaf_stack_push(path, block->page.id().page_no(),
new_seq, level, child_no, my_cursor, mbr_inc);
}
-/** Copy a buf_block_t, except "block->lock".
+/** Copy a buf_block_t, except "block->page.lock".
@param[in,out] matches copy to match->block
@param[in] block block to copy */
static
@@ -1536,8 +1883,9 @@ rtr_copy_buf(
from the dummy buf_block_t we create here and because memcpy()ing
it generates (valid) compiler warnings that the vtable pointer
will be copied. */
+ matches->block.page.lock.free();
new (&matches->block.page) buf_page_t(block->page);
- matches->block.frame = block->frame;
+ matches->block.page.frame = block->page.frame;
matches->block.unzip_LRU = block->unzip_LRU;
ut_d(matches->block.in_unzip_LRU_list = block->in_unzip_LRU_list);
@@ -1556,7 +1904,6 @@ rtr_copy_buf(
matches->block.curr_left_side = block->curr_left_side;
matches->block.index = block->index;
#endif /* BTR_CUR_HASH_ADAPT */
- ut_d(matches->block.debug_latch = NULL);
}
/****************************************************************//**
@@ -1573,13 +1920,12 @@ rtr_init_match(
ut_ad(matches->matched_recs->empty());
matches->locked = false;
rtr_copy_buf(matches, block);
- matches->block.frame = matches->bufp;
+ matches->block.page.frame = matches->bufp;
matches->valid = false;
- /* We have to copy PAGE_W*_SUPREMUM_END bytes so that we can
+ /* We have to copy PAGE_*_SUPREMUM_END bytes so that we can
use infimum/supremum of this page as normal btr page for search. */
- memcpy(matches->block.frame, page, page_is_comp(page)
- ? PAGE_NEW_SUPREMUM_END
- : PAGE_OLD_SUPREMUM_END);
+ memcpy(matches->block.page.frame, page, page_is_comp(page)
+ ? PAGE_NEW_SUPREMUM_END : PAGE_OLD_SUPREMUM_END);
matches->used = page_is_comp(page)
? PAGE_NEW_SUPREMUM_END
: PAGE_OLD_SUPREMUM_END;
@@ -1724,13 +2070,20 @@ rtr_cur_search_with_match(
mode = PAGE_CUR_WITHIN;
}
- rec = page_dir_slot_get_rec(page_dir_get_nth_slot(page, 0));
+ rec = page_dir_slot_get_rec_validate(page_dir_get_nth_slot(page, 0));
+
+ if (UNIV_UNLIKELY(!rec)) {
+ return false;
+ }
last_rec = rec;
best_rec = rec;
if (page_rec_is_infimum(rec)) {
rec = page_rec_get_next_const(rec);
+ if (UNIV_UNLIKELY(!rec)) {
+ return false;
+ }
}
/* Check insert tuple size is larger than first rec, and try to
@@ -1928,7 +2281,7 @@ rtr_cur_search_with_match(
}
/* All records on page are searched */
- if (page_rec_is_supremum(rec)) {
+ if (rec && page_rec_is_supremum(rec)) {
if (!n_core) {
if (!found) {
/* No match case, if it is for insertion,
diff --git a/storage/innobase/handler/ha_innodb.cc b/storage/innobase/handler/ha_innodb.cc
index f7dd18e0e36..66125ac5d52 100644
--- a/storage/innobase/handler/ha_innodb.cc
+++ b/storage/innobase/handler/ha_innodb.cc
@@ -63,6 +63,7 @@ this program; if not, write to the Free Software Foundation, Inc.,
#include <my_service_manager.h>
#include <key.h>
+#include <sql_manager.h>
/* Include necessary InnoDB headers */
#include "btr0btr.h"
@@ -79,7 +80,6 @@ this program; if not, write to the Free Software Foundation, Inc.,
#include "btr0defragment.h"
#include "dict0crea.h"
#include "dict0dict.h"
-#include "dict0priv.h"
#include "dict0stats.h"
#include "dict0stats_bg.h"
#include "fil0fil.h"
@@ -94,7 +94,6 @@ this program; if not, write to the Free Software Foundation, Inc.,
#include "mtr0mtr.h"
#include "os0file.h"
#include "page0zip.h"
-#include "pars0pars.h"
#include "rem0types.h"
#include "row0import.h"
#include "row0ins.h"
@@ -113,7 +112,6 @@ this program; if not, write to the Free Software Foundation, Inc.,
#include "trx0trx.h"
#include "fil0pagecompress.h"
#include "ut0mem.h"
-#include "ut0mutex.h"
#include "row0ext.h"
#include <limits>
@@ -135,7 +133,6 @@ void close_thread_tables(THD* thd);
#ifdef MYSQL_DYNAMIC_PLUGIN
#define tc_size 400
-#define tdc_size 400
#endif
#include <mysql/plugin.h>
@@ -143,17 +140,20 @@ void close_thread_tables(THD* thd);
#include "ha_innodb.h"
#include "i_s.h"
-#include "sync0sync.h"
#include <string>
#include <sstream>
#ifdef WITH_WSREP
-#include "dict0priv.h"
#include <mysql/service_md5.h>
#include "wsrep_sst.h"
#endif /* WITH_WSREP */
+#ifdef HAVE_URING
+/** The Linux kernel version if io_uring() is considered unsafe */
+const char *io_uring_may_be_unsafe;
+#endif
+
#define INSIDE_HA_INNOBASE_CC
#define EQ_CURRENT_THD(thd) ((thd) == current_thd)
@@ -167,7 +167,7 @@ static const long AUTOINC_NO_LOCKING = 2;
static ulong innobase_open_files;
static long innobase_autoinc_lock_mode;
-static ulonglong innobase_buffer_pool_size;
+ulonglong innobase_buffer_pool_size;
/** Percentage of the buffer pool to reserve for 'old' blocks.
Connected to buf_LRU_old_ratio. */
@@ -185,13 +185,11 @@ static char* innobase_disable_monitor_counter;
static char* innobase_reset_monitor_counter;
static char* innobase_reset_all_monitor_counter;
-static ulong innodb_flush_method;
-
/* This variable can be set in the server configure file, specifying
stopword table to be used */
static char* innobase_server_stopword_table;
-static my_bool innobase_rollback_on_timeout;
+my_bool innobase_rollback_on_timeout;
static my_bool innobase_create_status_file;
my_bool innobase_stats_on_metadata;
static my_bool innodb_optimize_fulltext_only;
@@ -216,23 +214,34 @@ enum default_row_format_enum {
DEFAULT_ROW_FORMAT_DYNAMIC = 2,
};
+/** Whether ROW_FORMAT=COMPRESSED tables are read-only */
+static my_bool innodb_read_only_compressed;
+
/** A dummy variable */
static uint innodb_max_purge_lag_wait;
-/** Wait for trx_sys_t::rseg_history_len to be below a limit. */
+/** Wait for trx_sys.history_size() to be below a limit. */
static void innodb_max_purge_lag_wait_update(THD *thd, st_mysql_sys_var *,
void *, const void *limit)
{
const uint l= *static_cast<const uint*>(limit);
- if (trx_sys.rseg_history_len <= l)
+ if (!trx_sys.history_exceeds(l))
return;
mysql_mutex_unlock(&LOCK_global_system_variables);
- while (trx_sys.rseg_history_len > l)
+ while (trx_sys.history_exceeds(l))
{
if (thd_kill_level(thd))
break;
+ /* Adjust for purge_coordinator_state::refresh() */
+ mysql_mutex_lock(&log_sys.mutex);
+ const lsn_t last= log_sys.last_checkpoint_lsn,
+ max_age= log_sys.max_checkpoint_age;
+ mysql_mutex_unlock(&log_sys.mutex);
+ const lsn_t lsn= log_sys.get_lsn();
+ if ((lsn - last) / 4 >= max_age / 5)
+ buf_flush_ahead(last + max_age / 5, false);
srv_wake_purge_thread_if_not_active();
- os_thread_sleep(100000);
+ std::this_thread::sleep_for(std::chrono::milliseconds(100));
}
mysql_mutex_lock(&LOCK_global_system_variables);
}
@@ -305,10 +314,6 @@ static TYPELIB innodb_stats_method_typelib = {
const char* innodb_checksum_algorithm_names[] = {
"crc32",
"strict_crc32",
- "innodb",
- "strict_innodb",
- "none",
- "strict_none",
"full_crc32",
"strict_full_crc32",
NullS
@@ -340,22 +345,6 @@ static TYPELIB innodb_default_row_format_typelib = {
NULL
};
-/** Possible values of the parameter innodb_lock_schedule_algorithm */
-static const char* innodb_lock_schedule_algorithm_names[] = {
- "fcfs",
- "vats",
- NullS
-};
-
-/** Used to define an enumerate type of the system variable
-innodb_lock_schedule_algorithm. */
-static TYPELIB innodb_lock_schedule_algorithm_typelib = {
- array_elements(innodb_lock_schedule_algorithm_names) - 1,
- "innodb_lock_schedule_algorithm_typelib",
- innodb_lock_schedule_algorithm_names,
- NULL
-};
-
/** Names of allowed values of innodb_flush_method */
const char* innodb_flush_method_names[] = {
"fsync",
@@ -380,6 +369,26 @@ TYPELIB innodb_flush_method_typelib = {
NULL
};
+/** Names of allowed values of innodb_deadlock_report */
+static const char *innodb_deadlock_report_names[]= {
+ "off", /* Do not report any details of deadlocks */
+ "basic", /* Report waiting transactions and lock requests */
+ "full", /* Also report blocking locks */
+ NullS
+};
+
+static_assert(Deadlock::REPORT_OFF == 0, "compatibility");
+static_assert(Deadlock::REPORT_BASIC == 1, "compatibility");
+static_assert(Deadlock::REPORT_FULL == 2, "compatibility");
+
+/** Enumeration of innodb_deadlock_report */
+static TYPELIB innodb_deadlock_report_typelib = {
+ array_elements(innodb_deadlock_report_names) - 1,
+ "innodb_deadlock_report_typelib",
+ innodb_deadlock_report_names,
+ NULL
+};
+
/** Allowed values of innodb_change_buffering */
static const char* innodb_change_buffering_names[] = {
"none", /* IBUF_USE_NONE */
@@ -499,14 +508,42 @@ const struct _ft_vft_ext ft_vft_ext_result = {innobase_fts_get_version,
#ifdef HAVE_PSI_INTERFACE
# define PSI_KEY(n) {&n##_key, #n, 0}
-/* All RWLOCK used in Innodb are SX-locks */
-# define PSI_RWLOCK_KEY(n) {&n##_key, #n, PSI_RWLOCK_FLAG_SX}
-
/* Keys to register pthread mutexes in the current file with
performance schema */
static mysql_pfs_key_t pending_checkpoint_mutex_key;
# ifdef UNIV_PFS_MUTEX
+mysql_pfs_key_t buf_pool_mutex_key;
+mysql_pfs_key_t dict_foreign_err_mutex_key;
+mysql_pfs_key_t fil_system_mutex_key;
+mysql_pfs_key_t flush_list_mutex_key;
+mysql_pfs_key_t fts_cache_mutex_key;
+mysql_pfs_key_t fts_cache_init_mutex_key;
+mysql_pfs_key_t fts_delete_mutex_key;
+mysql_pfs_key_t fts_doc_id_mutex_key;
+mysql_pfs_key_t ibuf_bitmap_mutex_key;
+mysql_pfs_key_t ibuf_mutex_key;
+mysql_pfs_key_t ibuf_pessimistic_insert_mutex_key;
+mysql_pfs_key_t log_sys_mutex_key;
+mysql_pfs_key_t log_flush_order_mutex_key;
+mysql_pfs_key_t recalc_pool_mutex_key;
+mysql_pfs_key_t purge_sys_pq_mutex_key;
+mysql_pfs_key_t recv_sys_mutex_key;
+mysql_pfs_key_t page_zip_stat_per_index_mutex_key;
+mysql_pfs_key_t rtr_active_mutex_key;
+mysql_pfs_key_t rtr_match_mutex_key;
+mysql_pfs_key_t rtr_path_mutex_key;
+mysql_pfs_key_t srv_innodb_monitor_mutex_key;
+mysql_pfs_key_t srv_misc_tmpfile_mutex_key;
+mysql_pfs_key_t srv_monitor_file_mutex_key;
+mysql_pfs_key_t buf_dblwr_mutex_key;
+mysql_pfs_key_t trx_pool_mutex_key;
+mysql_pfs_key_t trx_pool_manager_mutex_key;
+mysql_pfs_key_t lock_wait_mutex_key;
+mysql_pfs_key_t trx_sys_mutex_key;
+mysql_pfs_key_t srv_threads_mutex_key;
+mysql_pfs_key_t tpool_cache_mutex_key;
+
/* all_innodb_mutexes array contains mutexes that are
performance schema instrumented if "UNIV_PFS_MUTEX"
is defined */
@@ -514,10 +551,11 @@ static PSI_mutex_info all_innodb_mutexes[] = {
PSI_KEY(pending_checkpoint_mutex),
PSI_KEY(buf_pool_mutex),
PSI_KEY(dict_foreign_err_mutex),
- PSI_KEY(dict_sys_mutex),
PSI_KEY(recalc_pool_mutex),
PSI_KEY(fil_system_mutex),
PSI_KEY(flush_list_mutex),
+ PSI_KEY(fts_cache_mutex),
+ PSI_KEY(fts_cache_init_mutex),
PSI_KEY(fts_delete_mutex),
PSI_KEY(fts_doc_id_mutex),
PSI_KEY(log_flush_order_mutex),
@@ -528,42 +566,47 @@ static PSI_mutex_info all_innodb_mutexes[] = {
PSI_KEY(page_zip_stat_per_index_mutex),
PSI_KEY(purge_sys_pq_mutex),
PSI_KEY(recv_sys_mutex),
- PSI_KEY(redo_rseg_mutex),
- PSI_KEY(noredo_rseg_mutex),
-# ifdef UNIV_DEBUG
- PSI_KEY(rw_lock_debug_mutex),
-# endif /* UNIV_DEBUG */
- PSI_KEY(rw_lock_list_mutex),
PSI_KEY(srv_innodb_monitor_mutex),
PSI_KEY(srv_misc_tmpfile_mutex),
PSI_KEY(srv_monitor_file_mutex),
PSI_KEY(buf_dblwr_mutex),
PSI_KEY(trx_pool_mutex),
PSI_KEY(trx_pool_manager_mutex),
- PSI_KEY(lock_mutex),
PSI_KEY(lock_wait_mutex),
- PSI_KEY(trx_mutex),
PSI_KEY(srv_threads_mutex),
PSI_KEY(rtr_active_mutex),
PSI_KEY(rtr_match_mutex),
PSI_KEY(rtr_path_mutex),
PSI_KEY(trx_sys_mutex),
+ PSI_KEY(tpool_cache_mutex),
};
# endif /* UNIV_PFS_MUTEX */
# ifdef UNIV_PFS_RWLOCK
+mysql_pfs_key_t dict_operation_lock_key;
+mysql_pfs_key_t index_tree_rw_lock_key;
+mysql_pfs_key_t index_online_log_key;
+mysql_pfs_key_t fil_space_latch_key;
+mysql_pfs_key_t trx_i_s_cache_lock_key;
+mysql_pfs_key_t trx_purge_latch_key;
+mysql_pfs_key_t lock_latch_key;
+mysql_pfs_key_t trx_rseg_latch_key;
+
/* all_innodb_rwlocks array contains rwlocks that are
performance schema instrumented if "UNIV_PFS_RWLOCK"
is defined */
-static PSI_rwlock_info all_innodb_rwlocks[] = {
- PSI_RWLOCK_KEY(btr_search_latch),
- PSI_RWLOCK_KEY(dict_operation_lock),
- PSI_RWLOCK_KEY(fil_space_latch),
- PSI_RWLOCK_KEY(fts_cache_rw_lock),
- PSI_RWLOCK_KEY(fts_cache_init_rw_lock),
- PSI_RWLOCK_KEY(trx_i_s_cache_lock),
- PSI_RWLOCK_KEY(trx_purge_latch),
- PSI_RWLOCK_KEY(index_tree_rw_lock),
+static PSI_rwlock_info all_innodb_rwlocks[] =
+{
+# ifdef BTR_CUR_HASH_ADAPT
+ { &btr_search_latch_key, "btr_search_latch", 0 },
+# endif
+ { &dict_operation_lock_key, "dict_operation_lock", 0 },
+ { &fil_space_latch_key, "fil_space_latch", 0 },
+ { &trx_i_s_cache_lock_key, "trx_i_s_cache_lock", 0 },
+ { &trx_purge_latch_key, "trx_purge_latch", 0 },
+ { &lock_latch_key, "lock_latch", 0 },
+ { &trx_rseg_latch_key, "trx_rseg_latch", 0 },
+ { &index_tree_rw_lock_key, "index_tree_rw_lock", PSI_RWLOCK_FLAG_SX }
};
# endif /* UNIV_PFS_RWLOCK */
@@ -722,7 +765,6 @@ innodb_tmpdir_validate(
return(1);
}
- os_normalize_path(alter_tmp_dir);
my_realpath(tmp_abs_path, alter_tmp_dir, 0);
size_t tmp_abs_len = strlen(tmp_abs_path);
@@ -846,9 +888,9 @@ static MYSQL_THDVAR_BOOL(ft_enable_stopword, PLUGIN_VAR_OPCMDARG,
NULL, NULL,
/* default */ TRUE);
-static MYSQL_THDVAR_ULONG(lock_wait_timeout, PLUGIN_VAR_RQCMDARG,
- "Timeout in seconds an InnoDB transaction may wait for a lock before being rolled back. Values above 100000000 disable the timeout.",
- NULL, NULL, 50, 0, 1024 * 1024 * 1024, 0);
+static MYSQL_THDVAR_UINT(lock_wait_timeout, PLUGIN_VAR_RQCMDARG,
+ "Timeout in seconds an InnoDB transaction may wait for a lock before being rolled back. The value 100000000 is infinite timeout.",
+ NULL, NULL, 50, 0, 100000000, 0);
static MYSQL_THDVAR_STR(ft_user_stopword_table,
PLUGIN_VAR_OPCMDARG|PLUGIN_VAR_MEMALLOC,
@@ -862,8 +904,9 @@ static MYSQL_THDVAR_STR(tmpdir,
static SHOW_VAR innodb_status_variables[]= {
#ifdef BTR_CUR_HASH_ADAPT
- {"adaptive_hash_hash_searches", &btr_cur_n_sea, SHOW_SIZE_T},
- {"adaptive_hash_non_hash_searches", &btr_cur_n_non_sea, SHOW_SIZE_T},
+ {"adaptive_hash_hash_searches", &export_vars.innodb_ahi_hit, SHOW_SIZE_T},
+ {"adaptive_hash_non_hash_searches",
+ &export_vars.innodb_ahi_miss, SHOW_SIZE_T},
#endif
{"background_log_sync", &srv_log_writes_and_flush, SHOW_SIZE_T},
{"buffer_pool_dump_status",
@@ -874,42 +917,37 @@ static SHOW_VAR innodb_status_variables[]= {
(char*) &export_vars.innodb_buffer_pool_resize_status, SHOW_CHAR},
{"buffer_pool_load_incomplete",
&export_vars.innodb_buffer_pool_load_incomplete, SHOW_BOOL},
- {"buffer_pool_pages_data",
- &export_vars.innodb_buffer_pool_pages_data, SHOW_SIZE_T},
+ {"buffer_pool_pages_data", &UT_LIST_GET_LEN(buf_pool.LRU), SHOW_SIZE_T},
{"buffer_pool_bytes_data",
&export_vars.innodb_buffer_pool_bytes_data, SHOW_SIZE_T},
{"buffer_pool_pages_dirty",
- &export_vars.innodb_buffer_pool_pages_dirty, SHOW_SIZE_T},
- {"buffer_pool_bytes_dirty",
- &export_vars.innodb_buffer_pool_bytes_dirty, SHOW_SIZE_T},
- {"buffer_pool_pages_flushed", &buf_flush_page_count, SHOW_SIZE_T},
- {"buffer_pool_pages_free",
- &export_vars.innodb_buffer_pool_pages_free, SHOW_SIZE_T},
+ &UT_LIST_GET_LEN(buf_pool.flush_list), SHOW_SIZE_T},
+ {"buffer_pool_bytes_dirty", &buf_pool.flush_list_bytes, SHOW_SIZE_T},
+ {"buffer_pool_pages_flushed", &buf_pool.stat.n_pages_written, SHOW_SIZE_T},
+ {"buffer_pool_pages_free", &UT_LIST_GET_LEN(buf_pool.free), SHOW_SIZE_T},
#ifdef UNIV_DEBUG
{"buffer_pool_pages_latched",
&export_vars.innodb_buffer_pool_pages_latched, SHOW_SIZE_T},
#endif /* UNIV_DEBUG */
{"buffer_pool_pages_made_not_young",
- &export_vars.innodb_buffer_pool_pages_made_not_young, SHOW_SIZE_T},
+ &buf_pool.stat.n_pages_not_made_young, SHOW_SIZE_T},
{"buffer_pool_pages_made_young",
- &export_vars.innodb_buffer_pool_pages_made_young, SHOW_SIZE_T},
+ &buf_pool.stat.n_pages_made_young, SHOW_SIZE_T},
{"buffer_pool_pages_misc",
&export_vars.innodb_buffer_pool_pages_misc, SHOW_SIZE_T},
- {"buffer_pool_pages_old",
- &export_vars.innodb_buffer_pool_pages_old, SHOW_SIZE_T},
+ {"buffer_pool_pages_old", &buf_pool.LRU_old_len, SHOW_SIZE_T},
{"buffer_pool_pages_total",
&export_vars.innodb_buffer_pool_pages_total, SHOW_SIZE_T},
{"buffer_pool_pages_LRU_flushed", &buf_lru_flush_page_count, SHOW_SIZE_T},
+ {"buffer_pool_pages_LRU_freed", &buf_lru_freed_page_count, SHOW_SIZE_T},
+ {"buffer_pool_pages_split", &buf_pool.pages_split, SHOW_SIZE_T},
{"buffer_pool_read_ahead_rnd",
- &export_vars.innodb_buffer_pool_read_ahead_rnd, SHOW_SIZE_T},
- {"buffer_pool_read_ahead",
- &export_vars.innodb_buffer_pool_read_ahead, SHOW_SIZE_T},
+ &buf_pool.stat.n_ra_pages_read_rnd, SHOW_SIZE_T},
+ {"buffer_pool_read_ahead", &buf_pool.stat.n_ra_pages_read, SHOW_SIZE_T},
{"buffer_pool_read_ahead_evicted",
- &export_vars.innodb_buffer_pool_read_ahead_evicted, SHOW_SIZE_T},
- {"buffer_pool_read_requests",
- &export_vars.innodb_buffer_pool_read_requests, SHOW_SIZE_T},
- {"buffer_pool_reads",
- &export_vars.innodb_buffer_pool_reads, SHOW_SIZE_T},
+ &buf_pool.stat.n_ra_pages_evicted, SHOW_SIZE_T},
+ {"buffer_pool_read_requests", &buf_pool.stat.n_page_gets, SHOW_SIZE_T},
+ {"buffer_pool_reads", &buf_pool.stat.n_pages_read, SHOW_SIZE_T},
{"buffer_pool_wait_free", &buf_pool.stat.LRU_waits, SHOW_SIZE_T},
{"buffer_pool_write_requests",
&export_vars.innodb_buffer_pool_write_requests, SHOW_SIZE_T},
@@ -925,7 +963,7 @@ static SHOW_VAR innodb_status_variables[]= {
{"data_written", &export_vars.innodb_data_written, SHOW_SIZE_T},
{"dblwr_pages_written", &export_vars.innodb_dblwr_pages_written,SHOW_SIZE_T},
{"dblwr_writes", &export_vars.innodb_dblwr_writes, SHOW_SIZE_T},
- {"deadlocks", &srv_stats.lock_deadlock_count, SHOW_SIZE_T},
+ {"deadlocks", &lock_sys.deadlocks, SHOW_SIZE_T},
{"history_list_length", &export_vars.innodb_history_list_length,SHOW_SIZE_T},
{"ibuf_discarded_delete_marks", &ibuf.n_discarded_ops[IBUF_OP_DELETE_MARK],
SHOW_SIZE_T},
@@ -1179,7 +1217,7 @@ struct log_flush_request
};
/** Buffer of pending innodb_log_flush_request() */
-MY_ALIGNED(CPU_LEVEL1_DCACHE_LINESIZE) static
+alignas(CPU_LEVEL1_DCACHE_LINESIZE) static
struct
{
/** first request */
@@ -1232,16 +1270,336 @@ innobase_commit_by_xid(
handlerton* hton, /*!< in: InnoDB handlerton */
XID* xid); /*!< in: X/Open XA transaction
identification */
-/** Remove all tables in the named database inside InnoDB.
-@param[in] hton handlerton from InnoDB
-@param[in] path Database path; Inside InnoDB the name of the last
-directory in the path is used as the database name.
-For example, in 'mysql/data/test' the database name is 'test'. */
-static
-void
-innobase_drop_database(
- handlerton* hton,
- char* path);
+
+/** Ignore FOREIGN KEY constraints that would be violated by DROP DATABASE */
+static ibool innodb_drop_database_ignore_fk(void*,void*) { return false; }
+
+/** FOREIGN KEY error reporting context for DROP DATABASE */
+struct innodb_drop_database_fk_report
+{
+ /** database name, with trailing '/' */
+ const span<const char> name;
+ /** whether errors were found */
+ bool violated;
+};
+
+/** Report FOREIGN KEY constraints that would be violated by DROP DATABASE
+@return whether processing should continue */
+static ibool innodb_drop_database_fk(void *node, void *report)
+{
+ auto s= static_cast<sel_node_t*>(node);
+ auto r= static_cast<innodb_drop_database_fk_report*>(report);
+ const dfield_t *name= que_node_get_val(s->select_list);
+ ut_ad(name->type.mtype == DATA_VARCHAR);
+
+ if (name->len == UNIV_SQL_NULL || name->len <= r->name.size() ||
+ memcmp(static_cast<const char*>(name->data), r->name.data(),
+ r->name.size()))
+ return false; /* End of matches */
+
+ node= que_node_get_next(s->select_list);
+ const dfield_t *id= que_node_get_val(node);
+ ut_ad(id->type.mtype == DATA_VARCHAR);
+ ut_ad(!que_node_get_next(node));
+
+ if (id->len != UNIV_SQL_NULL)
+ sql_print_error("DROP DATABASE: table %.*s is referenced"
+ " by FOREIGN KEY %.*s",
+ static_cast<int>(name->len),
+ static_cast<const char*>(name->data),
+ static_cast<int>(id->len),
+ static_cast<const char*>(id->data));
+ else
+ ut_ad("corrupted SYS_FOREIGN record" == 0);
+
+ return true;
+}
+
+/** After DROP DATABASE executed ha_innobase::delete_table() on all
+tables that it was aware of, drop any leftover tables inside InnoDB.
+@param path database path */
+static void innodb_drop_database(handlerton*, char *path)
+{
+ if (high_level_read_only)
+ return;
+
+ ulint len= 0;
+ char *ptr;
+
+ for (ptr= strend(path) - 2; ptr >= path &&
+#ifdef _WIN32
+ *ptr != '\\' &&
+#endif
+ *ptr != '/'; ptr--)
+ len++;
+
+ ptr++;
+ char *namebuf= static_cast<char*>
+ (my_malloc(PSI_INSTRUMENT_ME, len + 2, MYF(0)));
+ if (!namebuf)
+ return;
+ memcpy(namebuf, ptr, len);
+ namebuf[len] = '/';
+ namebuf[len + 1] = '\0';
+
+#ifdef _WIN32
+ innobase_casedn_str(namebuf);
+#endif /* _WIN32 */
+
+ THD * const thd= current_thd;
+ trx_t *trx= innobase_trx_allocate(thd);
+ dberr_t err= DB_SUCCESS;
+
+ dict_sys.lock(SRW_LOCK_CALL);
+
+ for (auto i= dict_sys.table_id_hash.n_cells; i--; )
+ {
+ for (dict_table_t *next, *table= static_cast<dict_table_t*>
+ (dict_sys.table_id_hash.array[i].node); table; table= next)
+ {
+ ut_ad(table->cached);
+ next= table->id_hash;
+ if (strncmp(table->name.m_name, namebuf, len + 1))
+ continue;
+ const auto n_handles= table->get_ref_count();
+ const bool locks= !n_handles && lock_table_has_locks(table);
+ if (n_handles || locks)
+ {
+ err= DB_ERROR;
+ ib::error errmsg;
+ errmsg << "DROP DATABASE: cannot DROP TABLE " << table->name;
+ if (n_handles)
+ errmsg << " due to " << n_handles << " open handles";
+ else
+ errmsg << " due to locks";
+ continue;
+ }
+ dict_sys.remove(table);
+ }
+ }
+
+ dict_sys.unlock();
+
+ dict_table_t *table_stats, *index_stats;
+ MDL_ticket *mdl_table= nullptr, *mdl_index= nullptr;
+ table_stats= dict_table_open_on_name(TABLE_STATS_NAME, false,
+ DICT_ERR_IGNORE_NONE);
+ if (table_stats)
+ {
+ dict_sys.freeze(SRW_LOCK_CALL);
+ table_stats= dict_acquire_mdl_shared<false>(table_stats,
+ thd, &mdl_table);
+ dict_sys.unfreeze();
+ }
+ index_stats= dict_table_open_on_name(INDEX_STATS_NAME, false,
+ DICT_ERR_IGNORE_NONE);
+ if (index_stats)
+ {
+ dict_sys.freeze(SRW_LOCK_CALL);
+ index_stats= dict_acquire_mdl_shared<false>(index_stats,
+ thd, &mdl_index);
+ dict_sys.unfreeze();
+ }
+
+ trx_start_for_ddl(trx);
+
+ uint errors= 0;
+ char db[NAME_LEN + 1];
+ strconvert(&my_charset_filename, namebuf, len, system_charset_info, db,
+ sizeof db, &errors);
+ if (!errors && table_stats && index_stats &&
+ !strcmp(table_stats->name.m_name, TABLE_STATS_NAME) &&
+ !strcmp(index_stats->name.m_name, INDEX_STATS_NAME) &&
+ lock_table_for_trx(table_stats, trx, LOCK_X) == DB_SUCCESS &&
+ lock_table_for_trx(index_stats, trx, LOCK_X) == DB_SUCCESS)
+ {
+ row_mysql_lock_data_dictionary(trx);
+ if (dict_stats_delete(db, trx))
+ {
+ /* Ignore this error. Leaving garbage statistics behind is a
+ lesser evil. Carry on to try to remove any garbage tables. */
+ trx->rollback();
+ trx_start_for_ddl(trx);
+ }
+ row_mysql_unlock_data_dictionary(trx);
+ }
+
+ if (err == DB_SUCCESS)
+ err= lock_sys_tables(trx);
+ row_mysql_lock_data_dictionary(trx);
+
+ static const char drop_database[] =
+ "PROCEDURE DROP_DATABASE_PROC () IS\n"
+ "fk CHAR;\n"
+ "name CHAR;\n"
+ "tid CHAR;\n"
+ "iid CHAR;\n"
+
+ "DECLARE FUNCTION fk_report;\n"
+
+ "DECLARE CURSOR fkf IS\n"
+ "SELECT ID FROM SYS_FOREIGN WHERE ID >= :db FOR UPDATE;\n"
+
+ "DECLARE CURSOR fkr IS\n"
+ "SELECT REF_NAME,ID FROM SYS_FOREIGN WHERE REF_NAME >= :db FOR UPDATE\n"
+ "ORDER BY REF_NAME;\n"
+
+ "DECLARE CURSOR tab IS\n"
+ "SELECT ID,NAME FROM SYS_TABLES WHERE NAME >= :db FOR UPDATE;\n"
+
+ "DECLARE CURSOR idx IS\n"
+ "SELECT ID FROM SYS_INDEXES WHERE TABLE_ID = tid FOR UPDATE;\n"
+
+ "BEGIN\n"
+
+ "OPEN fkf;\n"
+ "WHILE 1 = 1 LOOP\n"
+ " FETCH fkf INTO fk;\n"
+ " IF (SQL % NOTFOUND) THEN EXIT; END IF;\n"
+ " IF TO_BINARY(SUBSTR(fk, 0, LENGTH(:db)))<>TO_BINARY(:db)"
+ " THEN EXIT; END IF;\n"
+ " DELETE FROM SYS_FOREIGN_COLS WHERE TO_BINARY(ID)=TO_BINARY(fk);\n"
+ " DELETE FROM SYS_FOREIGN WHERE CURRENT OF fkf;\n"
+ "END LOOP;\n"
+ "CLOSE fkf;\n"
+
+ "OPEN fkr;\n"
+ "FETCH fkr INTO fk_report();\n"
+ "CLOSE fkr;\n"
+
+ "OPEN tab;\n"
+ "WHILE 1 = 1 LOOP\n"
+ " FETCH tab INTO tid,name;\n"
+ " IF (SQL % NOTFOUND) THEN EXIT; END IF;\n"
+ " IF TO_BINARY(SUBSTR(name, 0, LENGTH(:db))) <> TO_BINARY(:db)"
+ " THEN EXIT; END IF;\n"
+ " DELETE FROM SYS_COLUMNS WHERE TABLE_ID=tid;\n"
+ " DELETE FROM SYS_TABLES WHERE ID=tid;\n"
+ " OPEN idx;\n"
+ " WHILE 1 = 1 LOOP\n"
+ " FETCH idx INTO iid;\n"
+ " IF (SQL % NOTFOUND) THEN EXIT; END IF;\n"
+ " DELETE FROM SYS_FIELDS WHERE INDEX_ID=iid;\n"
+ " DELETE FROM SYS_INDEXES WHERE CURRENT OF idx;\n"
+ " END LOOP;\n"
+ " CLOSE idx;\n"
+ "END LOOP;\n"
+ "CLOSE tab;\n"
+
+ "END;\n";
+
+ innodb_drop_database_fk_report report{{namebuf, len + 1}, false};
+
+ if (err == DB_SUCCESS)
+ {
+ pars_info_t* pinfo = pars_info_create();
+ pars_info_bind_function(pinfo, "fk_report", trx->check_foreigns
+ ? innodb_drop_database_fk
+ : innodb_drop_database_ignore_fk, &report);
+ pars_info_add_str_literal(pinfo, "db", namebuf);
+ err= que_eval_sql(pinfo, drop_database, trx);
+ if (err == DB_SUCCESS && report.violated)
+ err= DB_CANNOT_DROP_CONSTRAINT;
+ }
+
+ const trx_id_t trx_id= trx->id;
+
+ if (err != DB_SUCCESS)
+ {
+ trx->rollback();
+ namebuf[len] = '\0';
+ ib::error() << "DROP DATABASE " << namebuf << ": " << err;
+ }
+ else
+ trx->commit();
+
+ if (table_stats)
+ dict_table_close(table_stats, true, thd, mdl_table);
+ if (index_stats)
+ dict_table_close(index_stats, true, thd, mdl_index);
+ row_mysql_unlock_data_dictionary(trx);
+
+ trx->free();
+
+ if (err == DB_SUCCESS)
+ {
+ /* Eventually after the DELETE FROM SYS_INDEXES was committed,
+ purge would invoke dict_drop_index_tree() to delete the associated
+ tablespaces. Because the SQL layer expects the directory to be empty,
+ we will "manually" purge the tablespaces that belong to the
+ records that we delete-marked. */
+
+ dfield_t dfield;
+ dtuple_t tuple{
+ 0,1,1,&dfield,0,nullptr
+#ifdef UNIV_DEBUG
+ , DATA_TUPLE_MAGIC_N
+#endif
+ };
+ dict_index_t* sys_index= UT_LIST_GET_FIRST(dict_sys.sys_tables->indexes);
+ btr_pcur_t pcur;
+ namebuf[len++]= '/';
+ dfield_set_data(&dfield, namebuf, len);
+ dict_index_copy_types(&tuple, sys_index, 1);
+ std::vector<pfs_os_file_t> to_close;
+ mtr_t mtr;
+ mtr.start();
+ pcur.btr_cur.page_cur.index = sys_index;
+ err= btr_pcur_open_on_user_rec(&tuple, BTR_SEARCH_LEAF, &pcur, &mtr);
+ if (err != DB_SUCCESS)
+ goto err_exit;
+
+ for (; btr_pcur_is_on_user_rec(&pcur);
+ btr_pcur_move_to_next_user_rec(&pcur, &mtr))
+ {
+ const rec_t *rec= btr_pcur_get_rec(&pcur);
+ if (rec_get_n_fields_old(rec) != DICT_NUM_FIELDS__SYS_TABLES)
+ {
+ ut_ad("corrupted SYS_TABLES record" == 0);
+ break;
+ }
+ if (!rec_get_deleted_flag(rec, false))
+ continue;
+ ulint flen;
+ static_assert(DICT_FLD__SYS_TABLES__NAME == 0, "compatibility");
+ rec_get_nth_field_offs_old(rec, 0, &flen);
+ if (flen == UNIV_SQL_NULL || flen <= len || memcmp(rec, namebuf, len))
+ /* We ran out of tables that had existed in the database. */
+ break;
+ const byte *db_trx_id=
+ rec_get_nth_field_old(rec, DICT_FLD__SYS_TABLES__DB_TRX_ID, &flen);
+ if (flen != 6)
+ {
+ ut_ad("corrupted SYS_TABLES.SPACE" == 0);
+ break;
+ }
+ if (mach_read_from_6(db_trx_id) != trx_id)
+ /* This entry was modified by some other transaction than us.
+ Unfortunately, because SYS_TABLES.NAME is the PRIMARY KEY,
+ we cannot distinguish RENAME and DROP here. It is possible
+ that the table had been renamed to some other database. */
+ continue;
+ const byte *s=
+ rec_get_nth_field_old(rec, DICT_FLD__SYS_TABLES__SPACE, &flen);
+ if (flen != 4)
+ ut_ad("corrupted SYS_TABLES.SPACE" == 0);
+ else if (uint32_t space_id= mach_read_from_4(s))
+ {
+ pfs_os_file_t detached= fil_delete_tablespace(space_id);
+ if (detached != OS_FILE_CLOSED)
+ to_close.emplace_back(detached);
+ }
+ }
+ err_exit:
+ mtr.commit();
+ for (pfs_os_file_t detached : to_close)
+ os_file_close(detached);
+ /* Any changes must be persisted before we return. */
+ log_write_up_to(mtr.commit_lsn(), true);
+ }
+
+ my_free(namebuf);
+}
/** Shut down the InnoDB storage engine.
@return 0 */
@@ -1435,18 +1793,6 @@ innodb_page_size_validate(
}
/******************************************************************//**
-Returns true if the thread is the replication thread on the slave
-server.
-@return true if thd is the replication thread */
-ibool
-thd_is_replication_slave_thread(
-/*============================*/
- THD* thd) /*!< in: thread handle */
-{
- return thd && ((ibool) thd_slave_thread(thd));
-}
-
-/******************************************************************//**
Returns true if transaction should be flagged as read-only.
@return true if the thd is marked as read-only */
bool
@@ -1465,9 +1811,7 @@ static MYSQL_THDVAR_BOOL(background_thread,
/** Create a MYSQL_THD for a background thread and mark it as such.
@param name thread info for SHOW PROCESSLIST
@return new MYSQL_THD */
-MYSQL_THD
-innobase_create_background_thd(const char* name)
-/*============================*/
+MYSQL_THD innobase_create_background_thd(const char* name)
{
MYSQL_THD thd= create_background_thd();
thd_proc_info(thd, name);
@@ -1535,19 +1879,10 @@ thd_has_edited_nontrans_tables(
return((ibool) thd_non_transactional_update(thd));
}
-/* Return high resolution timestamp for the start of the current query */
-UNIV_INTERN
-unsigned long long
-thd_query_start_micro(
- const THD* thd) /*!< in: thread handle */
-{
- return thd_start_utime(thd);
-}
-
/******************************************************************//**
Returns the lock wait timeout for the current connection.
@return the lock wait timeout, in seconds */
-ulong
+uint&
thd_lock_wait_timeout(
/*==================*/
THD* thd) /*!< in: thread handle, or NULL to query
@@ -1562,12 +1897,8 @@ thd_lock_wait_timeout(
@param[in] thd thread handle, or NULL to query
the global innodb_tmpdir.
@retval NULL if innodb_tmpdir="" */
-const char*
-thd_innodb_tmpdir(
- THD* thd)
+const char *thd_innodb_tmpdir(THD *thd)
{
- ut_ad(!sync_check_iterate(sync_check()));
-
const char* tmp_dir = THDVAR(thd, tmpdir);
if (tmp_dir != NULL && *tmp_dir == '\0') {
@@ -1654,6 +1985,143 @@ static void wsrep_abort_transaction(handlerton*, THD *, THD *, my_bool);
static int innobase_wsrep_set_checkpoint(handlerton* hton, const XID* xid);
static int innobase_wsrep_get_checkpoint(handlerton* hton, XID* xid);
#endif /* WITH_WSREP */
+
+#define normalize_table_name(a,b) \
+ normalize_table_name_c_low(a,b,IF_WIN(true,false))
+
+/** Drop any garbage intermediate tables that existed in the system
+after a backup was restored.
+
+In a final phase of Mariabackup, the commit of DDL operations is blocked,
+and those DDL operations will have to be rolled back. Because the
+normal DDL recovery will not run due to the lack of the log file,
+at least some #sql-alter- garbage tables may remain in the InnoDB
+data dictionary (while the data files themselves are missing).
+We will attempt to drop the tables here. */
+#if 0
+static void drop_garbage_tables_after_restore()
+{
+ btr_pcur_t pcur;
+ mtr_t mtr;
+ trx_t *trx= trx_create();
+
+ ut_ad(!purge_sys.enabled());
+ ut_d(purge_sys.stop_FTS());
+
+ mtr.start();
+ if (pcur.open_leaf(true, dict_sys.sys_tables->indexes.start, BTR_SEARCH_LEAF,
+ &mtr) != DB_SUCCESS)
+ goto all_fail;
+ for (;;)
+ {
+ btr_pcur_move_to_next_user_rec(&pcur, &mtr);
+
+ if (!btr_pcur_is_on_user_rec(&pcur))
+ break;
+
+ const rec_t *rec= btr_pcur_get_rec(&pcur);
+ if (rec_get_deleted_flag(rec, 0))
+ continue;
+
+ static_assert(DICT_FLD__SYS_TABLES__NAME == 0, "compatibility");
+ size_t len;
+ if (rec_get_1byte_offs_flag(rec))
+ {
+ len= rec_1_get_field_end_info(rec, 0);
+ if (len & REC_1BYTE_SQL_NULL_MASK)
+ continue; /* corrupted SYS_TABLES.NAME */
+ }
+ else
+ {
+ len= rec_2_get_field_end_info(rec, 0);
+ static_assert(REC_2BYTE_EXTERN_MASK == 16384, "compatibility");
+ if (len >= REC_2BYTE_EXTERN_MASK)
+ continue; /* corrupted SYS_TABLES.NAME */
+ }
+
+ if (len < tmp_file_prefix_length)
+ continue;
+ if (const char *f= static_cast<const char*>
+ (memchr(rec, '/', len - tmp_file_prefix_length)))
+ {
+ if (memcmp(f + 1, tmp_file_prefix, tmp_file_prefix_length))
+ continue;
+ }
+ else
+ continue;
+
+ btr_pcur_store_position(&pcur, &mtr);
+ btr_pcur_commit_specify_mtr(&pcur, &mtr);
+
+ trx_start_for_ddl(trx);
+ std::vector<pfs_os_file_t> deleted;
+ dberr_t err= DB_TABLE_NOT_FOUND;
+ row_mysql_lock_data_dictionary(trx);
+
+ if (dict_table_t *table= dict_sys.load_table
+ ({reinterpret_cast<const char*>(pcur.old_rec), len},
+ DICT_ERR_IGNORE_DROP))
+ {
+ table->acquire();
+ row_mysql_unlock_data_dictionary(trx);
+ err= lock_table_for_trx(table, trx, LOCK_X);
+ if (err == DB_SUCCESS &&
+ (table->flags2 & (DICT_TF2_FTS_HAS_DOC_ID | DICT_TF2_FTS)))
+ {
+ fts_optimize_remove_table(table);
+ err= fts_lock_tables(trx, *table);
+ }
+ if (err == DB_SUCCESS)
+ err= lock_sys_tables(trx);
+ row_mysql_lock_data_dictionary(trx);
+ table->release();
+
+ if (err == DB_SUCCESS)
+ err= trx->drop_table(*table);
+ if (err != DB_SUCCESS)
+ goto fail;
+ trx->commit(deleted);
+ }
+ else
+ {
+fail:
+ trx->rollback();
+ sql_print_error("InnoDB: cannot drop %.*s: %s",
+ static_cast<int>(len), pcur.old_rec, ut_strerr(err));
+ }
+
+ row_mysql_unlock_data_dictionary(trx);
+ for (pfs_os_file_t d : deleted)
+ os_file_close(d);
+
+ mtr.start();
+ if (pcur.restore_position(BTR_SEARCH_LEAF, &mtr) == btr_pcur_t::CORRUPTED)
+ break;
+ }
+
+all_fail:
+ mtr.commit();
+ trx->free();
+ ut_free(pcur.old_rec_buf);
+ ut_d(purge_sys.resume_FTS());
+}
+
+static void innodb_ddl_recovery_done(handlerton*)
+{
+ ut_ad(!ddl_recovery_done);
+ ut_d(ddl_recovery_done= true);
+ if (!srv_read_only_mode && srv_operation <= SRV_OPERATION_EXPORT_RESTORED &&
+ srv_force_recovery < SRV_FORCE_NO_BACKGROUND)
+ {
+ if (srv_start_after_restore && !high_level_read_only)
+ drop_garbage_tables_after_restore();
+ srv_init_purge_tasks();
+ purge_sys.coordinator_startup();
+ srv_wake_purge_thread_if_not_active();
+ }
+}
+#endif
+
/********************************************************************//**
Converts an InnoDB error code to a MySQL error code and also tells to MySQL
about a possible transaction rollback inside InnoDB caused by a lock wait
@@ -1732,7 +2200,7 @@ convert_error_code_to_mysql(
if (thd) {
thd_mark_transaction_to_rollback(
- thd, (bool) row_rollback_on_timeout);
+ thd, innobase_rollback_on_timeout);
}
return(HA_ERR_LOCK_WAIT_TIMEOUT);
@@ -1756,6 +2224,7 @@ convert_error_code_to_mysql(
code should be introduced */
case DB_CORRUPTION:
+ case DB_PAGE_CORRUPTED:
return(HA_ERR_CRASHED);
case DB_OUT_OF_FILE_SPACE:
@@ -1768,12 +2237,6 @@ convert_error_code_to_mysql(
"InnoDB");
return(HA_ERR_INTERNAL_ERROR);
- case DB_TABLE_IN_FK_CHECK:
- return(HA_ERR_TABLE_IN_FK_CHECK);
-
- case DB_TABLE_IS_BEING_USED:
- return(HA_ERR_WRONG_COMMAND);
-
case DB_TABLE_NOT_FOUND:
return(HA_ERR_NO_SUCH_TABLE);
@@ -1880,7 +2343,7 @@ innobase_mysql_print_thd(
/******************************************************************//**
Get the variable length bounds of the given character set. */
-void
+static void
innobase_get_cset_width(
/*====================*/
ulint cset, /*!< in: MySQL charset-collation code */
@@ -1892,7 +2355,7 @@ innobase_get_cset_width(
ut_ad(mbminlen);
ut_ad(mbmaxlen);
- cs = all_charsets[cset];
+ cs = cset ? get_charset((uint)cset, MYF(MY_WME)) : NULL;
if (cs) {
*mbminlen = cs->mbminlen;
*mbmaxlen = cs->mbmaxlen;
@@ -1920,6 +2383,29 @@ innobase_get_cset_width(
}
}
+/*********************************************************************//**
+Compute the mbminlen and mbmaxlen members of a data type structure. */
+void
+dtype_get_mblen(
+/*============*/
+ ulint mtype, /*!< in: main type */
+ ulint prtype, /*!< in: precise type (and collation) */
+ unsigned*mbminlen, /*!< out: minimum length of a
+ multi-byte character */
+ unsigned*mbmaxlen) /*!< out: maximum length of a
+ multi-byte character */
+{
+ if (dtype_is_string_type(mtype)) {
+ innobase_get_cset_width(dtype_get_charset_coll(prtype),
+ mbminlen, mbmaxlen);
+ ut_ad(*mbminlen <= *mbmaxlen);
+ ut_ad(*mbminlen < DATA_MBMAX);
+ ut_ad(*mbmaxlen < DATA_MBMAX);
+ } else {
+ *mbminlen = *mbmaxlen = 0;
+ }
+}
+
/******************************************************************//**
Converts an identifier to a table name. */
void
@@ -2051,31 +2537,6 @@ innobase_get_stmt_unsafe(
return NULL;
}
-/**********************************************************************//**
-Get the current setting of the tdc_size global parameter. We do
-a dirty read because for one there is no synchronization object and
-secondly there is little harm in doing so even if we get a torn read.
-@return value of tdc_size */
-ulint
-innobase_get_table_cache_size(void)
-/*===============================*/
-{
- return(tdc_size);
-}
-
-/**********************************************************************//**
-Get the current setting of the lower_case_table_names global parameter from
-mysqld.cc. We do a dirty read because for one there is no synchronization
-object and secondly there is little harm in doing so even if we get a torn
-read.
-@return value of lower_case_table_names */
-ulint
-innobase_get_lower_case_table_names(void)
-/*=====================================*/
-{
- return(lower_case_table_names);
-}
-
/**
Test a file path whether it is same as mysql data directory path.
@@ -2205,7 +2666,7 @@ __forceinline unsigned int nlz (ulonglong x)
return (unsigned int) n ^ m;
#endif
#elif defined(_M_ARM64)
- return _CountLeadingZeros(x);
+ return _CountLeadingZeros64(x);
#endif
}
#else
@@ -2365,7 +2826,7 @@ innobase_trx_init(
DBUG_ASSERT(thd == trx->mysql_thd);
/* Ensure that thd_lock_wait_timeout(), which may be called
- while holding lock_sys.mutex, by lock_rec_enqueue_waiting(),
+ while holding lock_sys.latch, by lock_rec_enqueue_waiting(),
will not end up acquiring LOCK_global_system_variables in
intern_sys_var_ptr(). */
(void) THDVAR(thd, lock_wait_timeout);
@@ -2603,8 +3064,8 @@ ha_innobase::update_thd(
trx_t* trx = check_trx_exists(thd);
- ut_ad(trx->dict_operation_lock_mode == 0);
- ut_ad(trx->dict_operation == TRX_DICT_OP_NONE);
+ ut_ad(!trx->dict_operation_lock_mode);
+ ut_ad(!trx->dict_operation);
if (m_prebuilt->trx != trx) {
@@ -2712,9 +3173,9 @@ the query cache.
@param[in] table table object
@param[in] trx transaction object
@return whether the storing or retrieving from the query cache is permitted */
+TRANSACTIONAL_TARGET
static bool innobase_query_caching_table_check_low(
- const dict_table_t* table,
- trx_t* trx)
+ dict_table_t* table, trx_t* trx)
{
/* The following conditions will decide the query cache
retrieval or storing into:
@@ -2729,17 +3190,30 @@ static bool innobase_query_caching_table_check_low(
For read-only transaction: should satisfy (1) and (3)
For read-write transaction: should satisfy (1), (2), (3) */
- if (lock_table_get_n_locks(table)) {
+ const trx_id_t inv = table->query_cache_inv_trx_id;
+
+ if (trx->id && trx->id < inv) {
return false;
}
- if (trx->id && trx->id < table->query_cache_inv_trx_id) {
+ if (trx->read_view.is_open() && trx->read_view.low_limit_id() < inv) {
return false;
}
- return !trx->read_view.is_open()
- || trx->read_view.low_limit_id()
- >= table->query_cache_inv_trx_id;
+#if !defined NO_ELISION && !defined SUX_LOCK_GENERIC
+ if (xbegin()) {
+ if (table->lock_mutex_is_locked())
+ xabort();
+ auto len = UT_LIST_GET_LEN(table->locks);
+ xend();
+ return len == 0;
+ }
+#endif
+
+ table->lock_mutex_lock();
+ auto len= UT_LIST_GET_LEN(table->locks);
+ table->lock_mutex_unlock();
+ return len == 0;
}
/** Checks if MySQL at the moment is allowed for this table to retrieve a
@@ -2753,7 +3227,7 @@ static bool innobase_query_caching_table_check(
const char* norm_name)
{
dict_table_t* table = dict_table_open_on_name(
- norm_name, FALSE, FALSE, DICT_ERR_IGNORE_FK_NOKEY);
+ norm_name, false, DICT_ERR_IGNORE_FK_NOKEY);
if (table == NULL) {
return false;
@@ -2764,7 +3238,7 @@ static bool innobase_query_caching_table_check(
bool allow = innobase_query_caching_table_check_low(table, trx);
- dict_table_close(table, FALSE, FALSE);
+ dict_table_close(table);
if (allow) {
/* If the isolation level is high, assign a read view for the
@@ -2799,9 +3273,7 @@ read view to it if there is no read view yet.
Why a deadlock of threads is not possible: the query cache calls this function
at the start of a SELECT processing. Then the calling thread cannot be
holding any InnoDB semaphores. The calling thread is holding the
-query cache mutex, and this function will reserve the InnoDB trx_sys.mutex.
-Thus, the 'rank' in sync0mutex.h of the MySQL query cache mutex is above
-the InnoDB trx_sys.mutex.
+query cache mutex, and this function will reserve the trx_sys.mutex.
@return TRUE if permitted, FALSE if not; note that the value FALSE
does not mean we should invalidate the query cache: invalidation is
called explicitly */
@@ -2873,9 +3345,8 @@ innobase_invalidate_query_cache(
NOTE that in Windows this is
always in LOWER CASE! */
{
- /* Note that the sync0mutex.h rank of the query cache mutex is just
- above the InnoDB trx_sys_t->lock. The caller of this function must
- not have latches of a lower rank. */
+ /* Note that the query cache mutex is just above the trx_sys.mutex.
+ The caller of this function must not have latches of a lower rank. */
#ifdef HAVE_QUERY_CACHE
char qcache_key_name[2 * (NAME_LEN + 1)];
@@ -3137,7 +3608,7 @@ ha_innobase::init_table_handle_for_HANDLER(void)
innobase_register_trx(ht, m_user_thd, m_prebuilt->trx);
/* We did the necessary inits in this function, no need to repeat them
- in row_search_for_mysql */
+ in row_search_mvcc() */
m_prebuilt->sql_stat_start = FALSE;
@@ -3157,6 +3628,7 @@ ha_innobase::init_table_handle_for_HANDLER(void)
m_prebuilt->used_in_HANDLER = TRUE;
reset_template();
+ m_prebuilt->trx->bulk_insert = false;
}
/*********************************************************************//**
@@ -3232,29 +3704,6 @@ static MYSQL_SYSVAR_ULONGLONG(buffer_pool_size, innobase_buffer_pool_size,
2ULL << 20,
LLONG_MAX, 1024*1024L);
-static const char* deprecated_innodb_checksum_algorithm
- = "Setting innodb_checksum_algorithm to values other than"
- " crc32, full_crc32, strict_crc32 or strict_full_crc32"
- " is UNSAFE and DEPRECATED."
- " These deprecated values will be disallowed in MariaDB 10.6.";
-
-static void innodb_checksum_algorithm_update(THD *thd, st_mysql_sys_var*,
- void *, const void *save)
-{
- srv_checksum_algorithm= *static_cast<const ulong*>(save);
- switch (srv_checksum_algorithm) {
- case SRV_CHECKSUM_ALGORITHM_CRC32:
- case SRV_CHECKSUM_ALGORITHM_STRICT_CRC32:
- case SRV_CHECKSUM_ALGORITHM_FULL_CRC32:
- case SRV_CHECKSUM_ALGORITHM_STRICT_FULL_CRC32:
- break;
- default:
- push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
- HA_ERR_UNSUPPORTED,
- deprecated_innodb_checksum_algorithm);
- }
-}
-
/****************************************************************//**
Gives the file extension of an InnoDB single-table tablespace. */
static const char* ha_innobase_exts[] = {
@@ -3273,11 +3722,9 @@ static ulonglong innodb_prepare_commit_versioned(THD* thd, ulonglong *trx_id)
if (const trx_t* trx = thd_to_trx(thd)) {
*trx_id = trx->id;
- for (trx_mod_tables_t::const_iterator t
- = trx->mod_tables.begin();
- t != trx->mod_tables.end(); t++) {
- if (t->second.is_versioned()) {
- DBUG_ASSERT(t->first->versioned_by_id());
+ for (const auto& t : trx->mod_tables) {
+ if (t.second.is_versioned()) {
+ DBUG_ASSERT(t.first->versioned_by_id());
DBUG_ASSERT(trx->rsegs.m_redo.rseg);
return trx_sys.get_new_trx_id();
@@ -3304,136 +3751,6 @@ static void innodb_buffer_pool_size_init()
innobase_buffer_pool_size = srv_buf_pool_size;
}
-namespace deprecated {
-/** Deprecated; no effect other than issuing a deprecation warning. */
-char* innodb_file_format;
-/** Deprecated; no effect other than issuing a deprecation warning. */
-char* innodb_large_prefix;
-
-/** Deprecated parameter with no effect */
-static my_bool innodb_log_checksums;
-/** Deprecation message for innodb_log_checksums */
-static const char* innodb_log_checksums_msg
-= "The parameter innodb_log_checksums is deprecated and has no effect.";
-/** Deprecated parameter with no effect */
-static my_bool innodb_log_compressed_pages;
-/** Deprecation message for innodb_log_compressed_pages */
-static const char* innodb_log_compressed_pages_msg
-= "The parameter innodb_log_compressed_pages is deprecated and has no effect.";
-/** Deprecated parameter with no effect */
-static my_bool innodb_log_optimize_ddl;
-static const char* innodb_log_optimize_ddl_msg
-= "The parameter innodb_log_optimize_ddl is deprecated and has no effect.";
-/** Deprecated parameter with no effect */
-static my_bool innodb_scrub_log;
-/** Deprecation message for innodb_scrub_log */
-static const char* innodb_scrub_log_msg
-= "The parameter innodb_scrub_log is deprecated and has no effect.";
-/** Deprecated parameter with no effect */
-static ulonglong innodb_scrub_log_speed;
-/** Deprecation message for innodb_scrub_log_speed */
-static const char* innodb_scrub_log_speed_msg
-= "The parameter innodb_scrub_log_speed is deprecated and has no effect.";
-/** Deprecated parameter with no effect */
-static ulong innodb_undo_logs;
-/** Deprecation message for innodb_undo_logs */
-static const char* innodb_undo_logs_msg
-= "The parameter innodb_undo_logs is deprecated and has no effect.";
-/** Deprecated parameter with no effect */
-static ulong innodb_buffer_pool_instances;
-/** Deprecated parameter with no effect */
-static ulong innodb_page_cleaners;
-static const char* innodb_page_cleaners_msg
-= "The parameter innodb_page_cleaners is deprecated and has no effect.";
-
-ulong srv_n_log_files;
-static const char* srv_n_log_files_msg
-= "The parameter innodb_log_files_in_group is deprecated and has no effect.";
-
-static my_bool innodb_background_scrub_data_uncompressed;
-
-static const char* innodb_background_scrub_data_uncompressed_msg
-= "The parameter innodb_background_scrub_data_uncompressed is deprecated and"
- " has no effect.";
-
-static my_bool innodb_background_scrub_data_compressed;
-
-static const char* innodb_background_scrub_data_compressed_msg
-= "The parameter innodb_background_scrub_data_compressed is deprecated and"
- " has no effect.";
-
-static uint innodb_background_scrub_data_check_interval;
-
-static const char* innodb_background_scrub_data_check_interval_msg
-= "The parameter innodb_background_scrub_data_check_interval is deprecated and"
- " has no effect.";
-
-static uint innodb_background_scrub_data_interval;
-
-static const char* innodb_background_scrub_data_interval_msg
-= "The parameter innodb_background_scrub_data_interval is deprecated and"
- " has no effect.";
-
-uint replication_delay;
-uint thread_concurrency;
-uint commit_concurrency;
-uint concurrency_tickets;
-uint adaptive_max_sleep_delay;
-uint thread_sleep_delay;
-
-static const char * const replication_delay_msg
-= "The parameter innodb_replication_delay is deprecated and has no effect.";
-static const char * const thread_concurrency_msg
-= "The parameter innodb_thread_concurrency is deprecated and has no effect.";
-static const char * const commit_concurrency_msg
-= "The parameter innodb_commit_concurrency is deprecated and has no effect.";
-static const char * const concurrency_tickets_msg
-= "The parameter innodb_concurrency_tickets is deprecated and has no effect.";
-static const char * const adaptive_max_sleep_delay_msg
-= "The parameter innodb_adaptive_max_sleep_delay is deprecated and"
- " has no effect.";
-static const char * const thread_sleep_delay_msg
-= "The parameter innodb_thread_sleep_delay is deprecated and has no effect.";
-
-static void replication_delay_warn(THD* thd, st_mysql_sys_var*, void*,
- const void*)
-{
- push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, HA_ERR_UNSUPPORTED,
- replication_delay_msg);
-}
-static void thread_concurrency_warn(THD* thd, st_mysql_sys_var*, void*,
- const void*)
-{
- push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, HA_ERR_UNSUPPORTED,
- thread_concurrency_msg);
-}
-static void commit_concurrency_warn(THD* thd, st_mysql_sys_var*, void*,
- const void*)
-{
- push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, HA_ERR_UNSUPPORTED,
- commit_concurrency_msg);
-}
-static void concurrency_tickets_warn(THD* thd, st_mysql_sys_var*, void*,
- const void*)
-{
- push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, HA_ERR_UNSUPPORTED,
- concurrency_tickets_msg);
-}
-static void adaptive_max_sleep_delay_warn(THD* thd, st_mysql_sys_var*, void*,
- const void*)
-{
- push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, HA_ERR_UNSUPPORTED,
- adaptive_max_sleep_delay_msg);
-}
-static void thread_sleep_delay_warn(THD* thd, st_mysql_sys_var*, void*,
- const void*)
-{
- push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, HA_ERR_UNSUPPORTED,
- thread_sleep_delay_msg);
-}
-
-} // namespace deprecated
-
/** Initialize, validate and normalize the InnoDB startup parameters.
@return failure code
@retval 0 on success
@@ -3443,94 +3760,8 @@ static int innodb_init_params()
{
DBUG_ENTER("innodb_init_params");
- static char current_dir[3];
- char *default_path;
ulong num_pll_degree;
- if (deprecated::innodb_large_prefix || deprecated::innodb_file_format) {
- const char* p = deprecated::innodb_file_format
- ? "file_format"
- : "large_prefix";
- sql_print_warning("The parameter innodb_%s is deprecated"
- " and has no effect."
- " It may be removed in future releases."
- " See https://mariadb.com/kb/en/library/"
- "xtradbinnodb-file-format/", p);
- }
-
- if (UNIV_UNLIKELY(!deprecated::innodb_log_checksums)) {
- sql_print_warning(deprecated::innodb_log_checksums_msg);
- deprecated::innodb_log_checksums = TRUE;
- }
-
- if (UNIV_UNLIKELY(!deprecated::innodb_log_compressed_pages)) {
- sql_print_warning(deprecated::innodb_log_compressed_pages_msg);
- deprecated::innodb_log_compressed_pages = TRUE;
- }
-
- if (UNIV_UNLIKELY(deprecated::innodb_log_optimize_ddl)) {
- sql_print_warning(deprecated::innodb_log_optimize_ddl_msg);
- deprecated::innodb_log_optimize_ddl = FALSE;
- }
-
- if (UNIV_UNLIKELY(deprecated::innodb_scrub_log)) {
- sql_print_warning(deprecated::innodb_scrub_log_msg);
- deprecated::innodb_scrub_log = FALSE;
- }
-
- if (UNIV_UNLIKELY(deprecated::innodb_scrub_log_speed != 256)) {
- sql_print_warning(deprecated::innodb_scrub_log_speed_msg);
- deprecated::innodb_scrub_log_speed = 256;
- }
-
- if (UNIV_UNLIKELY(deprecated::innodb_buffer_pool_instances)) {
- sql_print_warning("The parameter innodb_buffer_pool_instances"
- " is deprecated and has no effect.");
- }
-
- if (UNIV_UNLIKELY(deprecated::innodb_page_cleaners)) {
- sql_print_warning(deprecated::innodb_page_cleaners_msg);
- }
-
- if (UNIV_UNLIKELY(deprecated::srv_n_log_files != 1)) {
- sql_print_warning(deprecated::srv_n_log_files_msg);
- deprecated::srv_n_log_files = 1;
- }
-
- deprecated::innodb_buffer_pool_instances = 1;
-
- deprecated::innodb_page_cleaners = 1;
-
- if (UNIV_UNLIKELY(deprecated::innodb_undo_logs != TRX_SYS_N_RSEGS)) {
- sql_print_warning(deprecated::innodb_undo_logs_msg);
- deprecated::innodb_undo_logs = TRX_SYS_N_RSEGS;
- }
-
- if (UNIV_UNLIKELY(deprecated::replication_delay)) {
- sql_print_warning(deprecated::replication_delay_msg);
- deprecated::replication_delay = 0;
- }
- if (UNIV_UNLIKELY(deprecated::thread_concurrency)) {
- sql_print_warning(deprecated::thread_concurrency_msg);
- deprecated::thread_concurrency = 0;
- }
- if (UNIV_UNLIKELY(deprecated::commit_concurrency)) {
- sql_print_warning(deprecated::commit_concurrency_msg);
- deprecated::commit_concurrency = 0;
- }
- if (UNIV_UNLIKELY(deprecated::concurrency_tickets)) {
- sql_print_warning(deprecated::concurrency_tickets_msg);
- deprecated::concurrency_tickets = 0;
- }
- if (UNIV_UNLIKELY(deprecated::adaptive_max_sleep_delay)) {
- sql_print_warning(deprecated::adaptive_max_sleep_delay_msg);
- deprecated::adaptive_max_sleep_delay = 0;
- }
- if (UNIV_UNLIKELY(deprecated::thread_sleep_delay)) {
- sql_print_warning(deprecated::thread_sleep_delay_msg);
- deprecated::thread_sleep_delay = 0;
- }
-
/* Check that values don't overflow on 32-bit systems. */
if (sizeof(ulint) == 4) {
if (innobase_buffer_pool_size > UINT_MAX32) {
@@ -3555,33 +3786,6 @@ static int innodb_init_params()
DBUG_RETURN(HA_ERR_INITIALIZATION);
}
- if (innodb_lock_schedule_algorithm == INNODB_LOCK_SCHEDULE_ALGORITHM_VATS) {
- ib::warn() << "The parameter innodb_lock_schedule_algorithm"
- " is deprecated, and the setting"
- " innodb_lock_schedule_algorithm=vats"
- " may cause corruption. The parameter may be removed"
- " in future releases.";
-
-#ifdef WITH_WSREP
- /* Currently, Galera does not support VATS lock schedule algorithm. */
- if (global_system_variables.wsrep_on) {
- ib::info() << "For Galera, using innodb_lock_schedule_algorithm=fcfs";
- innodb_lock_schedule_algorithm = INNODB_LOCK_SCHEDULE_ALGORITHM_FCFS;
- }
-#endif /* WITH_WSREP */
- }
-
-#ifdef WITH_WSREP
- /* Print deprecation info if xtrabackup is used for SST method */
- if (global_system_variables.wsrep_on
- && wsrep_sst_method
- && (!strcmp(wsrep_sst_method, "xtrabackup")
- || !strcmp(wsrep_sst_method, "xtrabackup-v2"))) {
- ib::info() << "Galera SST method xtrabackup is deprecated and the "
- " support for it may be removed in future releases.";
- }
-#endif /* WITH_WSREP */
-
#ifndef HAVE_LZ4
if (innodb_compression_algorithm == PAGE_LZ4_ALGORITHM) {
sql_print_error("InnoDB: innodb_compression_algorithm = %lu unsupported.\n"
@@ -3650,19 +3854,11 @@ static int innodb_init_params()
Note that when using the embedded server, the datadirectory is not
necessarily the current directory of this program. */
- if (mysqld_embedded) {
- default_path = mysql_real_data_home;
- } else {
- /* It's better to use current lib, to keep paths short */
- current_dir[0] = FN_CURLIB;
- current_dir[1] = FN_LIBCHAR;
- current_dir[2] = 0;
- default_path = current_dir;
- }
-
- ut_a(default_path);
-
- fil_path_to_mysql_datadir = default_path;
+ fil_path_to_mysql_datadir =
+#ifndef HAVE_REPLICATION
+ mysqld_embedded ? mysql_real_data_home :
+#endif
+ "./";
/* Set InnoDB initialization parameters according to the values
read from MySQL .cnf file */
@@ -3670,7 +3866,8 @@ static int innodb_init_params()
/* The default dir for data files is the datadir of MySQL */
srv_data_home = innobase_data_home_dir
- ? innobase_data_home_dir : default_path;
+ ? innobase_data_home_dir
+ : const_cast<char*>(fil_path_to_mysql_datadir);
#ifdef WITH_WSREP
/* If we use the wsrep API, then we need to tell the server
the path to the data files (for passing it to the SST scripts): */
@@ -3702,7 +3899,6 @@ static int innodb_init_params()
srv_sys_space.set_flags(FSP_FLAGS_PAGE_SSIZE());
}
- srv_sys_space.set_name("innodb_system");
srv_sys_space.set_path(srv_data_home);
/* Supports raw devices */
@@ -3712,7 +3908,6 @@ static int innodb_init_params()
DBUG_RETURN(HA_ERR_INITIALIZATION);
}
- srv_tmp_space.set_name("innodb_temporary");
srv_tmp_space.set_path(srv_data_home);
/* Temporary tablespace is in full crc32 format. */
@@ -3727,8 +3922,8 @@ static int innodb_init_params()
/* Perform all sanity check before we take action of deleting files*/
if (srv_sys_space.intersection(&srv_tmp_space)) {
- sql_print_error("%s and %s file names seem to be the same.",
- srv_tmp_space.name(), srv_sys_space.name());
+ sql_print_error("innodb_temporary and innodb_system"
+ " file names seem to be the same.");
DBUG_RETURN(HA_ERR_INITIALIZATION);
}
@@ -3737,11 +3932,9 @@ static int innodb_init_params()
/* ------------ UNDO tablespaces files ---------------------*/
if (!srv_undo_dir) {
- srv_undo_dir = default_path;
+ srv_undo_dir = const_cast<char*>(fil_path_to_mysql_datadir);
}
- os_normalize_path(srv_undo_dir);
-
if (strchr(srv_undo_dir, ';')) {
sql_print_error("syntax error in innodb_undo_directory");
DBUG_RETURN(HA_ERR_INITIALIZATION);
@@ -3752,11 +3945,10 @@ static int innodb_init_params()
/* The default dir for log files is the datadir of MySQL */
if (!srv_log_group_home_dir) {
- srv_log_group_home_dir = default_path;
+ srv_log_group_home_dir
+ = const_cast<char*>(fil_path_to_mysql_datadir);
}
- os_normalize_path(srv_log_group_home_dir);
-
if (strchr(srv_log_group_home_dir, ';')) {
sql_print_error("syntax error in innodb_log_group_home_dir");
DBUG_RETURN(HA_ERR_INITIALIZATION);
@@ -3826,18 +4018,6 @@ static int innodb_init_params()
srv_buf_pool_size = ulint(innobase_buffer_pool_size);
- switch (srv_checksum_algorithm) {
- case SRV_CHECKSUM_ALGORITHM_CRC32:
- case SRV_CHECKSUM_ALGORITHM_STRICT_CRC32:
- case SRV_CHECKSUM_ALGORITHM_FULL_CRC32:
- case SRV_CHECKSUM_ALGORITHM_STRICT_FULL_CRC32:
- break;
- default:
- ib::warn() << deprecated_innodb_checksum_algorithm;
- }
-
- row_rollback_on_timeout = (ibool) innobase_rollback_on_timeout;
-
if (innobase_open_files < 10) {
innobase_open_files = 300;
if (srv_file_per_table && tc_size > 300 && tc_size < open_files_limit) {
@@ -3880,12 +4060,12 @@ static int innodb_init_params()
Force O_DIRECT on Unixes (on Windows writes are always
unbuffered)
*/
- switch (innodb_flush_method) {
+ switch (srv_file_flush_method) {
case SRV_O_DIRECT:
case SRV_O_DIRECT_NO_FSYNC:
break;
default:
- innodb_flush_method = SRV_O_DIRECT;
+ srv_file_flush_method = SRV_O_DIRECT;
fprintf(stderr, "InnoDB: using O_DIRECT due to atomic writes.\n");
}
}
@@ -3896,30 +4076,35 @@ static int innodb_init_params()
srv_use_doublewrite_buf = FALSE;
}
-#ifdef LINUX_NATIVE_AIO
-#elif !defined _WIN32
+#if !defined LINUX_NATIVE_AIO && !defined HAVE_URING && !defined _WIN32
/* Currently native AIO is supported only on windows and linux
and that also when the support is compiled in. In all other
cases, we ignore the setting of innodb_use_native_aio. */
srv_use_native_aio = FALSE;
#endif
+#ifdef HAVE_URING
+ if (srv_use_native_aio && io_uring_may_be_unsafe) {
+ sql_print_warning("innodb_use_native_aio may cause "
+ "hangs with this kernel %s; see "
+ "https://jira.mariadb.org/browse/MDEV-26674",
+ io_uring_may_be_unsafe);
+ }
+#endif
#ifndef _WIN32
- ut_ad(innodb_flush_method <= SRV_O_DIRECT_NO_FSYNC);
+ ut_ad(srv_file_flush_method <= SRV_O_DIRECT_NO_FSYNC);
#else
- switch (innodb_flush_method) {
+ switch (srv_file_flush_method) {
case SRV_ALL_O_DIRECT_FSYNC + 1 /* "async_unbuffered"="unbuffered" */:
- innodb_flush_method = SRV_ALL_O_DIRECT_FSYNC;
+ srv_file_flush_method = SRV_ALL_O_DIRECT_FSYNC;
break;
case SRV_ALL_O_DIRECT_FSYNC + 2 /* "normal"="fsync" */:
- innodb_flush_method = SRV_FSYNC;
+ srv_file_flush_method = SRV_FSYNC;
break;
default:
- ut_ad(innodb_flush_method <= SRV_ALL_O_DIRECT_FSYNC);
+ ut_ad(srv_file_flush_method <= SRV_ALL_O_DIRECT_FSYNC);
}
#endif
- srv_file_flush_method = srv_flush_t(innodb_flush_method);
-
innodb_buffer_pool_size_init();
srv_lock_table_size = 5 * (srv_buf_pool_size >> srv_page_size_shift);
@@ -3958,7 +4143,7 @@ static int innodb_init(void* p)
innobase_hton->commit_checkpoint_request = innodb_log_flush_request;
innobase_hton->create = innobase_create_handler;
- innobase_hton->drop_database = innobase_drop_database;
+ innobase_hton->drop_database = innodb_drop_database;
innobase_hton->panic = innobase_end;
innobase_hton->pre_shutdown = innodb_preshutdown;
@@ -4133,7 +4318,6 @@ innobase_end(handlerton*, ha_panic_function)
innodb_shutdown();
-
mysql_mutex_destroy(&log_requests.mutex);
}
@@ -4150,9 +4334,8 @@ innobase_commit_low(
#ifdef WITH_WSREP
const char* tmp = 0;
const bool is_wsrep = trx->is_wsrep();
- THD* thd = trx->mysql_thd;
if (is_wsrep) {
- tmp = thd_proc_info(thd, "innobase_commit_low()");
+ tmp = thd_proc_info(trx->mysql_thd, "innobase_commit_low()");
}
#endif /* WITH_WSREP */
if (trx_is_started(trx)) {
@@ -4166,7 +4349,7 @@ innobase_commit_low(
#ifdef WITH_WSREP
if (is_wsrep) {
- thd_proc_info(thd, tmp);
+ thd_proc_info(trx->mysql_thd, tmp);
}
#endif /* WITH_WSREP */
}
@@ -4258,7 +4441,7 @@ innobase_commit_ordered_2(
/* If the transaction is not run in 2pc, we must assign wsrep
XID here in order to get it written in rollback segment. */
if (trx->is_wsrep()) {
- thd_get_xid(thd, (MYSQL_XID*)trx->xid);
+ thd_get_xid(thd, &reinterpret_cast<MYSQL_XID&>(trx->xid));
}
#endif /* WITH_WSREP */
@@ -4341,8 +4524,8 @@ innobase_commit(
trx_t* trx = check_trx_exists(thd);
- ut_ad(trx->dict_operation_lock_mode == 0);
- ut_ad(trx->dict_operation == TRX_DICT_OP_NONE);
+ ut_ad(!trx->dict_operation_lock_mode);
+ ut_ad(!trx->dict_operation);
/* Transaction is deregistered only in a commit or a rollback. If
it is deregistered we know there cannot be resources to be freed
@@ -4430,8 +4613,8 @@ innobase_rollback(
trx_t* trx = check_trx_exists(thd);
- ut_ad(trx->dict_operation_lock_mode == 0);
- ut_ad(trx->dict_operation == TRX_DICT_OP_NONE);
+ ut_ad(!trx->dict_operation_lock_mode);
+ ut_ad(!trx->dict_operation);
/* Reset the number AUTO-INC rows required */
@@ -4453,8 +4636,9 @@ innobase_rollback(
trx is being rolled back due to BF abort, clear XID in order
to avoid writing it to rollback segment out of order. The XID
will be reassigned when the transaction is replayed. */
- if (trx->state != TRX_STATE_NOT_STARTED && wsrep_is_wsrep_xid(trx->xid)) {
- trx->xid->null();
+ if (trx->state != TRX_STATE_NOT_STARTED
+ && wsrep_is_wsrep_xid(&trx->xid)) {
+ trx->xid.null();
}
#endif /* WITH_WSREP */
if (rollback_trx
@@ -4787,8 +4971,6 @@ static int innobase_close_connection(handlerton *hton, THD *thd)
return 0;
}
-void lock_cancel_waiting_and_release(lock_t *lock);
-
/** Cancel any pending lock request associated with the current THD.
@sa THD::awake() @sa ha_kill_query() */
static void innobase_kill_query(handlerton*, THD *thd, enum thd_kill_levels)
@@ -4798,23 +4980,15 @@ static void innobase_kill_query(handlerton*, THD *thd, enum thd_kill_levels)
if (trx_t* trx= thd_to_trx(thd))
{
ut_ad(trx->mysql_thd == thd);
+ if (!trx->lock.wait_lock);
#ifdef WITH_WSREP
- if (wsrep_thd_is_aborting(thd) || trx->lock.was_chosen_as_wsrep_victim)
+ else if (trx->is_wsrep() && wsrep_thd_is_aborting(thd))
/* if victim has been signaled by BF thread and/or aborting is already
progressing, following query aborting is not necessary any more.
- Also, BF thread should own trx mutex for the victim. */
- DBUG_VOID_RETURN;
+ Also, BF thread should own trx mutex for the victim. */;
#endif /* WITH_WSREP */
- lock_mutex_enter();
- if (lock_t *lock= trx->lock.wait_lock)
- {
- trx_mutex_enter(trx);
- if (trx->is_wsrep() && wsrep_thd_is_aborting(thd))
- trx->lock.was_chosen_as_deadlock_victim= TRUE;
- lock_cancel_waiting_and_release(lock);
- trx_mutex_exit(trx);
- }
- lock_mutex_exit();
+ else
+ lock_sys_t::cancel(trx);
}
DBUG_VOID_RETURN;
@@ -4872,17 +5046,6 @@ ha_innobase::table_flags() const
}
/****************************************************************//**
-Returns the table type (storage engine name).
-@return table type */
-
-const char*
-ha_innobase::table_type() const
-/*===========================*/
-{
- return(innobase_hton_name);
-}
-
-/****************************************************************//**
Returns the index type.
@return index type */
@@ -5001,33 +5164,26 @@ ha_innobase::keys_to_use_for_scanning()
return(&key_map_full);
}
-/****************************************************************//**
-Ensures that if there's a concurrent inplace ADD INDEX, being-indexed virtual
-columns are computed. They are not marked as indexed in the old table, so the
-server won't add them to the read_set automatically */
-void
-ha_innobase::column_bitmaps_signal()
-/*================================*/
+/** Ensure that indexed virtual columns will be computed. */
+void ha_innobase::column_bitmaps_signal()
{
- if (!table->vfield || table->current_lock != F_WRLCK) {
- return;
- }
-
- dict_index_t* clust_index = dict_table_get_first_index(m_prebuilt->table);
- uint num_v = 0;
- for (uint j = 0; j < table->s->virtual_fields; j++) {
- if (table->vfield[j]->stored_in_db()) {
- continue;
- }
+ if (!table->vfield || table->current_lock != F_WRLCK)
+ return;
- dict_col_t* col = &m_prebuilt->table->v_cols[num_v].m_col;
- if (col->ord_part ||
- (dict_index_is_online_ddl(clust_index) &&
- row_log_col_is_indexed(clust_index, num_v))) {
- table->mark_virtual_column_with_deps(table->vfield[j]);
- }
- num_v++;
- }
+ dict_index_t* clust_index= dict_table_get_first_index(m_prebuilt->table);
+ uint num_v= 0;
+ for (uint j = 0; j < table->s->virtual_fields; j++)
+ {
+ if (table->vfield[j]->stored_in_db())
+ continue;
+
+ dict_col_t *col= &m_prebuilt->table->v_cols[num_v].m_col;
+ if (col->ord_part ||
+ (dict_index_is_online_ddl(clust_index) &&
+ row_log_col_is_indexed(clust_index, num_v)))
+ table->mark_virtual_column_with_deps(table->vfield[j]);
+ num_v++;
+ }
}
@@ -5042,10 +5198,6 @@ ha_innobase::table_cache_type()
return(HA_CACHE_TBL_ASKTRANSACT);
}
-/****************************************************************//**
-Determines if the primary key is clustered index.
-@return true */
-
/** Normalizes a table name string.
A normalized name consists of the database name catenated to '/'
and table name. For example: test/mytable.
@@ -5060,7 +5212,7 @@ normalize_table_name_c_low(
char* norm_name, /* out: normalized name as a
null-terminated string */
const char* name, /* in: table name string */
- ibool set_lower_case) /* in: TRUE if we want to set
+ bool set_lower_case) /* in: TRUE if we want to set
name to lower case */
{
char* name_ptr;
@@ -5128,29 +5280,11 @@ create_table_info_t::create_table_info_t(
m_default_row_format(innodb_default_row_format),
m_create_info(create_info),
m_table_name(table_name), m_table(NULL),
- m_drop_before_rollback(false),
m_remote_path(remote_path),
m_innodb_file_per_table(file_per_table)
{
}
-/** Normalizes a table name string.
-A normalized name consists of the database name catenated to '/'
-and table name. For example: test/mytable.
-On Windows, normalization puts both the database name and the
-table name always to lower case if "set_lower_case" is set to TRUE.
-@param[out] norm_name Normalized name, null-terminated.
-@param[in] name Name to normalize.
-@param[in] set_lower_case True if we also should fold to lower case. */
-void
-create_table_info_t::normalize_table_name_low(
- char* norm_name,
- const char* name,
- ibool set_lower_case)
-{
- normalize_table_name_c_low(norm_name, name, set_lower_case);
-}
-
#if !defined(DBUG_OFF)
/*********************************************************************
Test normalize_table_name_low(). */
@@ -5205,7 +5339,7 @@ test_normalize_table_name_low()
" testing \"%s\", expected \"%s\"... ",
test_data[i][0], test_data[i][1]);
- create_table_info_t::normalize_table_name_low(
+ normalize_table_name_c_low(
norm_name, test_data[i][0], FALSE);
if (strcmp(norm_name, test_data[i][1]) == 0) {
@@ -5431,7 +5565,7 @@ is done when the table first opened.
@param[in,out] s_templ InnoDB template structure
@param[in] add_v new virtual columns added along with
add index call
-@param[in] locked true if dict_sys mutex is held */
+@param[in] locked true if dict_sys.latch is held */
void
innobase_build_v_templ(
const TABLE* table,
@@ -5454,12 +5588,18 @@ innobase_build_v_templ(
ut_ad(n_v_col > 0);
if (!locked) {
- mutex_enter(&dict_sys.mutex);
+ dict_sys.lock(SRW_LOCK_CALL);
}
+#if 0
+ /* This does not (need to) hold for ctx->new_table in
+ alter_rebuild_apply_log() */
+ ut_ad(dict_sys.locked());
+#endif
+
if (s_templ->vtempl) {
if (!locked) {
- mutex_exit(&dict_sys.mutex);
+ dict_sys.unlock();
}
DBUG_VOID_RETURN;
}
@@ -5565,7 +5705,7 @@ innobase_build_v_templ(
}
if (!locked) {
- mutex_exit(&dict_sys.mutex);
+ dict_sys.unlock();
}
s_templ->db_name = table->s->db.str;
@@ -5626,11 +5766,7 @@ func_exit:
/********************************************************************//**
Get the upper limit of the MySQL integral and floating-point type.
@return maximum allowed value for the field */
-UNIV_INTERN
-ulonglong
-innobase_get_int_col_max_value(
-/*===========================*/
- const Field* field) /*!< in: MySQL field */
+ulonglong innobase_get_int_col_max_value(const Field *field)
{
ulonglong max_value = 0;
@@ -5702,7 +5838,7 @@ initialize_auto_increment(dict_table_t* table, const Field* field)
const unsigned col_no = innodb_col_no(field);
- table->autoinc_mutex.lock();
+ table->autoinc_mutex.wr_lock();
table->persistent_autoinc = static_cast<uint16_t>(
dict_table_get_nth_col_pos(table, col_no, NULL) + 1)
@@ -5713,7 +5849,7 @@ initialize_auto_increment(dict_table_t* table, const Field* field)
table->persistent_autoinc without
autoinc_mutex protection, and there might be multiple
ha_innobase::open() executing concurrently. */
- } else if (srv_force_recovery > SRV_FORCE_NO_IBUF_MERGE) {
+ } else if (srv_force_recovery >= SRV_FORCE_NO_UNDO_LOG_SCAN) {
/* If the recovery level is set so high that writes
are disabled we force the AUTOINC counter to 0
value effectively disabling writes to the table.
@@ -5733,7 +5869,7 @@ initialize_auto_increment(dict_table_t* table, const Field* field)
innobase_get_int_col_max_value(field));
}
- table->autoinc_mutex.unlock();
+ table->autoinc_mutex.wr_unlock();
}
/** Open an InnoDB table
@@ -5792,7 +5928,7 @@ ha_innobase::open(const char* name, int, uint)
or force recovery can still use it, but not others. */
ib_table->file_unreadable = true;
ib_table->corrupted = true;
- dict_table_close(ib_table, FALSE, FALSE);
+ ib_table->release();
set_my_errno(ENOENT);
DBUG_RETURN(HA_ERR_CRASHED_ON_USAGE);
}
@@ -5834,7 +5970,7 @@ ha_innobase::open(const char* name, int, uint)
ret_err = HA_ERR_DECRYPTION_FAILED;
}
- dict_table_close(ib_table, FALSE, FALSE);
+ ib_table->release();
DBUG_RETURN(ret_err);
}
}
@@ -5852,7 +5988,7 @@ ha_innobase::open(const char* name, int, uint)
key_used_on_scan = m_primary_key;
if (ib_table->n_v_cols) {
- mutex_enter(&dict_sys.mutex);
+ dict_sys.lock(SRW_LOCK_CALL);
if (ib_table->vc_templ == NULL) {
ib_table->vc_templ = UT_NEW_NOKEY(dict_vcol_templ_t());
innobase_build_v_templ(
@@ -5860,7 +5996,7 @@ ha_innobase::open(const char* name, int, uint)
true);
}
- mutex_exit(&dict_sys.mutex);
+ dict_sys.unlock();
}
if (!check_index_consistency(table, ib_table)) {
@@ -6037,8 +6173,9 @@ ha_innobase::open_dict_table(
dict_err_ignore_t ignore_err)
{
DBUG_ENTER("ha_innobase::open_dict_table");
- dict_table_t* ib_table = dict_table_open_on_name(norm_name, FALSE,
- TRUE, ignore_err);
+ /* FIXME: try_drop_aborted */
+ dict_table_t* ib_table = dict_table_open_on_name(norm_name, false,
+ ignore_err);
if (NULL == ib_table && is_partition) {
/* MySQL partition engine hard codes the file name
@@ -6058,7 +6195,7 @@ ha_innobase::open_dict_table(
sensitive platform in Windows, we might need to
check the existence of table name without lower
case in the system table. */
- if (innobase_get_lower_case_table_names() == 1) {
+ if (lower_case_table_names == 1) {
char par_case_name[FN_REFLEN];
#ifndef _WIN32
@@ -6072,14 +6209,12 @@ ha_innobase::open_dict_table(
whether there exists table name in
system table whose name is
not being normalized to lower case */
- create_table_info_t::
- normalize_table_name_low(
- par_case_name,
- table_name, FALSE);
+ normalize_table_name_c_low(
+ par_case_name, table_name, false);
#endif
+ /* FIXME: try_drop_aborted */
ib_table = dict_table_open_on_name(
- par_case_name, FALSE, TRUE,
- ignore_err);
+ par_case_name, false, ignore_err);
}
if (ib_table != NULL) {
@@ -6150,7 +6285,7 @@ ha_innobase::close()
{
DBUG_ENTER("ha_innobase::close");
- row_prebuilt_free(m_prebuilt, FALSE);
+ row_prebuilt_free(m_prebuilt);
if (m_upd_buf != NULL) {
ut_ad(m_upd_buf_size != 0);
@@ -6159,18 +6294,14 @@ ha_innobase::close()
m_upd_buf_size = 0;
}
- MONITOR_INC(MONITOR_TABLE_CLOSE);
-
DBUG_RETURN(0);
}
/* The following accessor functions should really be inside MySQL code! */
#ifdef WITH_WSREP
-UNIV_INTERN
ulint
wsrep_innobase_mysql_sort(
-/*======================*/
/* out: str contains sort string */
int mysql_type, /* in: MySQL type */
uint charset_number, /* in: number of the charset */
@@ -7205,7 +7336,7 @@ ha_innobase::build_template(
/* We must at least fetch all primary key cols. Note
that if the clustered index was internally generated
by InnoDB on the row id (no primary key was
- defined), then row_search_for_mysql() will always
+ defined), then row_search_mvcc() will always
retrieve the row id to a special buffer in the
m_prebuilt struct. */
@@ -7548,7 +7679,7 @@ ha_innobase::innobase_lock_autoinc(void)
switch (innobase_autoinc_lock_mode) {
case AUTOINC_NO_LOCKING:
/* Acquire only the AUTOINC mutex. */
- m_prebuilt->table->autoinc_mutex.lock();
+ m_prebuilt->table->autoinc_mutex.wr_lock();
break;
case AUTOINC_NEW_STYLE_LOCKING:
@@ -7562,14 +7693,14 @@ ha_innobase::innobase_lock_autoinc(void)
case SQLCOM_REPLACE:
case SQLCOM_END: // RBR event
/* Acquire the AUTOINC mutex. */
- m_prebuilt->table->autoinc_mutex.lock();
+ m_prebuilt->table->autoinc_mutex.wr_lock();
/* We need to check that another transaction isn't
already holding the AUTOINC lock on the table. */
if (!m_prebuilt->table->n_waiting_or_granted_auto_inc_locks) {
/* Do not fall back to old style locking. */
DBUG_RETURN(error);
}
- m_prebuilt->table->autoinc_mutex.unlock();
+ m_prebuilt->table->autoinc_mutex.wr_unlock();
}
/* Use old style locking. */
/* fall through */
@@ -7581,7 +7712,7 @@ ha_innobase::innobase_lock_autoinc(void)
if (error == DB_SUCCESS) {
/* Acquire the AUTOINC mutex. */
- m_prebuilt->table->autoinc_mutex.lock();
+ m_prebuilt->table->autoinc_mutex.wr_lock();
}
break;
@@ -7609,12 +7740,33 @@ ha_innobase::innobase_set_max_autoinc(
if (error == DB_SUCCESS) {
dict_table_autoinc_update_if_greater(m_prebuilt->table, auto_inc);
- m_prebuilt->table->autoinc_mutex.unlock();
+ m_prebuilt->table->autoinc_mutex.wr_unlock();
}
return(error);
}
+/** @return whether the table is read-only */
+bool ha_innobase::is_read_only(bool altering_to_supported) const
+{
+ ut_ad(m_prebuilt->trx == thd_to_trx(m_user_thd));
+
+ if (high_level_read_only)
+ {
+ ib_senderrf(m_user_thd, IB_LOG_LEVEL_WARN, ER_READ_ONLY_MODE);
+ return true;
+ }
+
+ if (altering_to_supported)
+ return false;
+
+ if (!DICT_TF_GET_ZIP_SSIZE(m_prebuilt->table->flags) ||
+ !innodb_read_only_compressed)
+ return false;
+
+ return true;
+}
+
/********************************************************************//**
Stores a row in an InnoDB database, to the table specified in this
handle.
@@ -7637,13 +7789,10 @@ ha_innobase::write_row(
trx_t* trx = thd_to_trx(m_user_thd);
/* Validation checks before we commence write_row operation. */
- if (high_level_read_only) {
- ib_senderrf(ha_thd(), IB_LOG_LEVEL_WARN, ER_READ_ONLY_MODE);
+ if (is_read_only()) {
DBUG_RETURN(HA_ERR_TABLE_READONLY);
}
- ut_a(m_prebuilt->trx == trx);
-
if (!trx_is_started(trx)) {
trx->will_lock = true;
}
@@ -7828,6 +7977,7 @@ report_error:
#ifdef WITH_WSREP
if (!error_result && trx->is_wsrep()
+ && !trx->is_bulk_insert()
&& wsrep_thd_is_local(m_user_thd)
&& !wsrep_thd_ignore_table(m_user_thd)
&& !wsrep_consistency_check(m_user_thd)
@@ -8030,34 +8180,13 @@ calc_row_difference(
}
}
-#ifdef UNIV_DEBUG
- bool online_ord_part = false;
-#endif
-
if (is_virtual) {
/* If the virtual column is not indexed,
we shall ignore it for update */
if (!col->ord_part) {
- /* Check whether there is a table-rebuilding
- online ALTER TABLE in progress, and this
- virtual column could be newly indexed, thus
- it will be materialized. Then we will have
- to log its update.
- Note, we do not support online dropping virtual
- column while adding new index, nor with
- online alter column order while adding index,
- so the virtual column sequence must not change
- if it is online operation */
- if (dict_index_is_online_ddl(clust_index)
- && row_log_col_is_indexed(clust_index,
- num_v)) {
-#ifdef UNIV_DEBUG
- online_ord_part = true;
-#endif
- } else {
- num_v++;
- continue;
- }
+ next:
+ num_v++;
+ continue;
}
if (!uvect->old_vrow) {
@@ -8083,8 +8212,7 @@ calc_row_difference(
prebuilt, vfield, o_len,
col, old_mysql_row_col,
col_pack_len, buf);
- num_v++;
- continue;
+ goto next;
}
}
@@ -8133,7 +8261,7 @@ calc_row_difference(
upd_fld_set_virtual_col(ufield);
ufield->field_no = num_v;
- ut_ad(col->ord_part || online_ord_part);
+ ut_ad(col->ord_part);
ufield->old_v_val = static_cast<dfield_t*>(
mem_heap_alloc(
uvect->heap,
@@ -8218,7 +8346,7 @@ calc_row_difference(
prebuilt, vfield, o_len,
col, old_mysql_row_col,
col_pack_len, buf);
- ut_ad(col->ord_part || online_ord_part);
+ ut_ad(col->ord_part);
num_v++;
}
}
@@ -8362,6 +8490,40 @@ wsrep_calc_row_hash(
return(0);
}
+
+/** Append table-level exclusive key.
+@param thd MySQL thread handle
+@param table table
+@retval false on success
+@retval true on failure */
+ATTRIBUTE_COLD bool wsrep_append_table_key(MYSQL_THD thd, const dict_table_t &table)
+{
+ return true;
+#if 0
+ char db_buf[NAME_LEN + 1];
+ char tbl_buf[NAME_LEN + 1];
+ ulint db_buf_len, tbl_buf_len;
+
+ if (!table.parse_name(db_buf, tbl_buf, &db_buf_len, &tbl_buf_len))
+ {
+ WSREP_ERROR("Parse_name for table key append failed: %s",
+ wsrep_thd_query(thd));
+ return true;
+ }
+
+ /* Append table-level exclusive key */
+ const int rcode = wsrep_thd_append_table_key(thd, db_buf,
+ tbl_buf, WSREP_SERVICE_KEY_EXCLUSIVE);
+ if (rcode)
+ {
+ WSREP_ERROR("Appending table key failed: %s, %d",
+ wsrep_thd_query(thd), rcode);
+ return true;
+ }
+
+ return false;
+#endif
+}
#endif /* WITH_WSREP */
/**
@@ -8387,10 +8549,7 @@ ha_innobase::update_row(
DBUG_ENTER("ha_innobase::update_row");
- ut_a(m_prebuilt->trx == trx);
-
- if (high_level_read_only) {
- ib_senderrf(ha_thd(), IB_LOG_LEVEL_WARN, ER_READ_ONLY_MODE);
+ if (is_read_only()) {
DBUG_RETURN(HA_ERR_TABLE_READONLY);
} else if (!trx_is_started(trx)) {
trx->will_lock = true;
@@ -8535,11 +8694,16 @@ func_exit:
&& !wsrep_thd_ignore_table(m_user_thd)) {
DBUG_PRINT("wsrep", ("update row key"));
- if (wsrep_append_keys(m_user_thd,
- wsrep_protocol_version >= 4
- ? WSREP_SERVICE_KEY_UPDATE
- : WSREP_SERVICE_KEY_EXCLUSIVE,
- old_row, new_row)){
+ /* We use table-level exclusive key for SEQUENCES
+ and normal key append for others. */
+ if (table->s->table_type == TABLE_TYPE_SEQUENCE) {
+ if (wsrep_append_table_key(m_user_thd, *m_prebuilt->table))
+ DBUG_RETURN(HA_ERR_INTERNAL_ERROR);
+ } else if (wsrep_append_keys(m_user_thd,
+ wsrep_protocol_version >= 4
+ ? WSREP_SERVICE_KEY_UPDATE
+ : WSREP_SERVICE_KEY_EXCLUSIVE,
+ old_row, new_row)) {
WSREP_DEBUG("WSREP: UPDATE_ROW_KEY FAILED");
DBUG_PRINT("wsrep", ("row key failed"));
DBUG_RETURN(HA_ERR_INTERNAL_ERROR);
@@ -8564,10 +8728,7 @@ ha_innobase::delete_row(
DBUG_ENTER("ha_innobase::delete_row");
- ut_a(m_prebuilt->trx == trx);
-
- if (high_level_read_only) {
- ib_senderrf(ha_thd(), IB_LOG_LEVEL_WARN, ER_READ_ONLY_MODE);
+ if (is_read_only()) {
DBUG_RETURN(HA_ERR_TABLE_READONLY);
} else if (!trx_is_started(trx)) {
trx->will_lock = true;
@@ -8758,7 +8919,7 @@ statement issued by the user. We also increment trx->n_mysql_tables_in_use.
instructions to m_prebuilt->template of the table handle instance in
::index_read. The template is used to save CPU time in large joins.
- 3) In row_search_for_mysql, if m_prebuilt->sql_stat_start is true, we
+ 3) In row_search_mvcc(), if m_prebuilt->sql_stat_start is true, we
allocate a new consistent read view for the trx if it does not yet have one,
or in the case of a locking read, set an InnoDB 'intention' table level
lock on the table.
@@ -9060,7 +9221,7 @@ ha_innobase::change_active_index(
}
/* The caller seems to ignore this. Thus, we must check
- this again in row_search_for_mysql(). */
+ this again in row_search_mvcc(). */
DBUG_RETURN(convert_error_code_to_mysql(DB_MISSING_HISTORY,
0, NULL));
}
@@ -9447,12 +9608,10 @@ ha_innobase::ft_init_ext(
}
}
- /* FIXME: utf32 and utf16 are not compatible with some
- string function used. So to convert them to uft8 before
- we proceed. */
- if (strcmp(char_set->csname, "utf32") == 0
- || strcmp(char_set->csname, "utf16") == 0) {
-
+	/* Multi-byte character sets like utf32 and utf16 are not
+	compatible with some of the string functions used, so convert
+	them to utf8 before we proceed. */
+ if (char_set->mbminlen != 1) {
buf_tmp_used = innobase_convert_string(
buf_tmp, sizeof(buf_tmp) - 1,
&my_charset_utf8mb3_general_ci,
@@ -9680,7 +9839,7 @@ next_record:
tuple, index, &search_doc_id);
if (ret == DB_SUCCESS) {
- ret = row_search_for_mysql(
+ ret = row_search_mvcc(
buf, PAGE_CUR_GE, m_prebuilt,
ROW_SEL_EXACT, 0);
}
@@ -9807,12 +9966,15 @@ wsrep_append_foreign_key(
foreign->referenced_table : foreign->foreign_table)) {
WSREP_DEBUG("pulling %s table into cache",
(referenced) ? "referenced" : "foreign");
- mutex_enter(&dict_sys.mutex);
+ dict_sys.lock(SRW_LOCK_CALL);
if (referenced) {
foreign->referenced_table =
- dict_table_get_low(
- foreign->referenced_table_name_lookup);
+ dict_sys.load_table(
+ {foreign->referenced_table_name_lookup,
+ strlen(foreign->
+ referenced_table_name_lookup)
+ });
if (foreign->referenced_table) {
foreign->referenced_index =
dict_foreign_find_index(
@@ -9824,8 +9986,10 @@ wsrep_append_foreign_key(
}
} else {
foreign->foreign_table =
- dict_table_get_low(
- foreign->foreign_table_name_lookup);
+ dict_sys.load_table(
+ {foreign->foreign_table_name_lookup,
+ strlen(foreign->
+ foreign_table_name_lookup)});
if (foreign->foreign_table) {
foreign->foreign_index =
@@ -9837,7 +10001,7 @@ wsrep_append_foreign_key(
TRUE, FALSE);
}
}
- mutex_exit(&dict_sys.mutex);
+ dict_sys.unlock();
}
if ( !((referenced) ?
@@ -9956,6 +10120,8 @@ wsrep_append_key(
(shared, exclusive, semi...) */
)
{
+ ut_ad(!trx->is_bulk_insert());
+
DBUG_ENTER("wsrep_append_key");
DBUG_PRINT("enter",
("thd: %lu trx: %lld", thd_get_thread_id(thd),
@@ -10446,7 +10612,6 @@ create_table_info_t::create_table_def()
DBUG_PRINT("enter", ("table_name: %s", m_table_name));
DBUG_ASSERT(m_trx->mysql_thd == m_thd);
- DBUG_ASSERT(!m_drop_before_rollback);
/* MySQL does the name length check. But we do additional check
on the name length here */
@@ -10496,8 +10661,8 @@ create_table_info_t::create_table_def()
const ulint actual_n_cols = n_cols
+ (m_flags2 & DICT_TF2_FTS && !has_doc_id_col);
- table = dict_mem_table_create(m_table_name, NULL,
- actual_n_cols, num_v, m_flags, m_flags2);
+ table = dict_table_t::create({m_table_name,table_name_len}, nullptr,
+ actual_n_cols, num_v, m_flags, m_flags2);
/* Set the hidden doc_id column. */
if (m_flags2 & DICT_TF2_FTS) {
@@ -10553,7 +10718,6 @@ create_table_info_t::create_table_def()
table->name.m_name, field->field_name.str);
err_col:
dict_mem_table_free(table);
- ut_ad(trx_state_eq(m_trx, TRX_STATE_NOT_STARTED));
DBUG_RETURN(HA_ERR_GENERIC);
}
@@ -10714,29 +10878,21 @@ err_col:
"temporary table creation.");
}
- m_trx->table_id = table->id
- = dict_sys.get_temporary_table_id();
+ table->id = dict_sys.acquire_temporary_table_id();
ut_ad(dict_tf_get_rec_format(table->flags)
!= REC_FORMAT_COMPRESSED);
table->space_id = SRV_TMP_SPACE_ID;
table->space = fil_system.temp_space;
table->add_to_cache();
} else {
- if (err == DB_SUCCESS) {
- err = row_create_table_for_mysql(
- table, m_trx,
- fil_encryption_t(options->encryption),
- uint32_t(options->encryption_key_id));
- m_drop_before_rollback = (err == DB_SUCCESS);
- }
+ ut_ad(dict_sys.sys_tables_exist());
+
+ err = row_create_table_for_mysql(table, m_trx);
DBUG_EXECUTE_IF("ib_crash_during_create_for_encryption",
DBUG_SUICIDE(););
}
- DBUG_EXECUTE_IF("ib_create_err_tablespace_exist",
- err = DB_TABLESPACE_EXISTS;);
-
switch (err) {
case DB_SUCCESS:
ut_ad(table);
@@ -10745,7 +10901,6 @@ err_col:
default:
break;
case DB_DUPLICATE_KEY:
- case DB_TABLESPACE_EXISTS:
char display_name[FN_REFLEN];
char* buf_end = innobase_convert_identifier(
display_name, sizeof(display_name) - 1,
@@ -10754,9 +10909,7 @@ err_col:
*buf_end = '\0';
- my_error(err == DB_DUPLICATE_KEY
- ? ER_TABLE_EXISTS_ERROR
- : ER_TABLESPACE_EXISTS, MYF(0), display_name);
+ my_error(ER_TABLE_EXISTS_ERROR, MYF(0), display_name);
}
DBUG_RETURN(convert_error_code_to_mysql(err, m_flags, m_thd));
@@ -10785,6 +10938,7 @@ create_index(
/* Assert that "GEN_CLUST_INDEX" cannot be used as non-primary index */
ut_a(innobase_strcasecmp(key->name.str, innobase_index_reserve_name) != 0);
+ const ha_table_option_struct& o = *form->s->option_struct;
if (key->flags & (HA_SPATIAL | HA_FULLTEXT)) {
/* Only one of these can be specified at a time. */
@@ -10811,7 +10965,9 @@ create_index(
DBUG_RETURN(convert_error_code_to_mysql(
row_create_index_for_mysql(
- index, trx, NULL),
+ index, trx, NULL,
+ fil_encryption_t(o.encryption),
+ uint32_t(o.encryption_key_id)),
table->flags, NULL));
}
@@ -10909,7 +11065,9 @@ create_index(
ulint flags = table->flags;
error = convert_error_code_to_mysql(
- row_create_index_for_mysql(index, trx, field_lengths),
+ row_create_index_for_mysql(index, trx, field_lengths,
+ fil_encryption_t(o.encryption),
+ uint32_t(o.encryption_key_id)),
flags, NULL);
my_free(field_lengths);
@@ -11006,9 +11164,7 @@ create_table_info_t::create_options_are_invalid()
/* Check if a non-zero KEY_BLOCK_SIZE was specified. */
if (has_key_block_size) {
- if (is_temp) {
- my_error(ER_UNSUPPORT_COMPRESSED_TEMPORARY_TABLE,
- MYF(0));
+ if (is_temp || innodb_read_only_compressed) {
return("KEY_BLOCK_SIZE");
}
@@ -11063,9 +11219,7 @@ create_table_info_t::create_options_are_invalid()
other incompatibilities. */
switch (row_format) {
case ROW_TYPE_COMPRESSED:
- if (is_temp) {
- my_error(ER_UNSUPPORT_COMPRESSED_TEMPORARY_TABLE,
- MYF(0));
+ if (is_temp || innodb_read_only_compressed) {
return("ROW_FORMAT");
}
if (!m_allow_file_per_table) {
@@ -11305,8 +11459,7 @@ ha_innobase::update_create_info(
return;
}
- /* Update the DATA DIRECTORY name from SYS_DATAFILES. */
- dict_get_and_save_data_dir_path(m_prebuilt->table, false);
+ dict_get_and_save_data_dir_path(m_prebuilt->table);
if (m_prebuilt->table->data_dir_path) {
create_info->data_file_name = m_prebuilt->table->data_dir_path;
@@ -11323,6 +11476,8 @@ innobase_fts_load_stopword(
trx_t* trx, /*!< in: transaction */
THD* thd) /*!< in: current thread */
{
+ ut_ad(dict_sys.locked());
+
const char *stopword_table= THDVAR(thd, ft_user_stopword_table);
if (!stopword_table)
{
@@ -11332,8 +11487,11 @@ innobase_fts_load_stopword(
mysql_mutex_unlock(&LOCK_global_system_variables);
}
- return fts_load_stopword(table, trx, stopword_table,
- THDVAR(thd, ft_enable_stopword), false);
+ table->fts->dict_locked= true;
+ bool success= fts_load_stopword(table, trx, stopword_table,
+ THDVAR(thd, ft_enable_stopword), false);
+ table->fts->dict_locked= false;
+ return success;
}
/** Parse the table name into normal name and remote path if needed.
@@ -11614,29 +11772,33 @@ index_bad:
zip_ssize = 0;
}
+ ulint level = 0;
+
if (is_temp) {
m_flags2 |= DICT_TF2_TEMPORARY;
- } else if (m_use_file_per_table) {
- m_flags2 |= DICT_TF2_USE_FILE_PER_TABLE;
- }
+ } else {
+ if (m_use_file_per_table) {
+ m_flags2 |= DICT_TF2_USE_FILE_PER_TABLE;
+ }
- ulint level = ulint(options->page_compression_level);
- if (!level) {
- level = page_zip_level;
- if (!level && options->page_compressed) {
- push_warning_printf(
- m_thd, Sql_condition::WARN_LEVEL_WARN,
- ER_ILLEGAL_HA_CREATE_OPTION,
- "InnoDB: PAGE_COMPRESSED requires"
- " PAGE_COMPRESSION_LEVEL or"
- " innodb_compression_level > 0");
- DBUG_RETURN(false);
+ level = ulint(options->page_compression_level);
+ if (!level) {
+ level = page_zip_level;
+ if (!level && options->page_compressed) {
+ push_warning_printf(
+ m_thd, Sql_condition::WARN_LEVEL_WARN,
+ ER_ILLEGAL_HA_CREATE_OPTION,
+ "InnoDB: PAGE_COMPRESSED requires"
+ " PAGE_COMPRESSION_LEVEL or"
+ " innodb_compression_level > 0");
+ DBUG_RETURN(false);
+ }
}
}
/* Set the table flags */
dict_tf_set(&m_flags, innodb_row_format, zip_ssize,
- m_use_data_dir, options->page_compressed, level);
+ m_use_data_dir, level && options->page_compressed, level);
if (m_form->s->table_type == TABLE_TYPE_SEQUENCE) {
m_flags |= DICT_TF_MASK_NO_ROLLBACK;
@@ -11788,10 +11950,10 @@ innobase_parse_hint_from_comment(
/* x-lock index is needed to exclude concurrent
pessimistic tree operations */
- rw_lock_x_lock(dict_index_get_lock(index));
+ index->lock.x_lock(SRW_LOCK_CALL);
index->merge_threshold = merge_threshold_table
& ((1U << 6) - 1);
- rw_lock_x_unlock(dict_index_get_lock(index));
+ index->lock.x_unlock();
continue;
}
@@ -11808,11 +11970,11 @@ innobase_parse_hint_from_comment(
/* x-lock index is needed to exclude concurrent
pessimistic tree operations */
- rw_lock_x_lock(dict_index_get_lock(index));
+ index->lock.x_lock(SRW_LOCK_CALL);
index->merge_threshold
= merge_threshold_index[i]
& ((1U << 6) - 1);
- rw_lock_x_unlock(dict_index_get_lock(index));
+ index->lock.x_unlock();
is_found[i] = true;
break;
@@ -12180,7 +12342,7 @@ create_table_info_t::create_foreign_keys()
ut_ad(alter_info);
List_iterator_fast<Key> key_it(alter_info->key_list);
- dict_table_t* table = dict_table_get_low(name);
+ dict_table_t* table = dict_sys.find_table({name,strlen(name)});
if (!table) {
ib_foreign_warn(m_trx, DB_CANNOT_ADD_CONSTRAINT, create_name,
"%s table %s foreign key constraint"
@@ -12524,7 +12686,7 @@ create_table_info_t::create_foreign_keys()
trx_start_if_not_started_xa(m_trx, true);
- trx_set_dict_operation(m_trx, TRX_DICT_OP_TABLE);
+ m_trx->dict_operation = true;
error = dict_create_add_foreigns_to_dictionary(local_fk_set, table,
m_trx);
@@ -12566,9 +12728,6 @@ int create_table_info_t::create_table(bool create_fk)
DBUG_RETURN(error);
}
- DBUG_ASSERT(m_drop_before_rollback
- == !(m_flags2 & DICT_TF2_TEMPORARY));
-
/* Create the keys */
if (m_form->s->keys == 0 || primary_key_no == -1) {
@@ -12579,8 +12738,12 @@ int create_table_info_t::create_table(bool create_fk)
dict_index_t* index = dict_mem_index_create(
m_table, innobase_index_reserve_name,
DICT_CLUSTERED, 0);
+ const ha_table_option_struct& o = *m_form->s->option_struct;
error = convert_error_code_to_mysql(
- row_create_index_for_mysql(index, m_trx, NULL),
+ row_create_index_for_mysql(
+ index, m_trx, NULL,
+ fil_encryption_t(o.encryption),
+ uint32_t(o.encryption_key_id)),
flags, m_thd);
if (error) {
DBUG_RETURN(error);
@@ -12665,15 +12828,18 @@ int create_table_info_t::create_table(bool create_fk)
dberr_t err = create_fk ? create_foreign_keys() : DB_SUCCESS;
if (err == DB_SUCCESS) {
+ const dict_err_ignore_t ignore_err = m_trx->check_foreigns
+ ? DICT_ERR_IGNORE_NONE : DICT_ERR_IGNORE_FK_NOKEY;
+
/* Check that also referencing constraints are ok */
dict_names_t fk_tables;
- err = dict_load_foreigns(m_table_name, NULL,
- false, true,
- DICT_ERR_IGNORE_NONE,
- fk_tables);
+ err = dict_load_foreigns(m_table_name, nullptr,
+ m_trx->id, true,
+ ignore_err, fk_tables);
while (err == DB_SUCCESS && !fk_tables.empty()) {
- dict_load_table(fk_tables.front(),
- DICT_ERR_IGNORE_NONE);
+ dict_sys.load_table(
+ {fk_tables.front(), strlen(fk_tables.front())},
+ ignore_err);
fk_tables.pop_front();
}
}
@@ -12954,96 +13120,59 @@ bool create_table_info_t::row_size_is_acceptable(
return true;
}
-/** Update a new table in an InnoDB database.
-@return error number */
-int
-create_table_info_t::create_table_update_dict()
+void create_table_info_t::create_table_update_dict(dict_table_t *table,
+ THD *thd,
+ const HA_CREATE_INFO &info,
+ const TABLE &t)
{
- dict_table_t* innobase_table;
-
- DBUG_ENTER("create_table_update_dict");
-
- innobase_table = dict_table_open_on_name(
- m_table_name, FALSE, FALSE, DICT_ERR_IGNORE_NONE);
-
- DBUG_ASSERT(innobase_table != 0);
- if (innobase_table->fts != NULL) {
- if (innobase_table->fts_doc_id_index == NULL) {
- innobase_table->fts_doc_id_index
- = dict_table_get_index_on_name(
- innobase_table, FTS_DOC_ID_INDEX_NAME);
- DBUG_ASSERT(innobase_table->fts_doc_id_index != NULL);
- } else {
- DBUG_ASSERT(innobase_table->fts_doc_id_index
- == dict_table_get_index_on_name(
- innobase_table,
- FTS_DOC_ID_INDEX_NAME));
- }
- }
-
- DBUG_ASSERT((innobase_table->fts == NULL)
- == (innobase_table->fts_doc_id_index == NULL));
-
- innobase_copy_frm_flags_from_create_info(innobase_table, m_create_info);
-
- dict_stats_update(innobase_table, DICT_STATS_EMPTY_TABLE);
+ ut_ad(dict_sys.locked());
- /* Load server stopword into FTS cache */
- if (m_flags2 & DICT_TF2_FTS) {
- if (!innobase_fts_load_stopword(innobase_table, NULL, m_thd)) {
- dict_table_close(innobase_table, FALSE, FALSE);
- DBUG_RETURN(-1);
- }
+ DBUG_ASSERT(table->get_ref_count());
+ if (table->fts)
+ {
+ if (!table->fts_doc_id_index)
+ table->fts_doc_id_index=
+ dict_table_get_index_on_name(table, FTS_DOC_ID_INDEX_NAME);
+ else
+ DBUG_ASSERT(table->fts_doc_id_index ==
+ dict_table_get_index_on_name(table, FTS_DOC_ID_INDEX_NAME));
+ }
- mutex_enter(&dict_sys.mutex);
- fts_optimize_add_table(innobase_table);
- mutex_exit(&dict_sys.mutex);
- }
+ DBUG_ASSERT(!table->fts == !table->fts_doc_id_index);
- if (const Field* ai = m_form->found_next_number_field) {
- ut_ad(ai->stored_in_db());
+ innobase_copy_frm_flags_from_create_info(table, &info);
- ib_uint64_t autoinc = m_create_info->auto_increment_value;
+ /* Load server stopword into FTS cache */
+ if (table->flags2 & DICT_TF2_FTS &&
+ innobase_fts_load_stopword(table, nullptr, thd))
+ fts_optimize_add_table(table);
- if (autoinc == 0) {
- autoinc = 1;
- }
+ if (const Field *ai = t.found_next_number_field)
+ {
+ ut_ad(ai->stored_in_db());
+ ib_uint64_t autoinc= info.auto_increment_value;
+ if (autoinc == 0)
+ autoinc= 1;
- innobase_table->autoinc_mutex.lock();
- dict_table_autoinc_initialize(innobase_table, autoinc);
+ table->autoinc_mutex.wr_lock();
+ dict_table_autoinc_initialize(table, autoinc);
- if (innobase_table->is_temporary()) {
- /* AUTO_INCREMENT is not persistent for
- TEMPORARY TABLE. Temporary tables are never
- evicted. Keep the counter in memory only. */
- } else {
- const unsigned col_no = innodb_col_no(ai);
-
- innobase_table->persistent_autoinc
- = static_cast<uint16_t>(
- dict_table_get_nth_col_pos(
- innobase_table, col_no, NULL)
- + 1)
- & dict_index_t::MAX_N_FIELDS;
-
- /* Persist the "last used" value, which
- typically is AUTO_INCREMENT - 1.
- In btr_create(), the value 0 was already written. */
- if (--autoinc) {
- btr_write_autoinc(
- dict_table_get_first_index(
- innobase_table),
- autoinc);
- }
- }
-
- innobase_table->autoinc_mutex.unlock();
- }
+ if (!table->is_temporary())
+ {
+ const unsigned col_no= innodb_col_no(ai);
+ table->persistent_autoinc= static_cast<uint16_t>
+ (dict_table_get_nth_col_pos(table, col_no, nullptr) + 1) &
+ dict_index_t::MAX_N_FIELDS;
+ /* Persist the "last used" value, which typically is AUTO_INCREMENT - 1.
+ In btr_create(), the value 0 was already written. */
+ if (--autoinc)
+ btr_write_autoinc(dict_table_get_first_index(table), autoinc);
+ }
- innobase_parse_hint_from_comment(m_thd, innobase_table, m_form->s);
+ table->autoinc_mutex.wr_unlock();
+ }
- dict_table_close(innobase_table, FALSE, FALSE);
- DBUG_RETURN(0);
+ innobase_parse_hint_from_comment(thd, table, t.s);
}
/** Allocate a new trx. */
@@ -13051,9 +13180,7 @@ void
create_table_info_t::allocate_trx()
{
m_trx = innobase_trx_allocate(m_thd);
-
m_trx->will_lock = true;
- m_trx->ddl = true;
}
/** Create a new table to an InnoDB database.
@@ -13062,86 +13189,80 @@ create_table_info_t::allocate_trx()
@param[in] create_info Create info (including create statement string).
@param[in] file_per_table whether to create .ibd file
@param[in,out] trx dictionary transaction, or NULL to create new
-@return 0 if success else error number. */
-inline int
-ha_innobase::create(
- const char* name,
- TABLE* form,
- HA_CREATE_INFO* create_info,
- bool file_per_table,
- trx_t* trx)
+@return error code
+@retval 0 on success */
+int
+ha_innobase::create(const char *name, TABLE *form, HA_CREATE_INFO *create_info,
+ bool file_per_table, trx_t *trx= nullptr)
{
- int error;
- char norm_name[FN_REFLEN]; /* {database}/{tablename} */
- char remote_path[FN_REFLEN]; /* Absolute path of table */
-
- DBUG_ENTER("ha_innobase::create");
-
- DBUG_ASSERT(form->s == table_share);
- DBUG_ASSERT(table_share->table_type == TABLE_TYPE_SEQUENCE
- || table_share->table_type == TABLE_TYPE_NORMAL);
+ char norm_name[FN_REFLEN]; /* {database}/{tablename} */
+ char remote_path[FN_REFLEN]; /* Absolute path of table */
- create_table_info_t info(ha_thd(),
- form,
- create_info,
- norm_name,
- remote_path,
- file_per_table, trx);
+ DBUG_ENTER("ha_innobase::create");
+ DBUG_ASSERT(form->s == table_share);
+ DBUG_ASSERT(table_share->table_type == TABLE_TYPE_SEQUENCE ||
+ table_share->table_type == TABLE_TYPE_NORMAL);
- if ((error = info.initialize())
- || (error = info.prepare_create_table(name, !trx))) {
- if (trx) {
- trx_rollback_for_mysql(trx);
- row_mysql_unlock_data_dictionary(trx);
- }
- DBUG_RETURN(error);
- }
+ create_table_info_t info(ha_thd(), form, create_info, norm_name,
+ remote_path, file_per_table, trx);
- const bool own_trx = !trx;
-
- if (own_trx) {
- info.allocate_trx();
- trx = info.trx();
- /* Latch the InnoDB data dictionary exclusively so that no deadlocks
- or lock waits can happen in it during a table create operation.
- Drop table etc. do this latching in row0mysql.cc. */
- row_mysql_lock_data_dictionary(trx);
- DBUG_ASSERT(trx_state_eq(trx, TRX_STATE_NOT_STARTED));
- }
+ int error= info.initialize();
+ if (!error)
+ error= info.prepare_create_table(name, !trx);
+ if (error)
+ DBUG_RETURN(error);
- if ((error = info.create_table(own_trx))) {
- /* Drop the being-created table before rollback,
- so that rollback can possibly rename back a table
- that could have been renamed before the failed creation. */
- if (info.drop_before_rollback()) {
- trx->error_state = DB_SUCCESS;
- row_drop_table_for_mysql(info.table_name(),
- trx, SQLCOM_TRUNCATE, true,
- false);
- }
- trx_rollback_for_mysql(trx);
- row_mysql_unlock_data_dictionary(trx);
- goto func_exit;
- }
+ const bool own_trx= !trx;
+ if (own_trx)
+ {
+ info.allocate_trx();
+ trx= info.trx();
+ DBUG_ASSERT(trx_state_eq(trx, TRX_STATE_NOT_STARTED));
- innobase_commit_low(trx);
- row_mysql_unlock_data_dictionary(trx);
+ if (!(info.flags2() & DICT_TF2_TEMPORARY))
+ {
+ trx_start_for_ddl(trx);
+ if (dberr_t err= lock_sys_tables(trx))
+ error= convert_error_code_to_mysql(err, 0, nullptr);
+ }
+ row_mysql_lock_data_dictionary(trx);
+ }
- /* Flush the log to reduce probability that the .frm files and
- the InnoDB data dictionary get out-of-sync if the user runs
- with innodb_flush_log_at_trx_commit = 0 */
- log_buffer_flush_to_disk();
+ if (!error)
+ error= info.create_table(own_trx);
- ut_ad(!srv_read_only_mode);
+ if (own_trx || (info.flags2() & DICT_TF2_TEMPORARY))
+ {
+ if (error)
+ trx_rollback_for_mysql(trx);
+ else
+ {
+ std::vector<pfs_os_file_t> deleted;
+ trx->commit(deleted);
+ ut_ad(deleted.empty());
+ info.table()->acquire();
+ info.create_table_update_dict(info.table(), info.thd(),
+ *create_info, *form);
+ }
- error = info.create_table_update_dict();
+ if (own_trx)
+ {
+ row_mysql_unlock_data_dictionary(trx);
-func_exit:
- if (own_trx) {
- trx->free();
- }
+ if (!error)
+ {
+ dict_stats_update(info.table(), DICT_STATS_EMPTY_TABLE);
+ if (!info.table()->is_temporary())
+ log_write_up_to(trx->commit_lsn, true);
+ info.table()->release();
+ }
+ trx->free();
+ }
+ }
+ else if (!error && m_prebuilt)
+ m_prebuilt->table= info.table();
- DBUG_RETURN(error);
+ DBUG_RETURN(error);
}
/** Create a new table to an InnoDB database.
@@ -13149,13 +13270,10 @@ func_exit:
@param[in] form Table format; columns and index information.
@param[in] create_info Create info (including create statement string).
@return 0 if success else error number. */
-int
-ha_innobase::create(
- const char* name,
- TABLE* form,
- HA_CREATE_INFO* create_info)
+int ha_innobase::create(const char *name, TABLE *form,
+ HA_CREATE_INFO *create_info)
{
- return create(name, form, create_info, srv_file_per_table);
+ return create(name, form, create_info, srv_file_per_table);
}
/*****************************************************************//**
@@ -13174,7 +13292,7 @@ ha_innobase::discard_or_import_tablespace(
ut_a(m_prebuilt->trx->magic_n == TRX_MAGIC_N);
ut_a(m_prebuilt->trx == thd_to_trx(ha_thd()));
- if (high_level_read_only) {
+ if (is_read_only()) {
DBUG_RETURN(HA_ERR_TABLE_READONLY);
}
@@ -13196,15 +13314,18 @@ ha_innobase::discard_or_import_tablespace(
}
trx_start_if_not_started(m_prebuilt->trx, true);
+ m_prebuilt->trx->dict_operation = true;
/* Obtain an exclusive lock on the table. */
- dberr_t err = row_mysql_lock_table(
- m_prebuilt->trx, m_prebuilt->table, LOCK_X,
- discard ? "setting table lock for DISCARD TABLESPACE"
- : "setting table lock for IMPORT TABLESPACE");
+ dberr_t err = lock_table_for_trx(m_prebuilt->table,
+ m_prebuilt->trx, LOCK_X);
+ if (err == DB_SUCCESS) {
+ err = lock_sys_tables(m_prebuilt->trx);
+ }
if (err != DB_SUCCESS) {
/* unable to lock the table: do nothing */
+ m_prebuilt->trx->commit();
} else if (discard) {
/* Discarding an already discarded tablespace should be an
@@ -13220,8 +13341,7 @@ ha_innobase::discard_or_import_tablespace(
}
err = row_discard_tablespace_for_mysql(
- m_prebuilt->table->name.m_name, m_prebuilt->trx);
-
+ m_prebuilt->table, m_prebuilt->trx);
} else if (m_prebuilt->table->is_readable()) {
/* Commit the transaction in order to
release the table lock. */
@@ -13250,8 +13370,7 @@ ha_innobase::discard_or_import_tablespace(
}
}
- /* Commit the transaction in order to release the table lock. */
- trx_commit_for_mysql(m_prebuilt->trx);
+ ut_ad(m_prebuilt->trx->state == TRX_STATE_NOT_STARTED);
if (discard || err != DB_SUCCESS) {
DBUG_RETURN(convert_error_code_to_mysql(
@@ -13280,295 +13399,342 @@ ha_innobase::discard_or_import_tablespace(
DBUG_RETURN(0);
}
-/**
- @return 1 if frm file exists
- @return 0 if it doesn't exists
-*/
-
-static bool frm_file_exists(const char *path)
-{
- char buff[FN_REFLEN];
- strxnmov(buff, FN_REFLEN, path, reg_ext, NullS);
- return !access(buff, F_OK);
-}
-
-/**
-Drops a table from an InnoDB database. Before calling this function,
-MySQL calls innobase_commit to commit the transaction of the current user.
-Then the current user cannot have locks set on the table. Drop table
-operation inside InnoDB will remove all locks any user has on the table
-inside InnoDB.
-@param[in] name table name
-@param[in] sqlcom SQLCOM_DROP_DB, SQLCOM_TRUNCATE, ...
+/** DROP TABLE (possibly as part of DROP DATABASE, CREATE/ALTER TABLE)
+@param name table name
@return error number */
-inline int ha_innobase::delete_table(const char* name, enum_sql_command sqlcom)
+int ha_innobase::delete_table(const char *name)
{
- dberr_t err;
- THD* thd = ha_thd();
- char norm_name[FN_REFLEN];
+ DBUG_ENTER("ha_innobase::delete_table");
+ if (high_level_read_only)
+ DBUG_RETURN(HA_ERR_TABLE_READONLY);
- DBUG_ENTER("ha_innobase::delete_table");
+ THD *thd= ha_thd();
- DBUG_EXECUTE_IF(
- "test_normalize_table_name_low",
- test_normalize_table_name_low();
- );
- DBUG_EXECUTE_IF(
- "test_ut_format_name",
- test_ut_format_name();
- );
+ DBUG_EXECUTE_IF("test_normalize_table_name_low",
+ test_normalize_table_name_low(););
+ DBUG_EXECUTE_IF("test_ut_format_name", test_ut_format_name(););
- /* Strangely, MySQL passes the table name without the '.frm'
- extension, in contrast to ::create */
- normalize_table_name(norm_name, name);
-
- if (high_level_read_only) {
- DBUG_RETURN(HA_ERR_TABLE_READONLY);
- }
-
- trx_t* parent_trx = check_trx_exists(thd);
-
- /* Remove the to-be-dropped table from the list of modified tables
- by parent_trx. Otherwise we may end up with an orphaned pointer to
- the table object from parent_trx::mod_tables. This could happen in:
- SET AUTOCOMMIT=0;
- CREATE TABLE t (PRIMARY KEY (a)) ENGINE=INNODB SELECT 1 AS a UNION
- ALL SELECT 1 AS a; */
- trx_mod_tables_t::const_iterator iter;
-
- for (iter = parent_trx->mod_tables.begin();
- iter != parent_trx->mod_tables.end();
- ++iter) {
-
- dict_table_t* table_to_drop = iter->first;
-
- if (strcmp(norm_name, table_to_drop->name.m_name) == 0) {
- parent_trx->mod_tables.erase(table_to_drop);
- break;
- }
- }
-
- trx_t* trx = innobase_trx_allocate(thd);
-
- ulint name_len = strlen(name);
-
- ut_a(name_len < 1000);
-
- trx->will_lock = true;
-
- /* Drop the table in InnoDB */
+ trx_t *parent_trx= check_trx_exists(thd);
+ dict_table_t *table;
- err = row_drop_table_for_mysql(norm_name, trx, sqlcom);
-
- if (err == DB_TABLE_NOT_FOUND
- && innobase_get_lower_case_table_names() == 1) {
- char* is_part = is_partition(norm_name);
-
- if (is_part) {
- char par_case_name[FN_REFLEN];
-
-#ifndef __WIN__
- /* Check for the table using lower
- case name, including the partition
- separator "P" */
- strcpy(par_case_name, norm_name);
- innobase_casedn_str(par_case_name);
-#else
- /* On Windows platfrom, check
- whether there exists table name in
- system table whose name is
- not being normalized to lower case */
- normalize_table_name_c_low(
- par_case_name, name, FALSE);
+ {
+ char norm_name[FN_REFLEN];
+ normalize_table_name(norm_name, name);
+ span<const char> n{norm_name, strlen(norm_name)};
+
+ dict_sys.lock(SRW_LOCK_CALL);
+ table= dict_sys.load_table(n, DICT_ERR_IGNORE_DROP);
+#ifdef WITH_PARTITION_STORAGE_ENGINE
+ if (!table && lower_case_table_names == 1 && is_partition(norm_name))
+ {
+ IF_WIN(normalize_table_name_c_low(norm_name, name, false),
+ innobase_casedn_str(norm_name));
+ table= dict_sys.load_table(n, DICT_ERR_IGNORE_DROP);
+ }
#endif
- err = row_drop_table_for_mysql(
- par_case_name, trx, sqlcom);
- }
- }
-
- if (err == DB_TABLE_NOT_FOUND &&
- frm_file_exists(name))
- {
- /* Test to drop all tables which matches db/tablename + '#'.
- Only partitions can have '#' as non-first character in
- the table name!
-
- Temporary table names always start with '#', partitions are
- the only 'tables' that can have '#' after the first character
- and table name must have length > 0. User tables cannot have
- '#' since it would be translated to @0023. Therefor this should
- only match partitions. */
- uint len = (uint) strlen(norm_name);
- ulint num_partitions;
- ut_a(len < FN_REFLEN);
- norm_name[len] = '#';
- norm_name[len + 1] = 0;
- err = row_drop_database_for_mysql(norm_name, trx,
- &num_partitions);
- norm_name[len] = 0;
- table_name_t tbl_name(norm_name);
- if (num_partitions == 0 && !tbl_name.is_temporary()) {
- ib::error() << "Table " << tbl_name <<
- " does not exist in the InnoDB"
- " internal data dictionary though MariaDB is"
- " trying to drop it. Have you copied the .frm"
- " file of the table to the MariaDB database"
- " directory from another database? "
- << TROUBLESHOOTING_MSG;
- }
- if (num_partitions == 0) {
- err = DB_TABLE_NOT_FOUND;
- }
- }
-
- if (err == DB_TABLE_NOT_FOUND
- && innobase_get_lower_case_table_names() == 1) {
- char* is_part = is_partition(norm_name);
-
- if (is_part != NULL) {
- char par_case_name[FN_REFLEN];
-
-#ifndef _WIN32
- /* Check for the table using lower
- case name, including the partition
- separator "P" */
- strcpy(par_case_name, norm_name);
- innobase_casedn_str(par_case_name);
-#else
- /* On Windows platfrom, check
- whether there exists table name in
- system table whose name is
- not being normalized to lower case */
- create_table_info_t::normalize_table_name_low(
- par_case_name, name, FALSE);
-#endif /* _WIN32 */
- err = row_drop_table_for_mysql(
- par_case_name, trx, sqlcom, true);
- }
- }
-
- ut_ad(!srv_read_only_mode);
- /* Flush the log to reduce probability that the .frm files and
- the InnoDB data dictionary get out-of-sync if the user runs
- with innodb_flush_log_at_trx_commit = 0 */
+ if (!table)
+ {
+ dict_sys.unlock();
+ DBUG_RETURN(HA_ERR_NO_SUCH_TABLE);
+ }
+ }
- log_buffer_flush_to_disk();
+ if (table->is_temporary())
+ {
+ dict_sys.unlock();
+ parent_trx->mod_tables.erase(table); /* CREATE...SELECT error handling */
+ btr_drop_temporary_table(*table);
+ dict_sys.lock(SRW_LOCK_CALL);
+ dict_sys.remove(table);
+ dict_sys.unlock();
+ DBUG_RETURN(0);
+ }
- innobase_commit_low(trx);
+ table->acquire();
+ dict_sys.unlock();
- trx->free();
+ trx_t *trx= parent_trx;
+ dberr_t err= DB_SUCCESS;
+ if (!trx->lock.table_locks.empty() &&
+ false)
+ {
+ /* CREATE TABLE...PRIMARY KEY...SELECT ought to be dropping the
+ table because a duplicate key was detected or a timeout occurred.
+
+ We shall hijack the existing transaction to drop the table and
+ commit the transaction. If this is a partitioned table, one
+ partition will use this hijacked transaction; others will use a
+ separate transaction, one per partition. */
+ ut_ad(!trx->dict_operation_lock_mode);
+ ut_ad(trx->will_lock);
+ ut_ad(trx->state == TRX_STATE_ACTIVE);
+ trx->dict_operation= true;
+ }
+ else
+ {
+ trx= innobase_trx_allocate(thd);
+ trx_start_for_ddl(trx);
+
+ if (table->name.is_temporary())
+ /* There is no need to lock any FOREIGN KEY child tables. */;
+#ifdef WITH_PARTITION_STORAGE_ENGINE
+ else if (table->name.part())
+ /* FOREIGN KEY constraints cannot exist on partitioned tables. */;
+#endif
+ else
+ {
+ dict_sys.freeze(SRW_LOCK_CALL);
+ for (const dict_foreign_t* f : table->referenced_set)
+ if (dict_table_t* child= f->foreign_table)
+ if ((err= lock_table_for_trx(child, trx, LOCK_X)) != DB_SUCCESS)
+ break;
+ dict_sys.unfreeze();
+ }
+ }
- DBUG_RETURN(convert_error_code_to_mysql(err, 0, NULL));
-}
+ dict_table_t *table_stats= nullptr, *index_stats= nullptr;
+ MDL_ticket *mdl_table= nullptr, *mdl_index= nullptr;
+ if (err == DB_SUCCESS)
+ err= lock_table_for_trx(table, trx, LOCK_X);
-/** Drop an InnoDB table.
-@param[in] name table name
-@return error number */
-int ha_innobase::delete_table(const char* name)
-{
- enum_sql_command sqlcom = enum_sql_command(thd_sql_command(ha_thd()));
- /* SQLCOM_TRUNCATE should be passed via ha_innobase::truncate() only.
-
- On client disconnect, when dropping temporary tables, the
- previous sqlcom would not be overwritten. In such a case, we
- will have thd_kill_level() != NOT_KILLED, !m_prebuilt can
- hold, and sqlcom could be anything, including TRUNCATE.
-
- The sqlcom only matters for persistent tables; no persistent
- metadata or FOREIGN KEY metadata is kept for temporary
- tables. Therefore, we relax the assertion. If there is a bug
- that slips through this assertion due to !m_prebuilt, the
- worst impact should be that on DROP TABLE of a persistent
- table, FOREIGN KEY constraints will be ignored and their
- metadata will not be removed. */
- DBUG_ASSERT(sqlcom != SQLCOM_TRUNCATE
- || (thd_kill_level(ha_thd()) != THD_IS_NOT_KILLED
- && (!m_prebuilt
- || m_prebuilt->table->is_temporary())));
- return delete_table(name, sqlcom);
-}
-
-/** Remove all tables in the named database inside InnoDB.
-@param[in] hton handlerton from InnoDB
-@param[in] path Database path; Inside InnoDB the name of the last
-directory in the path is used as the database name.
-For example, in 'mysql/data/test' the database name is 'test'. */
+ const bool fts= err == DB_SUCCESS &&
+ (table->flags2 & (DICT_TF2_FTS_HAS_DOC_ID | DICT_TF2_FTS));
+ const enum_sql_command sqlcom= enum_sql_command(thd_sql_command(thd));
-static
-void
-innobase_drop_database(
- handlerton* hton,
- char* path)
-{
- char* namebuf;
+ if (fts)
+ {
+ fts_optimize_remove_table(table);
+ purge_sys.stop_FTS(*table);
+ err= fts_lock_tables(trx, *table);
+ }
- /* Get the transaction associated with the current thd, or create one
- if not yet created */
+#ifdef WITH_PARTITION_STORAGE_ENGINE
+ const bool rollback_add_partition=
+ (sqlcom == SQLCOM_ALTER_TABLE && table->name.part());
- DBUG_ASSERT(hton == innodb_hton_ptr);
+ if (rollback_add_partition)
+ {
+ if (!fts)
+ purge_sys.stop_FTS();
+ /* This looks like the rollback of ALTER TABLE...ADD PARTITION
+ that was caused by MDL timeout. We could have written undo log
+ for inserting the data into the new partitions. */
+ if (table->stat_persistent != DICT_STATS_PERSISTENT_OFF)
+ {
+ /* We do not really know if we are holding MDL_EXCLUSIVE. Even
+ though this code is handling the case that we are not holding
+ it, we might actually hold it. We want to avoid a deadlock
+ with dict_stats_process_entry_from_recalc_pool(). */
+ dict_stats_recalc_pool_del(table->id, true);
+ /* If statistics calculation is still using this table, we will
+ catch it below while waiting for purge to stop using this table. */
+ }
+ }
+#endif
- if (high_level_read_only) {
- return;
- }
+ DEBUG_SYNC(thd, "before_delete_table_stats");
- THD* thd = current_thd;
+ if (err == DB_SUCCESS && dict_stats_is_persistent_enabled(table) &&
+ !table->is_stats_table())
+ {
+ table_stats= dict_table_open_on_name(TABLE_STATS_NAME, false,
+ DICT_ERR_IGNORE_NONE);
+ if (table_stats)
+ {
+ dict_sys.freeze(SRW_LOCK_CALL);
+ table_stats= dict_acquire_mdl_shared<false>(table_stats,
+ thd, &mdl_table);
+ dict_sys.unfreeze();
+ }
- ulint len = 0;
- char* ptr = strend(path) - 2;
+ index_stats= dict_table_open_on_name(INDEX_STATS_NAME, false,
+ DICT_ERR_IGNORE_NONE);
+ if (index_stats)
+ {
+ dict_sys.freeze(SRW_LOCK_CALL);
+ index_stats= dict_acquire_mdl_shared<false>(index_stats,
+ thd, &mdl_index);
+ dict_sys.unfreeze();
+ }
- while (ptr >= path && *ptr != '\\' && *ptr != '/') {
- ptr--;
- len++;
- }
+ const bool skip_wait{table->name.is_temporary()};
- ptr++;
- namebuf = (char*) my_malloc(PSI_INSTRUMENT_ME, (uint) len + 2, MYF(0));
+ if (table_stats && index_stats &&
+ !strcmp(table_stats->name.m_name, TABLE_STATS_NAME) &&
+ !strcmp(index_stats->name.m_name, INDEX_STATS_NAME) &&
+ !(err= lock_table_for_trx(table_stats, trx, LOCK_X, skip_wait)))
+ err= lock_table_for_trx(index_stats, trx, LOCK_X, skip_wait);
- memcpy(namebuf, ptr, len);
- namebuf[len] = '/';
- namebuf[len + 1] = '\0';
+ if (err != DB_SUCCESS && skip_wait)
+ {
+ /* We may skip deleting statistics if we cannot lock the tables,
+ when the table carries a temporary name. */
+ ut_ad(err == DB_LOCK_WAIT);
+ ut_ad(trx->error_state == DB_SUCCESS);
+ err= DB_SUCCESS;
+ dict_table_close(table_stats, false, thd, mdl_table);
+ dict_table_close(index_stats, false, thd, mdl_index);
+ table_stats= nullptr;
+ index_stats= nullptr;
+ }
+ }
-#ifdef _WIN32
- innobase_casedn_str(namebuf);
-#endif /* _WIN32 */
+ if (err == DB_SUCCESS)
+ {
+ if (!table->space)
+ {
+ const char *data_dir_path= DICT_TF_HAS_DATA_DIR(table->flags)
+ ? table->data_dir_path : nullptr;
+ char *path= fil_make_filepath(data_dir_path, table->name, CFG,
+ data_dir_path != nullptr);
+ os_file_delete_if_exists(innodb_data_file_key, path, nullptr);
+ ut_free(path);
+ path= fil_make_filepath(data_dir_path, table->name, IBD,
+ data_dir_path != nullptr);
+ os_file_delete_if_exists(innodb_data_file_key, path, nullptr);
+ ut_free(path);
+ if (data_dir_path)
+ {
+ path= fil_make_filepath(nullptr, table->name, ISL, false);
+ os_file_delete_if_exists(innodb_data_file_key, path, nullptr);
+ ut_free(path);
+ }
+ }
+ err= lock_sys_tables(trx);
+ }
- trx_t* trx = innobase_trx_allocate(thd);
- trx->will_lock = true;
+ dict_sys.lock(SRW_LOCK_CALL);
- ulint dummy;
+ if (!table->release() && err == DB_SUCCESS)
+ {
+ /* Wait for purge threads to stop using the table. */
+ for (uint n= 15;;)
+ {
+ dict_sys.unlock();
+ std::this_thread::sleep_for(std::chrono::milliseconds(50));
+ dict_sys.lock(SRW_LOCK_CALL);
- row_drop_database_for_mysql(namebuf, trx, &dummy);
+ if (!--n)
+ {
+ err= DB_LOCK_WAIT_TIMEOUT;
+ break;
+ }
+ if (!table->get_ref_count())
+ break;
+ }
+ }
- my_free(namebuf);
+ trx->dict_operation_lock_mode= true;
- /* Flush the log to reduce probability that the .frm files and
- the InnoDB data dictionary get out-of-sync if the user runs
- with innodb_flush_log_at_trx_commit = 0 */
+ if (err != DB_SUCCESS)
+ {
+err_exit:
+ trx->dict_operation_lock_mode= false;
+ trx->rollback();
+ switch (err) {
+ case DB_CANNOT_DROP_CONSTRAINT:
+ case DB_LOCK_WAIT_TIMEOUT:
+ break;
+ default:
+ ib::error() << "DROP TABLE " << table->name << ": " << err;
+ }
+ if (fts)
+ {
+ fts_optimize_add_table(table);
+ purge_sys.resume_FTS();
+ }
+#ifdef WITH_PARTITION_STORAGE_ENGINE
+ else if (rollback_add_partition)
+ purge_sys.resume_FTS();
+#endif
+ if (table_stats)
+ dict_table_close(table_stats, true, thd, mdl_table);
+ if (index_stats)
+ dict_table_close(index_stats, true, thd, mdl_index);
+ dict_sys.unlock();
+ if (trx != parent_trx)
+ trx->free();
+ DBUG_RETURN(convert_error_code_to_mysql(err, 0, NULL));
+ }
- log_buffer_flush_to_disk();
+ if (!table->no_rollback() && trx->check_foreigns)
+ {
+ const bool drop_db= sqlcom == SQLCOM_DROP_DB;
+ for (auto foreign : table->referenced_set)
+ {
+ /* We should allow dropping a referenced table if creating
+ that referenced table has failed for some reason. For example
+ if referenced table is created but it column types that are
+ referenced do not match. */
+ if (foreign->foreign_table == table ||
+ (drop_db &&
+ dict_tables_have_same_db(table->name.m_name,
+ foreign->foreign_table_name_lookup)))
+ continue;
+ mysql_mutex_lock(&dict_foreign_err_mutex);
+ rewind(dict_foreign_err_file);
+ ut_print_timestamp(dict_foreign_err_file);
+ fputs(" Cannot drop table ", dict_foreign_err_file);
+ ut_print_name(dict_foreign_err_file, trx, table->name.m_name);
+ fputs("\nbecause it is referenced by ", dict_foreign_err_file);
+ ut_print_name(dict_foreign_err_file, trx, foreign->foreign_table_name);
+ putc('\n', dict_foreign_err_file);
+ mysql_mutex_unlock(&dict_foreign_err_mutex);
+ err= DB_CANNOT_DROP_CONSTRAINT;
+ goto err_exit;
+ }
+ }
- innobase_commit_low(trx);
+ if (!table->no_rollback())
+ {
+ err= trx->drop_table_foreign(table->name);
+ if (err == DB_SUCCESS && table_stats && index_stats)
+ err= trx->drop_table_statistics(table->name);
+ if (err != DB_SUCCESS)
+ goto err_exit;
+ }
- trx->free();
+ err= trx->drop_table(*table);
+ if (err != DB_SUCCESS)
+ goto err_exit;
+
+ std::vector<pfs_os_file_t> deleted;
+ trx->commit(deleted);
+ if (table_stats)
+ dict_table_close(table_stats, true, thd, mdl_table);
+ if (index_stats)
+ dict_table_close(index_stats, true, thd, mdl_index);
+ row_mysql_unlock_data_dictionary(trx);
+ for (pfs_os_file_t d : deleted)
+ os_file_close(d);
+ log_write_up_to(trx->commit_lsn, true);
+ if (trx != parent_trx)
+ trx->free();
+ if (!fts)
+#ifdef WITH_PARTITION_STORAGE_ENGINE
+ if (!rollback_add_partition)
+#endif
+ DBUG_RETURN(0);
+ purge_sys.resume_FTS();
+ DBUG_RETURN(0);
}
/** Rename an InnoDB table.
@param[in,out] trx InnoDB data dictionary transaction
@param[in] from old table name
@param[in] to new table name
-@param[in] commit whether to commit trx (and to enforce FOREIGN KEY)
+@param[in] use_fk whether to enforce FOREIGN KEY
@return DB_SUCCESS or error code */
-inline dberr_t innobase_rename_table(trx_t *trx, const char *from,
- const char *to, bool commit)
+static dberr_t innobase_rename_table(trx_t *trx, const char *from,
+ const char *to, bool use_fk)
{
dberr_t error;
char norm_to[FN_REFLEN];
char norm_from[FN_REFLEN];
DBUG_ENTER("innobase_rename_table");
- DBUG_ASSERT(trx_get_dict_operation(trx) == TRX_DICT_OP_INDEX
- || trx_get_dict_operation(trx) == TRX_DICT_OP_TABLE);
+ DBUG_ASSERT(trx->dict_operation);
ut_ad(!srv_read_only_mode);
@@ -13577,21 +13743,13 @@ inline dberr_t innobase_rename_table(trx_t *trx, const char *from,
DEBUG_SYNC_C("innodb_rename_table_ready");
- trx_start_if_not_started(trx, true);
ut_ad(trx->will_lock);
- if (commit) {
- /* Serialize data dictionary operations with dictionary mutex:
- no deadlocks can occur then in these operations. */
- row_mysql_lock_data_dictionary(trx);
- }
-
- error = row_rename_table_for_mysql(norm_from, norm_to, trx, commit,
- commit);
+ error = row_rename_table_for_mysql(norm_from, norm_to, trx, use_fk);
if (error != DB_SUCCESS) {
if (error == DB_TABLE_NOT_FOUND
- && innobase_get_lower_case_table_names() == 1) {
+ && lower_case_table_names == 1) {
char* is_part = is_partition(norm_from);
if (is_part) {
@@ -13607,13 +13765,12 @@ inline dberr_t innobase_rename_table(trx_t *trx, const char *from,
whether there exists table name in
system table whose name is
not being normalized to lower case */
- create_table_info_t::normalize_table_name_low(
- par_case_name, from, FALSE);
+ normalize_table_name_c_low(
+ par_case_name, from, false);
#endif /* _WIN32 */
trx_start_if_not_started(trx, true);
error = row_rename_table_for_mysql(
- par_case_name, norm_to, trx,
- true, false);
+ par_case_name, norm_to, trx, false);
}
}
@@ -13637,16 +13794,6 @@ inline dberr_t innobase_rename_table(trx_t *trx, const char *from,
}
}
- if (commit) {
- row_mysql_unlock_data_dictionary(trx);
- }
-
- /* Flush the log to reduce probability that the .frm
- files and the InnoDB data dictionary get out-of-sync
- if the user runs with innodb_flush_log_at_trx_commit = 0 */
-
- log_buffer_flush_to_disk();
-
DBUG_RETURN(error);
}
@@ -13655,108 +13802,251 @@ inline dberr_t innobase_rename_table(trx_t *trx, const char *from,
@retval 0 on success */
int ha_innobase::truncate()
{
- DBUG_ENTER("ha_innobase::truncate");
+ DBUG_ENTER("ha_innobase::truncate");
- if (high_level_read_only) {
- DBUG_RETURN(HA_ERR_TABLE_READONLY);
- }
+ update_thd();
- update_thd();
+ if (is_read_only())
+ DBUG_RETURN(HA_ERR_TABLE_READONLY);
- HA_CREATE_INFO info;
- mem_heap_t* heap = mem_heap_create(1000);
- dict_table_t* ib_table = m_prebuilt->table;
- const auto update_time = ib_table->update_time;
- const auto stored_lock = m_prebuilt->stored_select_lock_type;
- info.init();
- update_create_info_from_table(&info, table);
+ HA_CREATE_INFO info;
+ dict_table_t *ib_table= m_prebuilt->table;
+ info.init();
+ update_create_info_from_table(&info, table);
+ switch (dict_tf_get_rec_format(ib_table->flags)) {
+ case REC_FORMAT_REDUNDANT:
+ info.row_type= ROW_TYPE_REDUNDANT;
+ break;
+ case REC_FORMAT_COMPACT:
+ info.row_type= ROW_TYPE_COMPACT;
+ break;
+ case REC_FORMAT_COMPRESSED:
+ info.row_type= ROW_TYPE_COMPRESSED;
+ break;
+ case REC_FORMAT_DYNAMIC:
+ info.row_type= ROW_TYPE_DYNAMIC;
+ break;
+ }
- if (ib_table->is_temporary()) {
- info.options|= HA_LEX_CREATE_TMP_TABLE;
- } else {
- if (!ib_table->space) {
- ib_senderrf(m_user_thd,
- IB_LOG_LEVEL_WARN, ER_TABLESPACE_DISCARDED,
- table->s->table_name.str);
- }
+ const auto stored_lock= m_prebuilt->stored_select_lock_type;
+ trx_t *trx= innobase_trx_allocate(m_user_thd);
+ trx_start_for_ddl(trx);
- dict_get_and_save_data_dir_path(ib_table, false);
- }
+ if (ib_table->is_temporary())
+ {
+ info.options|= HA_LEX_CREATE_TMP_TABLE;
+ btr_drop_temporary_table(*ib_table);
+ m_prebuilt->table= nullptr;
+ row_prebuilt_free(m_prebuilt);
+ m_prebuilt= nullptr;
+ my_free(m_upd_buf);
+ m_upd_buf= nullptr;
+ m_upd_buf_size= 0;
+
+ row_mysql_lock_data_dictionary(trx);
+ ib_table->release();
+ dict_sys.remove(ib_table, false, true);
+ int err= create(ib_table->name.m_name, table, &info, true, trx);
+ row_mysql_unlock_data_dictionary(trx);
+
+ ut_ad(!err);
+ if (!err)
+ {
+ err= open(ib_table->name.m_name, 0, 0);
+ m_prebuilt->table->release();
+ m_prebuilt->stored_select_lock_type= stored_lock;
+ }
- char* data_file_name = ib_table->data_dir_path;
+ trx->free();
- if (data_file_name) {
- info.data_file_name = data_file_name
- = mem_heap_strdup(heap, data_file_name);
- }
+#ifdef BTR_CUR_HASH_ADAPT
+ if (UT_LIST_GET_LEN(ib_table->freed_indexes))
+ {
+ ib_table->vc_templ= nullptr;
+ ib_table->id= 0;
+ }
+ else
+#endif /* BTR_CUR_HASH_ADAPT */
+ dict_mem_table_free(ib_table);
- const char* temp_name = dict_mem_create_temporary_tablename(
- heap, ib_table->name.m_name, ib_table->id);
- const char* name = mem_heap_strdup(heap, ib_table->name.m_name);
- trx_t* trx = innobase_trx_allocate(m_user_thd);
- trx->will_lock = true;
- trx_set_dict_operation(trx, TRX_DICT_OP_TABLE);
- row_mysql_lock_data_dictionary(trx);
- dict_stats_wait_bg_to_stop_using_table(ib_table, trx);
-
- int err = convert_error_code_to_mysql(
- innobase_rename_table(trx, ib_table->name.m_name, temp_name,
- false),
- ib_table->flags, m_user_thd);
- if (err) {
- trx_rollback_for_mysql(trx);
- row_mysql_unlock_data_dictionary(trx);
- } else {
- switch (dict_tf_get_rec_format(ib_table->flags)) {
- case REC_FORMAT_REDUNDANT:
- info.row_type = ROW_TYPE_REDUNDANT;
- break;
- case REC_FORMAT_COMPACT:
- info.row_type = ROW_TYPE_COMPACT;
- break;
- case REC_FORMAT_COMPRESSED:
- info.row_type = ROW_TYPE_COMPRESSED;
- break;
- case REC_FORMAT_DYNAMIC:
- info.row_type = ROW_TYPE_DYNAMIC;
- break;
- }
+ DBUG_RETURN(err);
+ }
- err = create(name, table, &info,
- ib_table->is_temporary()
- || dict_table_is_file_per_table(ib_table), trx);
- }
+ mem_heap_t *heap= mem_heap_create(1000);
- trx->free();
+ if (!ib_table->space)
+ ib_senderrf(m_user_thd, IB_LOG_LEVEL_WARN, ER_TABLESPACE_DISCARDED,
+ table->s->table_name.str);
- if (!err) {
- /* Reopen the newly created table, and drop the
- original table that was renamed to temp_name. */
+ dict_get_and_save_data_dir_path(ib_table);
+ info.data_file_name= ib_table->data_dir_path;
+ const char *temp_name=
+ dict_mem_create_temporary_tablename(heap,
+ ib_table->name.m_name, ib_table->id);
+ const char *name= mem_heap_strdup(heap, ib_table->name.m_name);
- row_prebuilt_t* prebuilt = m_prebuilt;
- uchar* upd_buf = m_upd_buf;
- ulint upd_buf_size = m_upd_buf_size;
- /* Mimic ha_innobase::close(). */
- m_prebuilt = NULL;
- m_upd_buf = NULL;
- m_upd_buf_size = 0;
- err = open(name, 0, 0);
- if (!err) {
- m_prebuilt->stored_select_lock_type = stored_lock;
- m_prebuilt->table->update_time = update_time;
- row_prebuilt_free(prebuilt, FALSE);
- delete_table(temp_name, SQLCOM_TRUNCATE);
- my_free(upd_buf);
- } else {
- /* Revert to the old table before truncation. */
- m_prebuilt = prebuilt;
- m_upd_buf = upd_buf;
- m_upd_buf_size = upd_buf_size;
- }
- }
+ dict_table_t *table_stats = nullptr, *index_stats = nullptr;
+ MDL_ticket *mdl_table = nullptr, *mdl_index = nullptr;
- mem_heap_free(heap);
- DBUG_RETURN(err);
+ dberr_t error= DB_SUCCESS;
+
+ dict_sys.freeze(SRW_LOCK_CALL);
+ for (const dict_foreign_t *f : ib_table->referenced_set)
+ if (dict_table_t *child= f->foreign_table)
+ if ((error= lock_table_for_trx(child, trx, LOCK_X)) != DB_SUCCESS)
+ break;
+ dict_sys.unfreeze();
+
+ if (error == DB_SUCCESS)
+ error= lock_table_for_trx(ib_table, trx, LOCK_X);
+
+ const bool fts= error == DB_SUCCESS &&
+ ib_table->flags2 & (DICT_TF2_FTS_HAS_DOC_ID | DICT_TF2_FTS);
+
+ if (fts)
+ {
+ fts_optimize_remove_table(ib_table);
+ purge_sys.stop_FTS(*ib_table);
+ error= fts_lock_tables(trx, *ib_table);
+ }
+
+ /* Wait for purge threads to stop using the table. */
+ for (uint n = 15; ib_table->get_ref_count() > 1; )
+ {
+ if (!--n)
+ {
+ error= DB_LOCK_WAIT_TIMEOUT;
+ break;
+ }
+ std::this_thread::sleep_for(std::chrono::milliseconds(50));
+ }
+
+ if (error == DB_SUCCESS && dict_stats_is_persistent_enabled(ib_table) &&
+ !ib_table->is_stats_table())
+ {
+ table_stats= dict_table_open_on_name(TABLE_STATS_NAME, false,
+ DICT_ERR_IGNORE_NONE);
+ if (table_stats)
+ {
+ dict_sys.freeze(SRW_LOCK_CALL);
+ table_stats= dict_acquire_mdl_shared<false>(table_stats, m_user_thd,
+ &mdl_table);
+ dict_sys.unfreeze();
+ }
+ index_stats= dict_table_open_on_name(INDEX_STATS_NAME, false,
+ DICT_ERR_IGNORE_NONE);
+ if (index_stats)
+ {
+ dict_sys.freeze(SRW_LOCK_CALL);
+ index_stats= dict_acquire_mdl_shared<false>(index_stats, m_user_thd,
+ &mdl_index);
+ dict_sys.unfreeze();
+ }
+
+ if (table_stats && index_stats &&
+ !strcmp(table_stats->name.m_name, TABLE_STATS_NAME) &&
+ !strcmp(index_stats->name.m_name, INDEX_STATS_NAME) &&
+ !(error= lock_table_for_trx(table_stats, trx, LOCK_X)))
+ error= lock_table_for_trx(index_stats, trx, LOCK_X);
+ }
+
+ if (error == DB_SUCCESS)
+ error= lock_sys_tables(trx);
+
+ std::vector<pfs_os_file_t> deleted;
+
+ row_mysql_lock_data_dictionary(trx);
+
+ if (error == DB_SUCCESS)
+ {
+ error= innobase_rename_table(trx, ib_table->name.m_name, temp_name, false);
+ if (error == DB_SUCCESS)
+ error= trx->drop_table(*ib_table);
+ }
+
+ int err = convert_error_code_to_mysql(error, ib_table->flags, m_user_thd);
+ const auto update_time = ib_table->update_time;
+
+ if (err)
+ {
+ trx_rollback_for_mysql(trx);
+ if (fts)
+ fts_optimize_add_table(ib_table);
+ }
+ else
+ {
+ const auto def_trx_id= ib_table->def_trx_id;
+ ib_table->release();
+ m_prebuilt->table= nullptr;
+
+ err= create(name, table, &info, dict_table_is_file_per_table(ib_table),
+ trx);
+ if (!err)
+ {
+ m_prebuilt->table->acquire();
+ create_table_info_t::create_table_update_dict(m_prebuilt->table,
+ m_user_thd, info, *table);
+ trx->commit(deleted);
+ }
+ else
+ {
+ trx_rollback_for_mysql(trx);
+ m_prebuilt->table= dict_table_open_on_name(name, true,
+ DICT_ERR_IGNORE_FK_NOKEY);
+ m_prebuilt->table->def_trx_id= def_trx_id;
+ }
+ dict_names_t fk_tables;
+ dict_load_foreigns(m_prebuilt->table->name.m_name, nullptr, 1, true,
+ DICT_ERR_IGNORE_FK_NOKEY, fk_tables);
+ for (const char *f : fk_tables)
+ dict_sys.load_table({f, strlen(f)});
+ }
+
+ if (fts)
+ purge_sys.resume_FTS();
+
+ row_mysql_unlock_data_dictionary(trx);
+ for (pfs_os_file_t d : deleted) os_file_close(d);
+
+ if (!err)
+ {
+ dict_stats_update(m_prebuilt->table, DICT_STATS_EMPTY_TABLE);
+ log_write_up_to(trx->commit_lsn, true);
+ row_prebuilt_t *prebuilt= m_prebuilt;
+ uchar *upd_buf= m_upd_buf;
+ ulint upd_buf_size= m_upd_buf_size;
+ /* Mimic ha_innobase::close(). */
+ m_prebuilt= nullptr;
+ m_upd_buf= nullptr;
+ m_upd_buf_size= 0;
+
+ err= open(name, 0, 0);
+ if (!err)
+ {
+ m_prebuilt->stored_select_lock_type= stored_lock;
+ m_prebuilt->table->update_time= update_time;
+ row_prebuilt_free(prebuilt);
+ my_free(upd_buf);
+ }
+ else
+ {
+ /* Revert to the old table. */
+ m_prebuilt= prebuilt;
+ m_upd_buf= upd_buf;
+ m_upd_buf_size= upd_buf_size;
+ }
+ }
+
+ trx->free();
+
+ mem_heap_free(heap);
+
+ if (table_stats)
+ dict_table_close(table_stats, false, m_user_thd, mdl_table);
+ if (index_stats)
+ dict_table_close(index_stats, false, m_user_thd, mdl_index);
+
+ DBUG_RETURN(err);
}
/*********************************************************************//**
@@ -13779,56 +14069,145 @@ ha_innobase::rename_table(
}
trx_t* trx = innobase_trx_allocate(thd);
- trx->will_lock = true;
- trx_set_dict_operation(trx, TRX_DICT_OP_INDEX);
+ trx_start_for_ddl(trx);
- dberr_t error = innobase_rename_table(trx, from, to, true);
+ dict_table_t *table_stats = nullptr, *index_stats = nullptr;
+ MDL_ticket *mdl_table = nullptr, *mdl_index = nullptr;
+ char norm_from[MAX_FULL_NAME_LEN];
+ char norm_to[MAX_FULL_NAME_LEN];
- DEBUG_SYNC(thd, "after_innobase_rename_table");
-
- innobase_commit_low(trx);
+ normalize_table_name(norm_from, from);
+ normalize_table_name(norm_to, to);
- trx->free();
+ dberr_t error = DB_SUCCESS;
+ const bool from_temp = dict_table_t::is_temporary_name(norm_from);
+
+ if (from_temp) {
+ /* There is no need to lock any FOREIGN KEY child tables. */
+ } else if (dict_table_t *table = dict_table_open_on_name(
+ norm_from, false, DICT_ERR_IGNORE_FK_NOKEY)) {
+ dict_sys.freeze(SRW_LOCK_CALL);
+ for (const dict_foreign_t* f : table->referenced_set) {
+ if (dict_table_t* child = f->foreign_table) {
+ error = lock_table_for_trx(child, trx, LOCK_X);
+ if (error != DB_SUCCESS) {
+ break;
+ }
+ }
+ }
+ dict_sys.unfreeze();
+ if (error == DB_SUCCESS) {
+ error = lock_table_for_trx(table, trx, LOCK_X);
+ }
+ table->release();
+ }
+
+ if (strcmp(norm_from, TABLE_STATS_NAME)
+ && strcmp(norm_from, INDEX_STATS_NAME)
+ && strcmp(norm_to, TABLE_STATS_NAME)
+ && strcmp(norm_to, INDEX_STATS_NAME)) {
+ table_stats = dict_table_open_on_name(TABLE_STATS_NAME, false,
+ DICT_ERR_IGNORE_NONE);
+ if (table_stats) {
+ dict_sys.freeze(SRW_LOCK_CALL);
+ table_stats = dict_acquire_mdl_shared<false>(
+ table_stats, thd, &mdl_table);
+ dict_sys.unfreeze();
+ }
+ index_stats = dict_table_open_on_name(INDEX_STATS_NAME, false,
+ DICT_ERR_IGNORE_NONE);
+ if (index_stats) {
+ dict_sys.freeze(SRW_LOCK_CALL);
+ index_stats = dict_acquire_mdl_shared<false>(
+ index_stats, thd, &mdl_index);
+ dict_sys.unfreeze();
+ }
+
+ if (error == DB_SUCCESS && table_stats && index_stats
+ && !strcmp(table_stats->name.m_name, TABLE_STATS_NAME)
+ && !strcmp(index_stats->name.m_name, INDEX_STATS_NAME)) {
+ error = lock_table_for_trx(table_stats, trx, LOCK_X,
+ from_temp);
+ if (error == DB_SUCCESS) {
+ error = lock_table_for_trx(index_stats, trx,
+ LOCK_X, from_temp);
+ }
+ if (error != DB_SUCCESS && from_temp) {
+ ut_ad(error == DB_LOCK_WAIT);
+ ut_ad(trx->error_state == DB_SUCCESS);
+ error = DB_SUCCESS;
+ /* We may skip renaming statistics if
+ we cannot lock the tables, when the
+ table is being renamed from from a
+ temporary name. */
+ dict_table_close(table_stats, false, thd,
+ mdl_table);
+ dict_table_close(index_stats, false, thd,
+ mdl_index);
+ table_stats = nullptr;
+ index_stats = nullptr;
+ }
+ }
+ }
if (error == DB_SUCCESS) {
- char norm_from[MAX_FULL_NAME_LEN];
- char norm_to[MAX_FULL_NAME_LEN];
- char errstr[512];
- dberr_t ret;
+ error = lock_table_for_trx(dict_sys.sys_tables, trx, LOCK_X);
+ if (error == DB_SUCCESS) {
+ error = lock_table_for_trx(dict_sys.sys_foreign, trx,
+ LOCK_X);
+ if (error == DB_SUCCESS) {
+ error = lock_table_for_trx(
+ dict_sys.sys_foreign_cols,
+ trx, LOCK_X);
+ }
+ }
+ }
- normalize_table_name(norm_from, from);
- normalize_table_name(norm_to, to);
+ row_mysql_lock_data_dictionary(trx);
- ret = dict_stats_rename_table(norm_from, norm_to,
- errstr, sizeof(errstr));
+ if (error == DB_SUCCESS) {
+ error = innobase_rename_table(trx, from, to, true);
+ }
- if (ret != DB_SUCCESS) {
- ib::error() << errstr;
+ DEBUG_SYNC(thd, "after_innobase_rename_table");
- push_warning(thd, Sql_condition::WARN_LEVEL_WARN,
- ER_LOCK_WAIT_TIMEOUT, errstr);
+ if (error == DB_SUCCESS && table_stats && index_stats) {
+ error = dict_stats_rename_table(norm_from, norm_to, trx);
+ if (error == DB_DUPLICATE_KEY) {
+ /* The duplicate may also occur in
+ mysql.innodb_index_stats. */
+ my_error(ER_DUP_KEY, MYF(0),
+ "mysql.innodb_table_stats");
+ error = DB_ERROR;
}
}
- /* Add a special case to handle the Duplicated Key error
- and return DB_ERROR instead.
- This is to avoid a possible SIGSEGV error from mysql error
- handling code. Currently, mysql handles the Duplicated Key
- error by re-entering the storage layer and getting dup key
- info by calling get_dup_key(). This operation requires a valid
- table handle ('row_prebuilt_t' structure) which could no
- longer be available in the error handling stage. The suggested
- solution is to report a 'table exists' error message (since
- the dup key error here is due to an existing table whose name
- is the one we are trying to rename to) and return the generic
- error code. */
+ if (error == DB_SUCCESS) {
+ innobase_commit_low(trx);
+ } else {
+ trx->rollback();
+ }
+
+ if (table_stats) {
+ dict_table_close(table_stats, true, thd, mdl_table);
+ }
+ if (index_stats) {
+ dict_table_close(index_stats, true, thd, mdl_index);
+ }
+ row_mysql_unlock_data_dictionary(trx);
+ if (error == DB_SUCCESS) {
+ log_write_up_to(trx->commit_lsn, true);
+ }
+ trx->free();
+
if (error == DB_DUPLICATE_KEY) {
+ /* We are not able to deal with handler::get_dup_key()
+ during DDL operations, because the duplicate key would
+ exist in metadata tables, not in the user table. */
my_error(ER_TABLE_EXISTS_ERROR, MYF(0), to);
-
error = DB_ERROR;
} else if (error == DB_LOCK_WAIT_TIMEOUT) {
my_error(ER_LOCK_WAIT_TIMEOUT, MYF(0), to);
-
error = DB_LOCK_WAIT;
}
@@ -14099,15 +14478,6 @@ ha_innobase::read_time(
return(ranges + (double) rows / (double) total_rows * time_for_scan);
}
-/** Update the system variable with the given value of the InnoDB
-buffer pool size.
-@param[in] buf_pool_size given value of buffer pool size.*/
-void
-innodb_set_buf_pool_size(ulonglong buf_pool_size)
-{
- innobase_buffer_pool_size = buf_pool_size;
-}
-
/*********************************************************************//**
Calculates the key number used inside MySQL for an Innobase index.
@return the key number used inside MySQL */
@@ -14318,8 +14688,6 @@ ha_innobase::info_low(
DEBUG_SYNC_C("ha_innobase_info_low");
- ut_ad(!mutex_own(&dict_sys.mutex));
-
/* If we are forcing recovery at a high level, we will suppress
statistics calculation on tables, because that may crash the
server if an index is badly corrupted. */
@@ -14336,7 +14704,12 @@ ha_innobase::info_low(
DBUG_ASSERT(ib_table->get_ref_count() > 0);
if (!ib_table->is_readable()) {
+ ib_table->stats_mutex_lock();
ib_table->stat_initialized = true;
+ ib_table->stat_n_rows = 0;
+ ib_table->stat_clustered_index_size = 0;
+ ib_table->stat_sum_of_other_index_sizes = 0;
+ ib_table->stats_mutex_unlock();
}
if (flag & HA_STATUS_TIME) {
@@ -14348,15 +14721,11 @@ ha_innobase::info_low(
m_prebuilt->trx->op_info = "updating table statistics";
if (dict_stats_is_persistent_enabled(ib_table)) {
-
if (is_analyze) {
- row_mysql_lock_data_dictionary(
- m_prebuilt->trx);
- dict_stats_recalc_pool_del(ib_table);
- dict_stats_wait_bg_to_stop_using_table(
- ib_table, m_prebuilt->trx);
- row_mysql_unlock_data_dictionary(
- m_prebuilt->trx);
+ if (!srv_read_only_mode) {
+ dict_stats_recalc_pool_del(
+ ib_table->id, false);
+ }
opt = DICT_STATS_RECALC_PERSISTENT;
} else {
/* This is e.g. 'SHOW INDEXES', fetch
@@ -14369,13 +14738,6 @@ ha_innobase::info_low(
ret = dict_stats_update(ib_table, opt);
- if (opt == DICT_STATS_RECALC_PERSISTENT) {
- mutex_enter(&dict_sys.mutex);
- ib_table->stats_bg_flag
- &= byte(~BG_STAT_SHOULD_QUIT);
- mutex_exit(&dict_sys.mutex);
- }
-
if (ret != DB_SUCCESS) {
m_prebuilt->trx->op_info = "";
DBUG_RETURN(HA_ERR_GENERIC);
@@ -14389,8 +14751,6 @@ ha_innobase::info_low(
stats.update_time = (ulong) ib_table->update_time;
}
- DBUG_EXECUTE_IF("dict_sys_mutex_avoid", goto func_exit;);
-
dict_stats_init(ib_table);
if (flag & HA_STATUS_VARIABLE) {
@@ -14398,7 +14758,7 @@ ha_innobase::info_low(
ulint stat_clustered_index_size;
ulint stat_sum_of_other_index_sizes;
- mutex_enter(&dict_sys.mutex);
+ ib_table->stats_mutex_lock();
ut_a(ib_table->stat_initialized);
@@ -14410,7 +14770,7 @@ ha_innobase::info_low(
stat_sum_of_other_index_sizes
= ib_table->stat_sum_of_other_index_sizes;
- mutex_exit(&dict_sys.mutex);
+ ib_table->stats_mutex_unlock();
/*
The MySQL optimizer seems to assume in a left join that n_rows
@@ -14453,11 +14813,11 @@ ha_innobase::info_low(
stats.index_file_length
= ulonglong(stat_sum_of_other_index_sizes)
* size;
- rw_lock_s_lock(&space->latch);
+ space->s_lock();
stats.delete_length = 1024
* fsp_get_available_space_in_free_extents(
*space);
- rw_lock_s_unlock(&space->latch);
+ space->s_unlock();
}
stats.check_time = 0;
stats.mrr_length_per_rec= (uint)ref_length + 8; // 8 = max(sizeof(void *));
@@ -14528,10 +14888,9 @@ ha_innobase::info_low(
stats.create_time = (ulong) stat_info.ctime;
}
- struct Locking {
- Locking() { mutex_enter(&dict_sys.mutex); }
- ~Locking() { mutex_exit(&dict_sys.mutex); }
- } locking;
+ ib_table->stats_mutex_lock();
+ auto _ = make_scope_exit([ib_table]() {
+ ib_table->stats_mutex_unlock(); });
ut_a(ib_table->stat_initialized);
@@ -14566,7 +14925,7 @@ ha_innobase::info_low(
sql_print_error(
"Index %s of %s has %u columns"
" unique inside InnoDB, but "
- "MySQL is asking statistics for"
+ "server is asking statistics for"
" %lu columns. Have you mixed "
"up .frm files from different "
" installations? %s",
@@ -14612,7 +14971,7 @@ ha_innobase::info_low(
}
}
- if (srv_force_recovery > SRV_FORCE_NO_IBUF_MERGE) {
+ if (srv_force_recovery >= SRV_FORCE_NO_UNDO_LOG_SCAN) {
goto func_exit;
@@ -14683,90 +15042,53 @@ ha_innobase::analyze(THD*, HA_CHECK_OPT*)
/*****************************************************************//**
Defragment table.
@return error number */
-inline int ha_innobase::defragment_table(const char *name)
+inline int ha_innobase::defragment_table()
{
- char norm_name[FN_REFLEN];
- dict_table_t* table = NULL;
- dict_index_t* index = NULL;
- int ret = 0;
- dberr_t err = DB_SUCCESS;
-
- normalize_table_name(norm_name, name);
-
- table = dict_table_open_on_name(norm_name, FALSE,
- FALSE, DICT_ERR_IGNORE_FK_NOKEY);
-
- for (index = dict_table_get_first_index(table); index;
- index = dict_table_get_next_index(index)) {
-
- if (index->is_corrupted()) {
- continue;
- }
-
- if (dict_index_is_spatial(index)) {
- /* Do not try to defragment spatial indexes,
- because doing it properly would require
- appropriate logic around the SSN (split
- sequence number). */
- continue;
- }
-
- if (index->page == FIL_NULL) {
- /* Do not defragment auxiliary tables related
- to FULLTEXT INDEX. */
- ut_ad(index->type & DICT_FTS);
- continue;
- }
-
- if (btr_defragment_find_index(index)) {
- // We borrow this error code. When the same index is
- // already in the defragmentation queue, issue another
- // defragmentation only introduces overhead. We return
- // an error here to let the user know this is not
- // necessary. Note that this will fail a query that's
- // trying to defragment a full table if one of the
- // indicies in that table is already in defragmentation.
- // We choose this behavior so user is aware of this
- // rather than silently defragment other indicies of
- // that table.
- ret = ER_SP_ALREADY_EXISTS;
- break;
- }
-
- os_event_t event = btr_defragment_add_index(index, &err);
+ for (dict_index_t *index= dict_table_get_first_index(m_prebuilt->table);
+ index; index= dict_table_get_next_index(index))
+ {
+ if (!index->is_btree())
+ continue;
- if (err != DB_SUCCESS) {
- push_warning_printf(
- current_thd,
- Sql_condition::WARN_LEVEL_WARN,
- ER_NO_SUCH_TABLE,
- "Table %s is encrypted but encryption service or"
- " used key_id is not available. "
- " Can't continue checking table.",
- index->table->name.m_name);
+ if (btr_defragment_find_index(index))
+ {
+ // We borrow this error code. When the same index is already in
+ // the defragmentation queue, issuing another defragmentation
+ // only introduces overhead. We return an error here to let the
+ // user know this is not necessary. Note that this will fail a
+ // query that's trying to defragment a full table if one of the
+ // indicies in that table is already in defragmentation. We
+ // choose this behavior so user is aware of this rather than
+ // silently defragment other indicies of that table.
+ return ER_SP_ALREADY_EXISTS;
+ }
- ret = convert_error_code_to_mysql(err, 0, current_thd);
- break;
- }
+ btr_pcur_t pcur;
- if (event) {
- while(os_event_wait_time(event, 1000000)) {
- if (thd_killed(current_thd)) {
- btr_defragment_remove_index(index);
- ret = ER_QUERY_INTERRUPTED;
- break;
- }
- }
- os_event_destroy(event);
- }
+ mtr_t mtr;
+ mtr.start();
+ if (dberr_t err= pcur.open_leaf(true, index, BTR_SEARCH_LEAF, &mtr))
+ {
+ mtr.commit();
+ return convert_error_code_to_mysql(err, 0, m_user_thd);
+ }
+ else if (btr_pcur_get_block(&pcur)->page.id().page_no() == index->page)
+ {
+ mtr.commit();
+ continue;
+ }
- if (ret) {
- break;
- }
- }
+ btr_pcur_move_to_next(&pcur, &mtr);
+ btr_pcur_store_position(&pcur, &mtr);
+ mtr.commit();
+ ut_ad(pcur.index() == index);
+ const bool interrupted= btr_defragment_add_index(&pcur, m_user_thd);
+ ut_free(pcur.old_rec_buf);
+ if (interrupted)
+ return ER_QUERY_INTERRUPTED;
+ }
- dict_table_close(table, FALSE, FALSE);
- return ret;
+ return 0;
}
/**********************************************************************//**
@@ -14790,8 +15112,10 @@ ha_innobase::optimize(
calls to OPTIMIZE, which is undesirable. */
bool try_alter = true;
- if (!m_prebuilt->table->is_temporary() && srv_defragment) {
- int err = defragment_table(m_prebuilt->table->name.m_name);
+ if (!m_prebuilt->table->is_temporary()
+ && m_prebuilt->table->is_readable()
+ && srv_defragment) {
+ int err = defragment_table();
if (err == 0) {
try_alter = false;
@@ -14831,7 +15155,6 @@ ha_innobase::check(
THD* thd, /*!< in: user thread handle */
HA_CHECK_OPT* check_opt) /*!< in: check options */
{
- dict_index_t* index;
ulint n_rows;
ulint n_rows_in_table = ULINT_UNDEFINED;
bool is_ok = true;
@@ -14839,8 +15162,10 @@ ha_innobase::check(
DBUG_ENTER("ha_innobase::check");
DBUG_ASSERT(thd == ha_thd());
+ DBUG_ASSERT(thd == m_user_thd);
ut_a(m_prebuilt->trx->magic_n == TRX_MAGIC_N);
ut_a(m_prebuilt->trx == thd_to_trx(thd));
+ ut_ad(m_prebuilt->trx->mysql_thd == thd);
if (m_prebuilt->mysql_template == NULL) {
/* Build the template; we will use a dummy template
@@ -14850,7 +15175,6 @@ ha_innobase::check(
}
if (!m_prebuilt->table->space) {
-
ib_senderrf(
thd,
IB_LOG_LEVEL_ERROR,
@@ -14858,10 +15182,7 @@ ha_innobase::check(
table->s->table_name.str);
DBUG_RETURN(HA_ADMIN_CORRUPT);
-
- } else if (!m_prebuilt->table->is_readable() &&
- !m_prebuilt->table->space) {
-
+ } else if (!m_prebuilt->table->is_readable()) {
ib_senderrf(
thd, IB_LOG_LEVEL_ERROR,
ER_TABLESPACE_MISSING,
@@ -14872,31 +15193,6 @@ ha_innobase::check(
m_prebuilt->trx->op_info = "checking table";
- if (m_prebuilt->table->corrupted) {
- /* If some previous operation has marked the table as
- corrupted in memory, and has not propagated such to
- clustered index, we will do so here */
- index = dict_table_get_first_index(m_prebuilt->table);
-
- if (!index->is_corrupted()) {
- dict_set_corrupted(
- index, m_prebuilt->trx, "CHECK TABLE");
- }
-
- push_warning_printf(m_user_thd,
- Sql_condition::WARN_LEVEL_WARN,
- HA_ERR_INDEX_CORRUPT,
- "InnoDB: Index %s is marked as"
- " corrupted",
- index->name());
-
- /* Now that the table is already marked as corrupted,
- there is no need to check any index of this table */
- m_prebuilt->trx->op_info = "";
-
- DBUG_RETURN(HA_ADMIN_CORRUPT);
- }
-
uint old_isolation_level = m_prebuilt->trx->isolation_level;
/* We must run the index record counts at an isolation level
@@ -14904,49 +15200,37 @@ ha_innobase::check(
of records in some index; to play safe, we normally use
REPEATABLE READ here */
m_prebuilt->trx->isolation_level = high_level_read_only
+ && !m_prebuilt->table->is_temporary()
? TRX_ISO_READ_UNCOMMITTED
: TRX_ISO_REPEATABLE_READ;
- ut_ad(!m_prebuilt->table->corrupted);
+ trx_start_if_not_started(m_prebuilt->trx, false);
+ m_prebuilt->trx->read_view.open(m_prebuilt->trx);
- for (index = dict_table_get_first_index(m_prebuilt->table);
- index != NULL;
+ for (dict_index_t* index
+ = dict_table_get_first_index(m_prebuilt->table);
+ index;
index = dict_table_get_next_index(index)) {
/* If this is an index being created or dropped, skip */
if (!index->is_committed()) {
continue;
}
+ if (index->type & DICT_FTS) {
+ /* We do not check any FULLTEXT INDEX. */
+ continue;
+ }
- if (!(check_opt->flags & T_QUICK)
- && !index->is_corrupted()) {
-
- dberr_t err = btr_validate_index(
- index, m_prebuilt->trx);
-
- if (err != DB_SUCCESS) {
- is_ok = false;
-
- if (err == DB_DECRYPTION_FAILED) {
- push_warning_printf(
- thd,
- Sql_condition::WARN_LEVEL_WARN,
- ER_NO_SUCH_TABLE,
- "Table %s is encrypted but encryption service or"
- " used key_id is not available. "
- " Can't continue checking table.",
- index->table->name.m_name);
- } else {
- push_warning_printf(
- thd,
- Sql_condition::WARN_LEVEL_WARN,
- ER_NOT_KEYFILE,
- "InnoDB: The B-tree of"
- " index %s is corrupted.",
- index->name());
- }
-
- continue;
- }
+ if ((check_opt->flags & T_QUICK) || index->is_corrupted()) {
+ } else if (btr_validate_index(index, m_prebuilt->trx)
+ != DB_SUCCESS) {
+ is_ok = false;
+ push_warning_printf(thd,
+ Sql_condition::WARN_LEVEL_WARN,
+ ER_NOT_KEYFILE,
+ "InnoDB: The B-tree of"
+ " index %s is corrupted.",
+ index->name());
+ continue;
}
/* Instead of invoking change_active_index(), set up
@@ -14961,15 +15245,14 @@ ha_innobase::check(
"dict_set_index_corrupted",
if (!index->is_primary()) {
m_prebuilt->index_usable = FALSE;
- // row_mysql_lock_data_dictionary(m_prebuilt->trx);
- dict_set_corrupted(index, m_prebuilt->trx, "dict_set_index_corrupted");
- // row_mysql_unlock_data_dictionary(m_prebuilt->trx);
+ dict_set_corrupted(index,
+ "dict_set_index_corrupted");
});
if (UNIV_UNLIKELY(!m_prebuilt->index_usable)) {
if (index->is_corrupted()) {
push_warning_printf(
- m_user_thd,
+ thd,
Sql_condition::WARN_LEVEL_WARN,
HA_ERR_INDEX_CORRUPT,
"InnoDB: Index %s is marked as"
@@ -14978,7 +15261,7 @@ ha_innobase::check(
is_ok = false;
} else {
push_warning_printf(
- m_user_thd,
+ thd,
Sql_condition::WARN_LEVEL_WARN,
HA_ERR_TABLE_DEF_CHANGED,
"InnoDB: Insufficient history for"
@@ -14991,18 +15274,22 @@ ha_innobase::check(
m_prebuilt->sql_stat_start = TRUE;
m_prebuilt->template_type = ROW_MYSQL_DUMMY_TEMPLATE;
m_prebuilt->n_template = 0;
- m_prebuilt->need_to_access_clustered = FALSE;
+ m_prebuilt->read_just_key = 0;
+ m_prebuilt->autoinc_error = DB_SUCCESS;
+ m_prebuilt->need_to_access_clustered =
+ !!(check_opt->flags & T_EXTEND);
dtuple_set_n_fields(m_prebuilt->search_tuple, 0);
m_prebuilt->select_lock_type = LOCK_NONE;
/* Scan this index. */
- if (dict_index_is_spatial(index)) {
+ if (index->is_spatial()) {
ret = row_count_rtree_recs(m_prebuilt, &n_rows);
+ } else if (index->type & DICT_FTS) {
+ ret = DB_SUCCESS;
} else {
- ret = row_scan_index_for_mysql(
- m_prebuilt, index, &n_rows);
+ ret = row_check_index(m_prebuilt, &n_rows);
}
DBUG_EXECUTE_IF(
@@ -15011,11 +15298,18 @@ ha_innobase::check(
ret = DB_CORRUPTION;
});
- if (ret == DB_INTERRUPTED || thd_killed(m_user_thd)) {
+ if (ret == DB_INTERRUPTED || thd_killed(thd)) {
/* Do not report error since this could happen
during shutdown */
break;
}
+
+ if (ret == DB_SUCCESS
+ && m_prebuilt->autoinc_error != DB_MISSING_HISTORY) {
+ /* See if any non-fatal errors were reported. */
+ ret = m_prebuilt->autoinc_error;
+ }
+
if (ret != DB_SUCCESS) {
/* Assume some kind of corruption. */
push_warning_printf(
@@ -15025,8 +15319,7 @@ ha_innobase::check(
" index %s is corrupted.",
index->name());
is_ok = false;
- dict_set_corrupted(
- index, m_prebuilt->trx, "CHECK TABLE-check index");
+ dict_set_corrupted(index, "CHECK TABLE-check index");
}
@@ -15041,9 +15334,7 @@ ha_innobase::check(
" entries, should be " ULINTPF ".",
index->name(), n_rows, n_rows_in_table);
is_ok = false;
- dict_set_corrupted(
- index, m_prebuilt->trx,
- "CHECK TABLE; Wrong count");
+ dict_set_corrupted(index, "CHECK TABLE; Wrong count");
}
}
@@ -15204,12 +15495,9 @@ get_foreign_key_info(
/* Load referenced table to update FK referenced key name. */
if (foreign->referenced_table == NULL) {
- dict_table_t* ref_table;
-
- ut_ad(mutex_own(&dict_sys.mutex));
- ref_table = dict_table_open_on_name(
+ dict_table_t* ref_table = dict_table_open_on_name(
foreign->referenced_table_name_lookup,
- TRUE, FALSE, DICT_ERR_IGNORE_NONE);
+ true, DICT_ERR_IGNORE_NONE);
if (ref_table == NULL) {
@@ -15222,8 +15510,7 @@ get_foreign_key_info(
<< foreign->foreign_table_name;
}
} else {
-
- dict_table_close(ref_table, TRUE, FALSE);
+ dict_table_close(ref_table, true);
}
}
@@ -15261,7 +15548,7 @@ ha_innobase::get_foreign_key_list(
m_prebuilt->trx->op_info = "getting list of foreign keys";
- mutex_enter(&dict_sys.mutex);
+ dict_sys.lock(SRW_LOCK_CALL);
for (dict_foreign_set::iterator it
= m_prebuilt->table->foreign_set.begin();
@@ -15278,7 +15565,7 @@ ha_innobase::get_foreign_key_list(
}
}
- mutex_exit(&dict_sys.mutex);
+ dict_sys.unlock();
m_prebuilt->trx->op_info = "";
@@ -15299,7 +15586,7 @@ ha_innobase::get_parent_foreign_key_list(
m_prebuilt->trx->op_info = "getting list of referencing foreign keys";
- mutex_enter(&dict_sys.mutex);
+ dict_sys.freeze(SRW_LOCK_CALL);
for (dict_foreign_set::iterator it
= m_prebuilt->table->referenced_set.begin();
@@ -15316,7 +15603,7 @@ ha_innobase::get_parent_foreign_key_list(
}
}
- mutex_exit(&dict_sys.mutex);
+ dict_sys.unfreeze();
m_prebuilt->trx->op_info = "";
@@ -15333,32 +15620,13 @@ struct table_list_item {
const char* name;
};
-/*****************************************************************//**
-Checks if ALTER TABLE may change the storage engine of the table.
-Changing storage engines is not allowed for tables for which there
-are foreign key constraints (parent or child tables).
-@return TRUE if can switch engines */
-
-bool
-ha_innobase::can_switch_engines(void)
-/*=================================*/
+/** @return whether ALTER TABLE may change the storage engine of the table */
+bool ha_innobase::can_switch_engines()
{
- DBUG_ENTER("ha_innobase::can_switch_engines");
-
- update_thd();
-
- m_prebuilt->trx->op_info =
- "determining if there are foreign key constraints";
-
- row_mysql_freeze_data_dictionary(m_prebuilt->trx);
-
- bool can_switch = m_prebuilt->table->referenced_set.empty()
- && m_prebuilt->table->foreign_set.empty();
-
- row_mysql_unfreeze_data_dictionary(m_prebuilt->trx);
- m_prebuilt->trx->op_info = "";
-
- DBUG_RETURN(can_switch);
+ DBUG_ENTER("ha_innobase::can_switch_engines");
+ update_thd();
+ DBUG_RETURN(m_prebuilt->table->foreign_set.empty() &&
+ m_prebuilt->table->referenced_set.empty());
}
/*******************************************************************//**
@@ -15368,30 +15636,12 @@ delete is then allowed internally to resolve a duplicate key conflict in
REPLACE, not an update.
@return > 0 if referenced by a FOREIGN KEY */
-uint
-ha_innobase::referenced_by_foreign_key(void)
-/*========================================*/
+uint ha_innobase::referenced_by_foreign_key()
{
- if (dict_table_is_referenced_by_foreign_key(m_prebuilt->table)) {
-
- return(1);
- }
-
- return(0);
-}
-
-/*******************************************************************//**
-Frees the foreign key create info for a table stored in InnoDB, if it is
-non-NULL. */
-
-void
-ha_innobase::free_foreign_key_create_info(
-/*======================================*/
- char* str) /*!< in, own: create info string to free */
-{
- if (str != NULL) {
- my_free(str);
- }
+ dict_sys.freeze(SRW_LOCK_CALL);
+ const bool empty= m_prebuilt->table->referenced_set.empty();
+ dict_sys.unfreeze();
+ return !empty;
}
/*******************************************************************//**
@@ -15404,11 +15654,9 @@ ha_innobase::extra(
enum ha_extra_function operation)
/*!< in: HA_EXTRA_FLUSH or some other flag */
{
- check_trx_exists(ha_thd());
-
- /* Warning: since it is not sure that MySQL calls external_lock
- before calling this function, the trx field in m_prebuilt can be
- obsolete! */
+ /* Warning: since it is not sure that MariaDB calls external_lock()
+ before calling this function, m_prebuilt->trx can be obsolete! */
+ trx_t* trx = check_trx_exists(ha_thd());
switch (operation) {
case HA_EXTRA_FLUSH:
@@ -15418,7 +15666,16 @@ ha_innobase::extra(
break;
case HA_EXTRA_RESET_STATE:
reset_template();
- thd_to_trx(ha_thd())->duplicates = 0;
+ trx->duplicates = 0;
+ /* fall through */
+ /* HA_EXTRA_IGNORE_INSERT is very similar to
+ HA_EXTRA_IGNORE_DUP_KEY, but with one crucial difference:
+ we want !trx->duplicates for INSERT IGNORE so that
+ row_ins_duplicate_error_in_clust() will acquire a
+ shared lock instead of an exclusive lock. */
+ stmt_boundary:
+ trx->end_bulk_insert(*m_prebuilt->table);
+ trx->bulk_insert = false;
break;
case HA_EXTRA_NO_KEYREAD:
m_prebuilt->read_just_key = 0;
@@ -15429,40 +15686,45 @@ ha_innobase::extra(
case HA_EXTRA_KEYREAD_PRESERVE_FIELDS:
m_prebuilt->keep_other_fields_on_keyread = 1;
break;
-
- /* IMPORTANT: m_prebuilt->trx can be obsolete in
- this method, because it is not sure that MySQL
- calls external_lock before this method with the
- parameters below. We must not invoke update_thd()
- either, because the calling threads may change.
- CAREFUL HERE, OR MEMORY CORRUPTION MAY OCCUR! */
case HA_EXTRA_INSERT_WITH_UPDATE:
- thd_to_trx(ha_thd())->duplicates |= TRX_DUP_IGNORE;
- break;
+ trx->duplicates |= TRX_DUP_IGNORE;
+ goto stmt_boundary;
case HA_EXTRA_NO_IGNORE_DUP_KEY:
- thd_to_trx(ha_thd())->duplicates &= ~TRX_DUP_IGNORE;
- break;
+ trx->duplicates &= ~TRX_DUP_IGNORE;
+ if (trx->is_bulk_insert()) {
+ /* Allow a subsequent INSERT into an empty table
+ if !unique_checks && !foreign_key_checks. */
+ break;
+ }
+ goto stmt_boundary;
case HA_EXTRA_WRITE_CAN_REPLACE:
- thd_to_trx(ha_thd())->duplicates |= TRX_DUP_REPLACE;
- break;
+ trx->duplicates |= TRX_DUP_REPLACE;
+ goto stmt_boundary;
case HA_EXTRA_WRITE_CANNOT_REPLACE:
- thd_to_trx(ha_thd())->duplicates &= ~TRX_DUP_REPLACE;
- break;
+ trx->duplicates &= ~TRX_DUP_REPLACE;
+ if (trx->is_bulk_insert()) {
+ /* Allow a subsequent INSERT into an empty table
+ if !unique_checks && !foreign_key_checks. */
+ break;
+ }
+ goto stmt_boundary;
case HA_EXTRA_BEGIN_ALTER_COPY:
m_prebuilt->table->skip_alter_undo = 1;
if (m_prebuilt->table->is_temporary()
|| !m_prebuilt->table->versioned_by_id()) {
break;
}
- trx_start_if_not_started(m_prebuilt->trx, true);
- m_prebuilt->trx->mod_tables.insert(
- trx_mod_tables_t::value_type(
- const_cast<dict_table_t*>(m_prebuilt->table),
- 0))
+ ut_ad(trx == m_prebuilt->trx);
+ trx_start_if_not_started(trx, true);
+ trx->mod_tables.emplace(
+ const_cast<dict_table_t*>(m_prebuilt->table), 0)
.first->second.set_versioned(0);
break;
case HA_EXTRA_END_ALTER_COPY:
m_prebuilt->table->skip_alter_undo = 0;
+ if (!m_prebuilt->table->is_temporary()) {
+ log_buffer_flush_to_disk();
+ }
break;
default:/* Do nothing */
;
@@ -15490,6 +15752,7 @@ ha_innobase::reset()
/* This is a statement level counter. */
m_prebuilt->autoinc_last_value = 0;
+ m_prebuilt->skip_locked = false;
return(0);
}
@@ -15525,16 +15788,35 @@ ha_innobase::start_stmt(
/* Reset the AUTOINC statement level counter for multi-row INSERTs. */
trx->n_autoinc_rows = 0;
- m_prebuilt->sql_stat_start = TRUE;
+ const auto sql_command = thd_sql_command(thd);
+
m_prebuilt->hint_need_to_fetch_extra_cols = 0;
reset_template();
+ switch (sql_command) {
+ case SQLCOM_INSERT:
+ case SQLCOM_INSERT_SELECT:
+ if (trx->is_bulk_insert()) {
+ /* Allow a subsequent INSERT into an empty table
+ if !unique_checks && !foreign_key_checks. */
+ break;
+ }
+ /* fall through */
+ default:
+ trx->end_bulk_insert(*m_prebuilt->table);
+ if (!trx->bulk_insert) {
+ break;
+ }
+ trx->bulk_insert = false;
+ trx->last_sql_stat_start.least_undo_no = trx->undo_no;
+ }
+
+ m_prebuilt->sql_stat_start = TRUE;
+
if (m_prebuilt->table->is_temporary()
&& m_mysql_has_locked
&& m_prebuilt->select_lock_type == LOCK_NONE) {
- dberr_t error;
-
- switch (thd_sql_command(thd)) {
+ switch (sql_command) {
case SQLCOM_INSERT:
case SQLCOM_UPDATE:
case SQLCOM_DELETE:
@@ -15542,12 +15824,9 @@ ha_innobase::start_stmt(
init_table_handle_for_HANDLER();
m_prebuilt->select_lock_type = LOCK_X;
m_prebuilt->stored_select_lock_type = LOCK_X;
- error = row_lock_table(m_prebuilt);
-
- if (error != DB_SUCCESS) {
- int st = convert_error_code_to_mysql(
- error, 0, thd);
- DBUG_RETURN(st);
+ if (dberr_t error = row_lock_table(m_prebuilt)) {
+ DBUG_RETURN(convert_error_code_to_mysql(
+ error, 0, thd));
}
break;
}
@@ -15561,9 +15840,9 @@ ha_innobase::start_stmt(
m_prebuilt->select_lock_type = LOCK_X;
- } else if (trx->isolation_level != TRX_ISO_SERIALIZABLE
- && thd_sql_command(thd) == SQLCOM_SELECT
- && lock_type == TL_READ) {
+ } else if (sql_command == SQLCOM_SELECT
+ && lock_type == TL_READ
+ && trx->isolation_level != TRX_ISO_SERIALIZABLE) {
/* For other than temporary tables, we obtain
no lock for consistent read (plain SELECT). */
@@ -15671,9 +15950,11 @@ ha_innobase::external_lock(
}
}
+ const auto sql_command = thd_sql_command(thd);
+
/* Check for UPDATEs in read-only mode. */
if (srv_read_only_mode) {
- switch (thd_sql_command(thd)) {
+ switch (sql_command) {
case SQLCOM_CREATE_TABLE:
if (lock_type != F_WRLCK) {
break;
@@ -15700,12 +15981,29 @@ ha_innobase::external_lock(
m_prebuilt->hint_need_to_fetch_extra_cols = 0;
reset_template();
+ switch (sql_command) {
+ case SQLCOM_INSERT:
+ case SQLCOM_INSERT_SELECT:
+ if (trx->is_bulk_insert()) {
+ /* Allow a subsequent INSERT into an empty table
+ if !unique_checks && !foreign_key_checks. */
+ break;
+ }
+ /* fall through */
+ default:
+ trx->end_bulk_insert(*m_prebuilt->table);
+ if (!trx->bulk_insert) {
+ break;
+ }
+ trx->bulk_insert = false;
+ trx->last_sql_stat_start.least_undo_no = trx->undo_no;
+ }
switch (m_prebuilt->table->quiesce) {
case QUIESCE_START:
/* Check for FLUSH TABLE t WITH READ LOCK; */
if (!srv_read_only_mode
- && thd_sql_command(thd) == SQLCOM_FLUSH
+ && sql_command == SQLCOM_FLUSH
&& lock_type == F_RDLCK) {
if (!m_prebuilt->table->space) {
@@ -15789,7 +16087,7 @@ ha_innobase::external_lock(
if (m_prebuilt->select_lock_type != LOCK_NONE) {
- if (thd_sql_command(thd) == SQLCOM_LOCK_TABLES
+ if (sql_command == SQLCOM_LOCK_TABLES
&& THDVAR(thd, table_locks)
&& thd_test_options(thd, OPTION_NOT_AUTOCOMMIT)
&& thd_in_lock_tables(thd)) {
@@ -15907,7 +16205,7 @@ innodb_show_status(
char* str;
size_t flen;
- mutex_enter(&srv_monitor_file_mutex);
+ mysql_mutex_lock(&srv_monitor_file_mutex);
rewind(srv_monitor_file);
srv_printf_innodb_monitor(srv_monitor_file, FALSE,
@@ -15934,7 +16232,7 @@ innodb_show_status(
if (!(str = (char*) my_malloc(PSI_INSTRUMENT_ME,
usable_len + 1, MYF(0)))) {
- mutex_exit(&srv_monitor_file_mutex);
+ mysql_mutex_unlock(&srv_monitor_file_mutex);
DBUG_RETURN(1);
}
@@ -15962,7 +16260,7 @@ innodb_show_status(
flen = fread(str, 1, MAX_STATUS_SIZE - 1, srv_monitor_file);
}
- mutex_exit(&srv_monitor_file_mutex);
+ mysql_mutex_unlock(&srv_monitor_file_mutex);
ret_val= stat_print(
thd, innobase_hton_name,
@@ -15974,335 +16272,6 @@ innodb_show_status(
DBUG_RETURN(ret_val);
}
-/** Callback for collecting mutex statistics */
-struct ShowStatus {
-
- /** For tracking the mutex metrics */
- struct Value {
-
- /** Constructor
- @param[in] name Name of the mutex
- @param[in] spins Number of spins
- @param[in] os_waits OS waits so far
- @param[in] calls Number of calls to enter() */
- Value(const char* name,
- ulint spins,
- uint64_t waits,
- uint64_t calls)
- :
- m_name(name),
- m_spins(spins),
- m_waits(waits),
- m_calls(calls)
- {
- /* No op */
- }
-
- /** Mutex name */
- std::string m_name;
-
- /** Spins so far */
- ulint m_spins;
-
- /** Waits so far */
- uint64_t m_waits;
-
- /** Number of calls so far */
- uint64_t m_calls;
- };
-
- /** Order by m_waits, in descending order. */
- struct OrderByWaits
- {
- /** @return true if rhs < lhs */
- bool operator()(
- const Value& lhs,
- const Value& rhs) const
- UNIV_NOTHROW
- {
- return(rhs.m_waits < lhs.m_waits);
- }
- };
-
- typedef std::vector<Value, ut_allocator<Value> > Values;
-
- /** Collect the individual latch counts */
- struct GetCount {
- typedef latch_meta_t::CounterType::Count Count;
-
- /** Constructor
- @param[in] name Latch name
- @param[in,out] values Put the values here */
- GetCount(
- const char* name,
- Values* values)
- UNIV_NOTHROW
- :
- m_name(name),
- m_values(values)
- {
- /* No op */
- }
-
- /** Collect the latch metrics. Ignore entries where the
- spins and waits are zero.
- @param[in] count The latch metrics */
- void operator()(Count* count) const UNIV_NOTHROW
- {
- if (count->m_spins > 0 || count->m_waits > 0) {
-
- m_values->push_back(Value(
- m_name,
- count->m_spins,
- count->m_waits,
- count->m_calls));
- }
- }
-
- /** The latch name */
- const char* m_name;
-
- /** For collecting the active mutex stats. */
- Values* m_values;
- };
-
- /** Constructor */
- ShowStatus() = default;
-
- /** Callback for collecting the stats
- @param[in] latch_meta Latch meta data
- @return always returns true */
- bool operator()(latch_meta_t& latch_meta)
- UNIV_NOTHROW
- {
- latch_meta.get_counter()->iterate(
- GetCount(latch_meta.get_name(), &m_values));
-
- return(true);
- }
-
- /** Implements the SHOW MUTEX STATUS command, for mutexes.
- The table structure is like so: Engine | Mutex Name | Status
- We store the metrics in the "Status" column as:
-
- spins=N,waits=N,calls=N"
-
- The user has to parse the dataunfortunately
- @param[in,out] thd the MySQL query thread of the caller
- @param[in,out] stat_print function for printing statistics
- @return true on success. */
- bool to_string(
- THD* thd,
- stat_print_fn* stat_print)
- UNIV_NOTHROW;
-
- /** For collecting the active mutex stats. */
- Values m_values;
-};
-
-/** Implements the SHOW MUTEX STATUS command, for mutexes.
-The table structure is like so: Engine | Mutex Name | Status
-We store the metrics in the "Status" column as:
-
- spins=N,waits=N,calls=N"
-
-The user has to parse the dataunfortunately
-@param[in,out] thd the MySQL query thread of the caller
-@param[in,out] stat_print function for printing statistics
-@return true on success. */
-bool
-ShowStatus::to_string(
- THD* thd,
- stat_print_fn* stat_print)
- UNIV_NOTHROW
-{
- uint hton_name_len = (uint) strlen(innobase_hton_name);
-
- std::sort(m_values.begin(), m_values.end(), OrderByWaits());
-
- Values::iterator end = m_values.end();
-
- for (Values::iterator it = m_values.begin(); it != end; ++it) {
-
- int name_len;
- char name_buf[IO_SIZE];
-
- name_len = snprintf(
- name_buf, sizeof(name_buf), "%s", it->m_name.c_str());
-
- int status_len;
- char status_buf[IO_SIZE];
-
- status_len = snprintf(
- status_buf, sizeof(status_buf),
- "spins=%lu,waits=%lu,calls=%llu",
- static_cast<ulong>(it->m_spins),
- static_cast<long>(it->m_waits),
- (ulonglong) it->m_calls);
-
- if (stat_print(thd, innobase_hton_name,
- hton_name_len,
- name_buf, static_cast<uint>(name_len),
- status_buf, static_cast<uint>(status_len))) {
-
- return(false);
- }
- }
-
- return(true);
-}
-
-/** Implements the SHOW MUTEX STATUS command, for mutexes.
-@param[in,out] hton the innodb handlerton
-@param[in,out] thd the MySQL query thread of the caller
-@param[in,out] stat_print function for printing statistics
-@return 0 on success. */
-static
-int
-innodb_show_mutex_status(
- handlerton*
-#ifdef DBUG_ASSERT_EXISTS
- hton
-#endif
- ,
- THD* thd,
- stat_print_fn* stat_print)
-{
- DBUG_ENTER("innodb_show_mutex_status");
-
- ShowStatus collector;
-
- DBUG_ASSERT(hton == innodb_hton_ptr);
-
- mutex_monitor.iterate(collector);
-
- if (!collector.to_string(thd, stat_print)) {
- DBUG_RETURN(1);
- }
-
- DBUG_RETURN(0);
-}
-
-/** Implements the SHOW MUTEX STATUS command.
-@param[in,out] hton the innodb handlerton
-@param[in,out] thd the MySQL query thread of the caller
-@param[in,out] stat_print function for printing statistics
-@return 0 on success. */
-static
-int
-innodb_show_rwlock_status(
- handlerton*
-#ifdef DBUG_ASSERT_EXISTS
- hton
-#endif
- ,
- THD* thd,
- stat_print_fn* stat_print)
-{
- DBUG_ENTER("innodb_show_rwlock_status");
-
- const rw_lock_t* block_rwlock= nullptr;
- ulint block_rwlock_oswait_count = 0;
- uint hton_name_len = (uint) strlen(innobase_hton_name);
-
- DBUG_ASSERT(hton == innodb_hton_ptr);
-
- mutex_enter(&rw_lock_list_mutex);
-
- for (const rw_lock_t& rw_lock : rw_lock_list) {
-
- if (rw_lock.count_os_wait == 0) {
- continue;
- }
-
- int buf1len;
- char buf1[IO_SIZE];
-
- if (rw_lock.is_block_lock) {
-
- block_rwlock = &rw_lock;
- block_rwlock_oswait_count += rw_lock.count_os_wait;
-
- continue;
- }
-
- buf1len = snprintf(
- buf1, sizeof buf1, "rwlock: %s:%u",
- innobase_basename(rw_lock.cfile_name),
- rw_lock.cline);
-
- int buf2len;
- char buf2[IO_SIZE];
-
- buf2len = snprintf(
- buf2, sizeof buf2, "waits=%u",
- rw_lock.count_os_wait);
-
- if (stat_print(thd, innobase_hton_name,
- hton_name_len,
- buf1, static_cast<uint>(buf1len),
- buf2, static_cast<uint>(buf2len))) {
-
- mutex_exit(&rw_lock_list_mutex);
-
- DBUG_RETURN(1);
- }
- }
-
- if (block_rwlock != NULL) {
-
- int buf1len;
- char buf1[IO_SIZE];
-
- buf1len = snprintf(
- buf1, sizeof buf1, "sum rwlock: %s:%u",
- innobase_basename(block_rwlock->cfile_name),
- block_rwlock->cline);
-
- int buf2len;
- char buf2[IO_SIZE];
-
- buf2len = snprintf(
- buf2, sizeof buf2, "waits=" ULINTPF,
- block_rwlock_oswait_count);
-
- if (stat_print(thd, innobase_hton_name,
- hton_name_len,
- buf1, static_cast<uint>(buf1len),
- buf2, static_cast<uint>(buf2len))) {
-
- mutex_exit(&rw_lock_list_mutex);
-
- DBUG_RETURN(1);
- }
- }
-
- mutex_exit(&rw_lock_list_mutex);
-
- DBUG_RETURN(0);
-}
-
-/** Implements the SHOW MUTEX STATUS command.
-@param[in,out] hton the innodb handlerton
-@param[in,out] thd the MySQL query thread of the caller
-@param[in,out] stat_print function for printing statistics
-@return 0 on success. */
-static
-int
-innodb_show_latch_status(
- handlerton* hton,
- THD* thd,
- stat_print_fn* stat_print)
-{
- int ret = innodb_show_mutex_status(hton, thd, stat_print);
-
- if (ret != 0) {
- return(ret);
- }
-
- return(innodb_show_rwlock_status(hton, thd, stat_print));
-}
-
/************************************************************************//**
Return 0 on success and non-zero on failure. Note: the bool return type
seems to be abused here, should be an int. */
@@ -16324,8 +16293,6 @@ innobase_show_status(
return(innodb_show_status(hton, thd, stat_print) != 0);
case HA_ENGINE_MUTEX:
- return(innodb_show_latch_status(hton, thd, stat_print) != 0);
-
case HA_ENGINE_LOGS:
/* Not handled */
break;
@@ -16471,11 +16438,12 @@ ha_innobase::store_lock(
are processing a stored procedure or function, or
2) (we do not know when TL_READ_HIGH_PRIORITY is used), or
3) this is a SELECT ... IN SHARE MODE, or
- 4) we are doing a complex SQL statement like
+ 4) this is a SELECT ... IN SHARE MODE SKIP LOCKED, or
+ 5) we are doing a complex SQL statement like
INSERT INTO ... SELECT ... and the logical logging (MySQL
binlog) requires the use of a locking read, or
MySQL is doing LOCK TABLES ... READ.
- 5) we let InnoDB do locking reads for all SQL statements that
+ 6) we let InnoDB do locking reads for all SQL statements that
are not simple SELECTs; note that select_lock_type in this
case may get strengthened in ::external_lock() to LOCK_X.
Note that we MUST use a locking read in all data modifying
@@ -16521,6 +16489,7 @@ ha_innobase::store_lock(
m_prebuilt->select_lock_type = LOCK_NONE;
m_prebuilt->stored_select_lock_type = LOCK_NONE;
}
+ m_prebuilt->skip_locked= false;
if (!trx_is_started(trx)
&& (m_prebuilt->select_lock_type != LOCK_NONE
@@ -16555,7 +16524,7 @@ ha_innobase::innobase_get_autoinc(
/* It should have been initialized during open. */
if (*value == 0) {
m_prebuilt->autoinc_error = DB_UNSUPPORTED;
- m_prebuilt->table->autoinc_mutex.unlock();
+ m_prebuilt->table->autoinc_mutex.wr_unlock();
}
}
@@ -16579,7 +16548,7 @@ ha_innobase::innobase_peek_autoinc(void)
innodb_table = m_prebuilt->table;
- innodb_table->autoinc_mutex.lock();
+ innodb_table->autoinc_mutex.wr_lock();
auto_inc = dict_table_autoinc_read(innodb_table);
@@ -16588,7 +16557,7 @@ ha_innobase::innobase_peek_autoinc(void)
" '" << innodb_table->name << "'";
}
- innodb_table->autoinc_mutex.unlock();
+ innodb_table->autoinc_mutex.wr_unlock();
return(auto_inc);
}
@@ -16695,7 +16664,7 @@ ha_innobase::get_auto_increment(
/* Out of range number. Let handler::update_auto_increment()
take care of this */
m_prebuilt->autoinc_last_value = 0;
- m_prebuilt->table->autoinc_mutex.unlock();
+ m_prebuilt->table->autoinc_mutex.wr_unlock();
*nb_reserved_values= 0;
return;
}
@@ -16738,7 +16707,7 @@ ha_innobase::get_auto_increment(
m_prebuilt->autoinc_offset = offset;
m_prebuilt->autoinc_increment = increment;
- m_prebuilt->table->autoinc_mutex.unlock();
+ m_prebuilt->table->autoinc_mutex.wr_unlock();
}
/*******************************************************************//**
@@ -17006,7 +16975,7 @@ innobase_xa_prepare(
DBUG_ASSERT(hton == innodb_hton_ptr);
- thd_get_xid(thd, (MYSQL_XID*) trx->xid);
+ thd_get_xid(thd, &reinterpret_cast<MYSQL_XID&>(trx->xid));
if (!trx_is_registered_for_2pc(trx) && trx_is_started(trx)) {
@@ -17138,8 +17107,8 @@ int innobase_rollback_by_xid(handlerton* hton, XID* xid)
/* If a wsrep transaction is being rolled back during
the recovery, we must clear the xid in order to avoid
writing serialisation history for rolled back transaction. */
- if (wsrep_is_wsrep_xid(trx->xid)) {
- trx->xid->null();
+ if (wsrep_is_wsrep_xid(&trx->xid)) {
+ trx->xid.null();
}
#endif /* WITH_WSREP */
int ret = innobase_rollback_trx(trx);
@@ -17452,10 +17421,10 @@ static int innodb_ft_aux_table_validate(THD *thd, st_mysql_sys_var*,
if (const char* table_name = value->val_str(value, buf, &len)) {
if (dict_table_t* table = dict_table_open_on_name(
- table_name, FALSE, TRUE, DICT_ERR_IGNORE_NONE)) {
+ table_name, false, DICT_ERR_IGNORE_NONE)) {
const table_id_t id = dict_table_has_fts_index(table)
? table->id : 0;
- dict_table_close(table, FALSE, FALSE);
+ dict_table_close(table);
if (id) {
innodb_ft_aux_table_id = id;
if (table_name == buf) {
@@ -17582,8 +17551,10 @@ func_exit:
if (block != NULL) {
ib::info() << "Dirtying page: " << block->page.id();
mtr.write<1,mtr_t::FORCED>(*block,
- block->frame + FIL_PAGE_SPACE_ID,
- block->frame[FIL_PAGE_SPACE_ID]);
+ block->page.frame
+ + FIL_PAGE_SPACE_ID,
+ block->page.frame
+ [FIL_PAGE_SPACE_ID]);
}
mtr.commit();
log_write_up_to(mtr.commit_lsn(), true);
@@ -17624,11 +17595,6 @@ innodb_monitor_set_option(
srv_mon_process_existing_counter(
monitor_id, MONITOR_TURN_ON);
}
-
- if (MONITOR_IS_ON(MONITOR_LATCHES)) {
-
- mutex_monitor.enable();
- }
break;
case MONITOR_TURN_OFF:
@@ -17639,25 +17605,14 @@ innodb_monitor_set_option(
MONITOR_OFF(monitor_id);
MONITOR_SET_OFF(monitor_id);
-
- if (!MONITOR_IS_ON(MONITOR_LATCHES)) {
-
- mutex_monitor.disable();
- }
break;
case MONITOR_RESET_VALUE:
srv_mon_reset(monitor_id);
-
- if (monitor_id == (MONITOR_LATCHES)) {
-
- mutex_monitor.reset();
- }
break;
case MONITOR_RESET_ALL_VALUE:
srv_mon_reset_all(monitor_id);
- mutex_monitor.reset();
break;
default:
@@ -18015,7 +17970,8 @@ static bool innodb_buffer_pool_evict_uncompressed()
for (buf_block_t* block = UT_LIST_GET_LAST(buf_pool.unzip_LRU);
block != NULL; ) {
buf_block_t* prev_block = UT_LIST_GET_PREV(unzip_LRU, block);
- ut_ad(block->page.state() == BUF_BLOCK_FILE_PAGE);
+ ut_ad(block->page.in_file());
+ ut_ad(block->page.belongs_to_unzip_LRU());
ut_ad(block->in_unzip_LRU_list);
ut_ad(block->page.in_LRU_list);
@@ -18054,7 +18010,8 @@ innodb_buffer_pool_evict_update(THD*, st_mysql_sys_var*, void*,
return;
}
- os_thread_sleep(10000);
+ std::this_thread::sleep_for(
+ std::chrono::milliseconds(10));
}
/* We failed to evict all uncompressed pages. */
@@ -18304,20 +18261,11 @@ innobase_fts_find_ranking(FT_INFO* fts_hdl, uchar*, uint)
}
#ifdef UNIV_DEBUG
-static my_bool innodb_background_drop_list_empty = TRUE;
static my_bool innodb_log_checkpoint_now = TRUE;
static my_bool innodb_buf_flush_list_now = TRUE;
static uint innodb_merge_threshold_set_all_debug
= DICT_INDEX_MERGE_THRESHOLD_DEFAULT;
-/** Wait for the background drop list to become empty. */
-static
-void
-wait_background_drop_list_empty(THD*, st_mysql_sys_var*, void*, const void*)
-{
- row_wait_for_background_drop_list_empty();
-}
-
/****************************************************************//**
Force innodb to checkpoint. */
static
@@ -18598,107 +18546,6 @@ innodb_encrypt_tables_update(THD*, st_mysql_sys_var*, void*, const void* save)
mysql_mutex_lock(&LOCK_global_system_variables);
}
-/** Issue a deprecation warning for SET GLOBAL innodb_log_checksums.
-@param[in,out] thd client connection */
-static void
-innodb_log_checksums_warn(THD* thd, st_mysql_sys_var*, void*, const void*)
-{
- push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
- HA_ERR_UNSUPPORTED,
- deprecated::innodb_log_checksums_msg);
-}
-
-/** Issue a deprecation warning for SET GLOBAL innodb_log_compressed_pages.
-@param[in,out] thd client connection */
-static void
-innodb_log_compressed_pages_warn(THD* thd, st_mysql_sys_var*, void*,
- const void*)
-{
- push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
- HA_ERR_UNSUPPORTED,
- deprecated::innodb_log_compressed_pages_msg);
-}
-
-/** Issue a deprecation warning for SET GLOBAL innodb_log_optimize_ddl.
-@param[in,out] thd client connection */
-static void
-innodb_log_optimize_ddl_warn(THD* thd, st_mysql_sys_var*, void*, const void*)
-{
- push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
- HA_ERR_UNSUPPORTED,
- deprecated::innodb_log_optimize_ddl_msg);
-}
-
-/** Issue a deprecation warning for SET GLOBAL innodb_page_cleaners.
-@param[in,out] thd client connection */
-static void
-innodb_page_cleaners_warn(THD* thd, st_mysql_sys_var*, void*, const void*)
-{
- push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
- HA_ERR_UNSUPPORTED,
- deprecated::innodb_page_cleaners_msg);
-}
-
-/** Issue a deprecation warning for SET GLOBAL innodb_undo_logs.
-@param[in,out] thd client connection */
-static void
-innodb_undo_logs_warn(THD* thd, st_mysql_sys_var*, void*, const void*)
-{
- push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
- HA_ERR_UNSUPPORTED,
- deprecated::innodb_undo_logs_msg);
-}
-
-/** Issue a deprecation warning for SET GLOBAL innodb_scrub_log_speed.
-@param[in,out] thd client connection */
-static void
-innodb_scrub_log_speed_warn(THD* thd, st_mysql_sys_var*, void*, const void*)
-{
- push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
- HA_ERR_UNSUPPORTED,
- deprecated::innodb_scrub_log_speed_msg);
-}
-
-static void
-innodb_background_scrub_data_uncompressed_warn(THD* thd, st_mysql_sys_var*,
- void*, const void*)
-{
- push_warning_printf(
- thd, Sql_condition::WARN_LEVEL_WARN,
- HA_ERR_UNSUPPORTED,
- deprecated::innodb_background_scrub_data_uncompressed_msg);
-}
-
-static void
-innodb_background_scrub_data_compressed_warn(THD* thd, st_mysql_sys_var*,
- void*, const void*)
-{
- push_warning_printf(
- thd, Sql_condition::WARN_LEVEL_WARN,
- HA_ERR_UNSUPPORTED,
- deprecated::innodb_background_scrub_data_compressed_msg);
-}
-
-static void
-innodb_background_scrub_data_check_interval_warn(
- THD* thd, st_mysql_sys_var*, void*, const void*)
-{
- push_warning_printf(
- thd, Sql_condition::WARN_LEVEL_WARN,
- HA_ERR_UNSUPPORTED,
- deprecated::innodb_background_scrub_data_check_interval_msg);
-}
-
-static void
-innodb_background_scrub_data_interval_warn(
- THD* thd, st_mysql_sys_var*, void*, const void*)
-{
- push_warning_printf(
- thd, Sql_condition::WARN_LEVEL_WARN,
- HA_ERR_UNSUPPORTED,
- deprecated::innodb_background_scrub_data_interval_msg);
-}
-
static SHOW_VAR innodb_status_variables_export[]= {
SHOW_FUNC_ENTRY("Innodb", &show_innodb_vars),
{NullS, NullS, SHOW_LONG}
@@ -18708,133 +18555,92 @@ static struct st_mysql_storage_engine innobase_storage_engine=
{ MYSQL_HANDLERTON_INTERFACE_VERSION };
#ifdef WITH_WSREP
-
-static
-void
-wsrep_kill_victim(
- MYSQL_THD const bf_thd,
- MYSQL_THD thd,
- trx_t* victim_trx,
- my_bool signal)
+/** Request a transaction to be killed that holds a conflicting lock.
+@param bf_trx brute force applier transaction
+@param thd_id thd_get_thread_id(victim_trx->mysql_htd)
+@param trx_id victim_trx->id */
+void lock_wait_wsrep_kill(trx_t *bf_trx, ulong thd_id, trx_id_t trx_id)
{
- DBUG_ENTER("wsrep_kill_victim");
+ THD *bf_thd= bf_trx->mysql_thd;
- /* Mark transaction as a victim for Galera abort */
- victim_trx->lock.was_chosen_as_wsrep_victim= true;
- if (wsrep_thd_set_wsrep_aborter(bf_thd, thd))
+ if (THD *vthd= find_thread_by_id(thd_id))
{
- WSREP_DEBUG("innodb kill transaction skipped due to wsrep_aborter set");
- wsrep_thd_UNLOCK(thd);
- DBUG_VOID_RETURN;
- }
-
- if (wsrep_thd_bf_abort(bf_thd, thd, signal))
- {
- lock_t* wait_lock= victim_trx->lock.wait_lock;
- if (wait_lock)
+ bool aborting= false;
+ wsrep_thd_LOCK(vthd);
+ trx_t *vtrx= thd_to_trx(vthd);
+ if (vtrx)
{
- DBUG_ASSERT(victim_trx->is_wsrep());
- WSREP_DEBUG("victim has wait flag: %lu", thd_get_thread_id(thd));
- victim_trx->lock.was_chosen_as_deadlock_victim= TRUE;
- lock_cancel_waiting_and_release(wait_lock);
+ /* Do not bother with lock elision using transactional memory here;
+ this is rather complex code */
+ LockMutexGuard g{SRW_LOCK_CALL};
+ mysql_mutex_lock(&lock_sys.wait_mutex);
+ vtrx->mutex_lock();
+ /* victim transaction is either active or prepared, if it has already
+ proceeded to replication phase */
+ if (vtrx->id == trx_id)
+ {
+ switch (vtrx->state) {
+ default:
+ break;
+ case TRX_STATE_PREPARED:
+ if (!wsrep_is_wsrep_xid(&vtrx->xid))
+ break;
+ /* fall through */
+ case TRX_STATE_ACTIVE:
+ WSREP_LOG_CONFLICT(bf_thd, vthd, TRUE);
+ WSREP_DEBUG("Aborter BF trx_id: " TRX_ID_FMT " thread: %ld "
+ "seqno: %lld client_state: %s "
+ "client_mode: %s transaction_mode: %s query: %s",
+ bf_trx->id,
+ thd_get_thread_id(bf_thd),
+ wsrep_thd_trx_seqno(bf_thd),
+ wsrep_thd_client_state_str(bf_thd),
+ wsrep_thd_client_mode_str(bf_thd),
+ wsrep_thd_transaction_state_str(bf_thd),
+ wsrep_thd_query(bf_thd));
+ WSREP_DEBUG("Victim %s trx_id: " TRX_ID_FMT " thread: %ld "
+ "seqno: %lld client_state: %s "
+ "client_mode: %s transaction_mode: %s query: %s",
+ wsrep_thd_is_BF(vthd, false) ? "BF" : "normal",
+ vtrx->id,
+ thd_get_thread_id(vthd),
+ wsrep_thd_trx_seqno(vthd),
+ wsrep_thd_client_state_str(vthd),
+ wsrep_thd_client_mode_str(vthd),
+ wsrep_thd_transaction_state_str(vthd),
+ wsrep_thd_query(vthd));
+ /* Mark transaction as a victim for Galera abort */
+ vtrx->lock.set_wsrep_victim();
+ if (!wsrep_thd_set_wsrep_aborter(bf_thd, vthd))
+ aborting= true;
+ else
+ WSREP_DEBUG("kill transaction skipped due to wsrep_aborter set");
+ }
+ }
+ mysql_mutex_unlock(&lock_sys.wait_mutex);
+ vtrx->mutex_unlock();
}
- }
- else
- {
- wsrep_thd_LOCK(thd);
- victim_trx->lock.was_chosen_as_wsrep_victim= false;
- wsrep_thd_set_wsrep_aborter(NULL, thd);
- wsrep_thd_UNLOCK(thd);
-
- WSREP_DEBUG("wsrep_thd_bf_abort has failed, victim %lu will survive",
- thd_get_thread_id(thd));
- }
-
- DBUG_VOID_RETURN;
-}
-
-/** This function is used to kill one transaction.
-
-This transaction was open on this node (not-yet-committed), and a
-conflicting writeset from some other node that was being applied
-caused a locking conflict. First committed (from other node)
-wins, thus open transaction is rolled back. BF stands for
-brute-force: any transaction can get aborted by galera any time
-it is necessary.
-
-This conflict can happen only when the replicated writeset (from
-other node) is being applied, not when it’s waiting in the queue.
-If our local transaction reached its COMMIT and this conflicting
-writeset was in the queue, then it should fail the local
-certification test instead.
-
-A brute force abort is only triggered by a locking conflict
-between a writeset being applied by an applier thread (slave thread)
-and an open transaction on the node, not by a Galera writeset
-comparison as in the local certification failure.
-
-@param[in] bf_thd Brute force (BF) thread
-@param[in,out] victim_trx Vimtim trx to be killed
-@param[in] signal Should victim be signaled */
-void
-wsrep_innobase_kill_one_trx(
- MYSQL_THD const bf_thd,
- trx_t *victim_trx,
- my_bool signal)
-{
- ut_ad(bf_thd);
- ut_ad(victim_trx);
- ut_ad(lock_mutex_own());
- ut_ad(trx_mutex_own(victim_trx));
+ wsrep_thd_UNLOCK(vthd);
+ if (aborting)
+ {
+ /* if victim is waiting for some other lock, we have to cancel
+ that waiting
+ */
+ lock_sys.cancel_lock_wait_for_trx(vtrx);
- DBUG_ENTER("wsrep_innobase_kill_one_trx");
- THD *thd= (THD *) victim_trx->mysql_thd;
- /* Note that bf_trx might not exist here e.g. on MDL conflict
- case (test: galera_concurrent_ctas).*/
- trx_t* bf_trx= (trx_t*)thd_to_trx(bf_thd);
+ DEBUG_SYNC(bf_thd, "before_wsrep_thd_abort");
+ if (!wsrep_thd_bf_abort(bf_thd, vthd, true))
+ {
+ wsrep_thd_LOCK(vthd);
+ wsrep_thd_set_wsrep_aborter(NULL, vthd);
+ wsrep_thd_UNLOCK(vthd);
- if (!thd)
- {
- WSREP_WARN("no THD for trx: " TRX_ID_FMT, victim_trx->id);
- DBUG_VOID_RETURN;
+ WSREP_DEBUG("wsrep_thd_bf_abort has failed, victim %lu will survive",
+ thd_get_thread_id(vthd));
+ }
+ }
+ wsrep_thd_kill_UNLOCK(vthd);
}
-
- /* Here we need to lock THD::LOCK_thd_data to protect from
- concurrent usage or disconnect or delete. */
- DEBUG_SYNC(bf_thd, "wsrep_before_BF_victim_lock");
- wsrep_thd_LOCK(thd);
- DEBUG_SYNC(bf_thd, "wsrep_after_BF_victim_lock");
-
- WSREP_LOG_CONFLICT(bf_thd, thd, TRUE);
-
- WSREP_DEBUG("wsrep_innobase_kill_one_trx: Aborter %s "
- "trx_id: " TRX_ID_FMT " thread: %ld "
- "seqno: %lld client_state: %s client_mode: %s "
- "trx_state %s query: %s",
- wsrep_thd_is_BF(bf_thd, false) ? "BF" : "normal",
- bf_trx ? bf_trx->id : TRX_ID_MAX,
- thd_get_thread_id(bf_thd),
- wsrep_thd_trx_seqno(bf_thd),
- wsrep_thd_client_state_str(bf_thd),
- wsrep_thd_client_mode_str(bf_thd),
- wsrep_thd_transaction_state_str(bf_thd),
- wsrep_thd_query(bf_thd));
-
- WSREP_DEBUG("wsrep_innobase_kill_one_trx: Victim %s "
- "trx_id: " TRX_ID_FMT " thread: %ld "
- "seqno: %lld client_state: %s client_mode: %s "
- "trx_state %s query: %s",
- wsrep_thd_is_BF(thd, false) ? "BF" : "normal",
- victim_trx->id,
- thd_get_thread_id(thd),
- wsrep_thd_trx_seqno(thd),
- wsrep_thd_client_state_str(thd),
- wsrep_thd_client_mode_str(thd),
- wsrep_thd_transaction_state_str(thd),
- wsrep_thd_query(thd));
-
- wsrep_kill_victim(bf_thd, thd, victim_trx, signal);
- DBUG_VOID_RETURN;
}
/** This function forces the victim transaction to abort. Aborting the
@@ -18854,42 +18660,54 @@ wsrep_abort_transaction(
THD *victim_thd,
my_bool signal)
{
- /* Note that victim thd is protected with
- THD::LOCK_thd_data and THD::LOCK_thd_kill here. */
- trx_t* victim_trx= thd_to_trx(victim_thd);
- trx_t* bf_trx= thd_to_trx(bf_thd);
- WSREP_DEBUG("wsrep_abort_transaction: BF:"
- " thread %ld client_state %s client_mode %s"
- " trans_state %s query %s trx " TRX_ID_FMT,
- thd_get_thread_id(bf_thd),
- wsrep_thd_client_state_str(bf_thd),
- wsrep_thd_client_mode_str(bf_thd),
- wsrep_thd_transaction_state_str(bf_thd),
- wsrep_thd_query(bf_thd),
- bf_trx ? bf_trx->id : 0);
-
- WSREP_DEBUG("wsrep_abort_transaction: victim:"
- " thread %ld client_state %s client_mode %s"
- " trans_state %s query %s trx " TRX_ID_FMT,
- thd_get_thread_id(victim_thd),
- wsrep_thd_client_state_str(victim_thd),
- wsrep_thd_client_mode_str(victim_thd),
- wsrep_thd_transaction_state_str(victim_thd),
- wsrep_thd_query(victim_thd),
- victim_trx ? victim_trx->id : 0);
-
- if (victim_trx)
- {
- lock_mutex_enter();
- trx_mutex_enter(victim_trx);
- wsrep_kill_victim(bf_thd, victim_thd, victim_trx, signal);
- lock_mutex_exit();
- trx_mutex_exit(victim_trx);
- }
- else
- {
- wsrep_thd_bf_abort(bf_thd, victim_thd, signal);
- }
+ DBUG_ENTER("wsrep_abort_transaction");
+ ut_ad(bf_thd);
+ ut_ad(victim_thd);
+
+ wsrep_thd_kill_LOCK(victim_thd);
+ wsrep_thd_LOCK(victim_thd);
+ trx_t* victim_trx= thd_to_trx(victim_thd);
+ wsrep_thd_UNLOCK(victim_thd);
+
+ WSREP_DEBUG("abort transaction: BF: %s victim: %s victim conf: %s",
+ wsrep_thd_query(bf_thd),
+ wsrep_thd_query(victim_thd),
+ wsrep_thd_transaction_state_str(victim_thd));
+
+ if (victim_trx) {
+ victim_trx->lock.set_wsrep_victim();
+
+ wsrep_thd_LOCK(victim_thd);
+ bool aborting= !wsrep_thd_set_wsrep_aborter(bf_thd, victim_thd);
+ wsrep_thd_UNLOCK(victim_thd);
+ if (aborting) {
+ DEBUG_SYNC(bf_thd, "before_wsrep_thd_abort");
+ DBUG_EXECUTE_IF("sync.before_wsrep_thd_abort",
+ {
+ const char act[]=
+ "now "
+ "SIGNAL sync.before_wsrep_thd_abort_reached "
+ "WAIT_FOR signal.before_wsrep_thd_abort";
+ DBUG_ASSERT(!debug_sync_set_action(bf_thd,
+ STRING_WITH_LEN(act)));
+ };);
+ wsrep_thd_bf_abort(bf_thd, victim_thd, signal);
+ }
+ } else {
+ DBUG_EXECUTE_IF("sync.before_wsrep_thd_abort",
+ {
+ const char act[]=
+ "now "
+ "SIGNAL sync.before_wsrep_thd_abort_reached "
+ "WAIT_FOR signal.before_wsrep_thd_abort";
+ DBUG_ASSERT(!debug_sync_set_action(bf_thd,
+ STRING_WITH_LEN(act)));
+ };);
+ wsrep_thd_bf_abort(bf_thd, victim_thd, signal);
+ }
+
+ wsrep_thd_kill_UNLOCK(victim_thd);
+ DBUG_VOID_RETURN;
}
static
@@ -18945,24 +18763,16 @@ static MYSQL_SYSVAR_ENUM(checksum_algorithm, srv_checksum_algorithm,
" to match when reading;"
" NONE"
" write a constant magic number, do not do any checksum verification"
- " when reading (same as innodb_checksums=OFF);"
+ " when reading;"
" STRICT_NONE"
" write a constant magic number, do not allow values other than that"
" magic number when reading;"
" Files updated when this option is set to crc32 or strict_crc32 will"
" not be readable by MariaDB versions older than 10.0.4;"
" new files created with full_crc32 are readable by MariaDB 10.4.3+",
- NULL, innodb_checksum_algorithm_update, SRV_CHECKSUM_ALGORITHM_FULL_CRC32,
+ NULL, NULL, SRV_CHECKSUM_ALGORITHM_FULL_CRC32,
&innodb_checksum_algorithm_typelib);
-/** Description of deprecated and ignored parameters */
-static const char* innodb_deprecated_ignored
-= "Deprecated parameter with no effect.";
-
-static MYSQL_SYSVAR_BOOL(log_checksums, deprecated::innodb_log_checksums,
- PLUGIN_VAR_RQCMDARG,
- innodb_deprecated_ignored, NULL, innodb_log_checksums_warn, TRUE);
-
static MYSQL_SYSVAR_STR(data_home_dir, innobase_data_home_dir,
PLUGIN_VAR_READONLY,
"The common part for InnoDB table spaces.",
@@ -18977,9 +18787,7 @@ static MYSQL_SYSVAR_BOOL(doublewrite, srv_use_doublewrite_buf,
static MYSQL_SYSVAR_BOOL(use_atomic_writes, srv_use_atomic_writes,
PLUGIN_VAR_NOCMDARG | PLUGIN_VAR_READONLY,
"Enable atomic writes, instead of using the doublewrite buffer, for files "
- "on devices that supports atomic writes. "
- "This option only works on Linux with either FusionIO cards using "
- "the directFS filesystem or with Shannon cards using any file system.",
+ "on devices that supports atomic writes.",
NULL, NULL, TRUE);
static MYSQL_SYSVAR_BOOL(stats_include_delete_marked,
@@ -19007,12 +18815,6 @@ static MYSQL_SYSVAR_ULONG(io_capacity_max, srv_max_io_capacity,
SRV_MAX_IO_CAPACITY_LIMIT, 0);
#ifdef UNIV_DEBUG
-static MYSQL_SYSVAR_BOOL(background_drop_list_empty,
- innodb_background_drop_list_empty,
- PLUGIN_VAR_OPCMDARG,
- "Wait for the background drop list to become empty",
- NULL, wait_background_drop_list_empty, FALSE);
-
static MYSQL_SYSVAR_BOOL(log_checkpoint_now, innodb_log_checkpoint_now,
PLUGIN_VAR_OPCMDARG,
"Force checkpoint now",
@@ -19045,14 +18847,6 @@ static MYSQL_SYSVAR_UINT(purge_threads, srv_n_purge_threads,
"Number of tasks for purging transaction history",
NULL, NULL, 4, 1, innodb_purge_threads_MAX, 0);
-static MYSQL_SYSVAR_ULONG(sync_array_size, srv_sync_array_size,
- PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_READONLY,
- "Size of the mutex/lock wait array.",
- NULL, NULL,
- 1, /* Default setting */
- 1, /* Minimum value */
- 1024, 0); /* Maximum value */
-
static MYSQL_SYSVAR_UINT(fast_shutdown, srv_fast_shutdown,
PLUGIN_VAR_OPCMDARG,
"Speeds up the shutdown process of the InnoDB storage engine. Possible"
@@ -19090,32 +18884,16 @@ static MYSQL_SYSVAR_ULONG(flush_log_at_trx_commit, srv_flush_log_at_trx_commit,
" guarantees in case of crash. 0 and 2 can be faster than 1 or 3.",
NULL, NULL, 1, 0, 3, 0);
-static MYSQL_SYSVAR_ENUM(flush_method, innodb_flush_method,
+static MYSQL_SYSVAR_ENUM(flush_method, srv_file_flush_method,
PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
"With which method to flush data.",
- NULL, NULL, IF_WIN(SRV_ALL_O_DIRECT_FSYNC, SRV_FSYNC),
+ NULL, NULL, IF_WIN(SRV_ALL_O_DIRECT_FSYNC, SRV_O_DIRECT),
&innodb_flush_method_typelib);
-static MYSQL_SYSVAR_STR(file_format, deprecated::innodb_file_format,
- PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
- innodb_deprecated_ignored, NULL, NULL, NULL);
-static MYSQL_SYSVAR_STR(large_prefix, deprecated::innodb_large_prefix,
- PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
- innodb_deprecated_ignored, NULL, NULL, NULL);
-
-static MYSQL_SYSVAR_BOOL(force_load_corrupted, srv_load_corrupted,
- PLUGIN_VAR_NOCMDARG | PLUGIN_VAR_READONLY,
- "Force InnoDB to load metadata of corrupted table.",
- NULL, NULL, FALSE);
-
static MYSQL_SYSVAR_STR(log_group_home_dir, srv_log_group_home_dir,
PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
"Path to InnoDB log files.", NULL, NULL, NULL);
-static MYSQL_SYSVAR_ULONG(page_cleaners, deprecated::innodb_page_cleaners,
- PLUGIN_VAR_RQCMDARG,
- innodb_deprecated_ignored, NULL, innodb_page_cleaners_warn, 0, 0, 64, 0);
-
static MYSQL_SYSVAR_DOUBLE(max_dirty_pages_pct, srv_max_buf_pool_modified_pct,
PLUGIN_VAR_RQCMDARG,
"Percentage of dirty pages allowed in bufferpool.",
@@ -19237,26 +19015,12 @@ static MYSQL_SYSVAR_ULONG(adaptive_hash_index_parts, btr_ahi_parts,
NULL, NULL, 8, 1, 512, 0);
#endif /* BTR_CUR_HASH_ADAPT */
-static MYSQL_SYSVAR_UINT(replication_delay, deprecated::replication_delay,
- PLUGIN_VAR_RQCMDARG,
- innodb_deprecated_ignored, nullptr, deprecated::replication_delay_warn,
- 0, 0, ~0U, 0);
-
static MYSQL_SYSVAR_UINT(compression_level, page_zip_level,
PLUGIN_VAR_RQCMDARG,
"Compression level used for zlib compression. 0 is no compression"
", 1 is fastest, 9 is best compression and default is 6.",
NULL, NULL, DEFAULT_COMPRESSION_LEVEL, 0, 9, 0);
-static MYSQL_SYSVAR_BOOL(log_compressed_pages,
- deprecated::innodb_log_compressed_pages,
- PLUGIN_VAR_OPCMDARG,
- innodb_deprecated_ignored, NULL, innodb_log_compressed_pages_warn, TRUE);
-
-static MYSQL_SYSVAR_BOOL(log_optimize_ddl, deprecated::innodb_log_optimize_ddl,
- PLUGIN_VAR_OPCMDARG,
- innodb_deprecated_ignored, NULL, innodb_log_optimize_ddl_warn, FALSE);
-
static MYSQL_SYSVAR_UINT(autoextend_increment,
sys_tablespace_auto_extend_increment,
PLUGIN_VAR_RQCMDARG,
@@ -19271,23 +19035,6 @@ static MYSQL_SYSVAR_ULONG(buffer_pool_chunk_size, srv_buf_pool_chunk_unit,
NULL, NULL,
128 * 1024 * 1024, 1024 * 1024, LONG_MAX, 1024 * 1024);
-static MYSQL_SYSVAR_ENUM(lock_schedule_algorithm, innodb_lock_schedule_algorithm,
- PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
- "The algorithm Innodb uses for deciding which locks to grant next when"
- " a lock is released. Possible values are"
- " FCFS"
- " grant the locks in First-Come-First-Served order;"
- " VATS"
- " use the Variance-Aware-Transaction-Scheduling algorithm, which"
- " uses an Eldest-Transaction-First heuristic.",
- NULL, NULL, INNODB_LOCK_SCHEDULE_ALGORITHM_FCFS,
- &innodb_lock_schedule_algorithm_typelib);
-
-static MYSQL_SYSVAR_ULONG(buffer_pool_instances,
- deprecated::innodb_buffer_pool_instances,
- PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
- innodb_deprecated_ignored, NULL, NULL, 0, 0, 64, 0);
-
static MYSQL_SYSVAR_STR(buffer_pool_filename, srv_buf_dump_filename,
PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
"Filename to/from which to dump/load the InnoDB buffer pool",
@@ -19409,23 +19156,18 @@ static MYSQL_SYSVAR_ULONG(flush_neighbors, srv_flush_neighbors,
" when flushing a block",
NULL, NULL, 1, 0, 2, 0);
-static MYSQL_SYSVAR_UINT(commit_concurrency, deprecated::commit_concurrency,
- PLUGIN_VAR_RQCMDARG,
- innodb_deprecated_ignored, nullptr, deprecated::commit_concurrency_warn,
- 0, 0, 1000, 0);
-
-static MYSQL_SYSVAR_UINT(concurrency_tickets, deprecated::concurrency_tickets,
- PLUGIN_VAR_RQCMDARG,
- innodb_deprecated_ignored, nullptr, deprecated::concurrency_tickets_warn,
- 0, 0, ~0U, 0);
-
-static MYSQL_SYSVAR_BOOL(deadlock_detect, innobase_deadlock_detect,
+static MYSQL_SYSVAR_BOOL(deadlock_detect, innodb_deadlock_detect,
PLUGIN_VAR_NOCMDARG,
"Enable/disable InnoDB deadlock detector (default ON)."
" if set to OFF, deadlock detection is skipped,"
" and we rely on innodb_lock_wait_timeout in case of deadlock.",
NULL, NULL, TRUE);
+static MYSQL_SYSVAR_ENUM(deadlock_report, innodb_deadlock_report,
+ PLUGIN_VAR_RQCMDARG,
+ "How to report deadlocks (if innodb_deadlock_detect=ON).",
+ NULL, NULL, Deadlock::REPORT_FULL, &innodb_deadlock_report_typelib);
+
static MYSQL_SYSVAR_UINT(fill_factor, innobase_fill_factor,
PLUGIN_VAR_RQCMDARG,
"Percentage of B-tree page filled during bulk insert",
@@ -19548,10 +19290,6 @@ static MYSQL_SYSVAR_ULONGLONG(log_file_size, srv_log_file_size,
NULL, NULL, 96 << 20, 1 << 20, std::numeric_limits<ulonglong>::max(),
UNIV_PAGE_SIZE_MAX);
-static MYSQL_SYSVAR_ULONG(log_files_in_group, deprecated::srv_n_log_files,
- PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
- innodb_deprecated_ignored, NULL, NULL, 1, 1, 100, 0);
-
static MYSQL_SYSVAR_ULONG(log_write_ahead_size, srv_log_write_ahead_size,
PLUGIN_VAR_RQCMDARG,
"Redo log write ahead unit size to avoid read-on-write,"
@@ -19586,28 +19324,12 @@ static MYSQL_SYSVAR_UINT(spin_wait_delay, srv_spin_wait_delay,
"Maximum delay between polling for a spin lock (4 by default)",
NULL, NULL, 4, 0, 6000, 0);
-static MYSQL_SYSVAR_UINT(thread_concurrency, deprecated::thread_concurrency,
- PLUGIN_VAR_RQCMDARG,
- innodb_deprecated_ignored, nullptr, deprecated::thread_concurrency_warn,
- 0, 0, 1000, 0);
-
-static MYSQL_SYSVAR_UINT(
- adaptive_max_sleep_delay, deprecated::adaptive_max_sleep_delay,
- PLUGIN_VAR_RQCMDARG,
- innodb_deprecated_ignored,
- nullptr, deprecated::adaptive_max_sleep_delay_warn, 0, 0, 1000000, 0);
-
static MYSQL_SYSVAR_BOOL(prefix_index_cluster_optimization,
srv_prefix_index_cluster_optimization,
PLUGIN_VAR_OPCMDARG,
"Enable prefix optimization to sometimes avoid cluster index lookups.",
NULL, NULL, FALSE);
-static MYSQL_SYSVAR_UINT(thread_sleep_delay, deprecated::thread_sleep_delay,
- PLUGIN_VAR_RQCMDARG,
- innodb_deprecated_ignored, nullptr, deprecated::thread_sleep_delay_warn,
- 0, 0, 1000000, 0);
-
static MYSQL_SYSVAR_STR(data_file_path, innobase_data_file_path,
PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
"Path to individual files and their sizes.",
@@ -19631,11 +19353,6 @@ static MYSQL_SYSVAR_ULONG(undo_tablespaces, srv_undo_tablespaces,
0L, /* Minimum value */
TRX_SYS_MAX_UNDO_SPACES, 0); /* Maximum value */
-static MYSQL_SYSVAR_ULONG(undo_logs, deprecated::innodb_undo_logs,
- PLUGIN_VAR_OPCMDARG,
- innodb_deprecated_ignored, NULL, innodb_undo_logs_warn,
- TRX_SYS_N_RSEGS, 0, TRX_SYS_N_RSEGS, 0);
-
static MYSQL_SYSVAR_ULONGLONG(max_undo_log_size, srv_max_undo_log_size,
PLUGIN_VAR_OPCMDARG,
"Desired maximum UNDO tablespace size in bytes",
@@ -19670,10 +19387,37 @@ static MYSQL_SYSVAR_STR(version, innodb_version_str,
PLUGIN_VAR_NOCMDOPT | PLUGIN_VAR_READONLY,
"InnoDB version", NULL, NULL, INNODB_VERSION_STR);
+#ifdef HAVE_URING
+# include <sys/utsname.h>
+static utsname uname_for_io_uring;
+#else
+static
+#endif
+bool innodb_use_native_aio_default()
+{
+#ifdef HAVE_URING
+ utsname &u= uname_for_io_uring;
+ if (!uname(&u) && u.release[0] == '5' && u.release[1] == '.' &&
+ u.release[2] == '1' && u.release[3] >= '1' && u.release[3] <= '5' &&
+ u.release[4] == '.')
+ {
+ if (u.release[3] == '5') {
+ const char *s= strstr(u.version, "5.15.");
+ if (s || (s= strstr(u.release, "5.15.")))
+ if ((s[5] >= '3' || s[6] >= '0'))
+ return true; /* 5.15.3 and later should be fine */
+ }
+ io_uring_may_be_unsafe= u.release;
+ return false; /* working around io_uring hangs (MDEV-26674) */
+ }
+#endif
+ return true;
+}
+
static MYSQL_SYSVAR_BOOL(use_native_aio, srv_use_native_aio,
PLUGIN_VAR_NOCMDARG | PLUGIN_VAR_READONLY,
"Use native AIO if supported on this platform.",
- NULL, NULL, TRUE);
+ NULL, NULL, innodb_use_native_aio_default());
#ifdef HAVE_LIBNUMA
static MYSQL_SYSVAR_BOOL(numa_interleave, srv_numa_interleave,
@@ -19801,6 +19545,11 @@ static MYSQL_SYSVAR_BOOL(read_only, srv_read_only_mode,
"Start InnoDB in read only mode (off by default)",
NULL, NULL, FALSE);
+static MYSQL_SYSVAR_BOOL(read_only_compressed, innodb_read_only_compressed,
+ PLUGIN_VAR_OPCMDARG,
+ "Make ROW_FORMAT=COMPRESSED tables read-only",
+ NULL, NULL, FALSE);
+
static MYSQL_SYSVAR_BOOL(cmp_per_index_enabled, srv_cmp_per_index_enabled,
PLUGIN_VAR_OPCMDARG,
"Enable INFORMATION_SCHEMA.innodb_cmp_per_index,"
@@ -19853,11 +19602,6 @@ static MYSQL_SYSVAR_UINT(saved_page_number_debug,
srv_saved_page_number_debug, PLUGIN_VAR_OPCMDARG,
"An InnoDB page number.",
NULL, NULL, 0, 0, UINT_MAX32, 0);
-
-static MYSQL_SYSVAR_BOOL(sync_debug, srv_sync_debug,
- PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_READONLY,
- "Enable the sync debug checks",
- NULL, NULL, FALSE);
#endif /* UNIV_DEBUG */
static MYSQL_SYSVAR_BOOL(force_primary_key,
@@ -19928,17 +19672,7 @@ static MYSQL_SYSVAR_UINT(encryption_rotation_iops, srv_n_fil_crypt_iops,
"Use this many iops for background key rotation",
NULL,
innodb_encryption_rotation_iops_update,
- srv_n_fil_crypt_iops, 0, UINT_MAX32, 0);
-
-static MYSQL_SYSVAR_BOOL(scrub_log, deprecated::innodb_scrub_log,
- PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_READONLY,
- innodb_deprecated_ignored,
- 0, 0, 0);
-
-static MYSQL_SYSVAR_ULONGLONG(scrub_log_speed, deprecated::innodb_scrub_log_speed,
- PLUGIN_VAR_OPCMDARG,
- innodb_deprecated_ignored, NULL, innodb_scrub_log_speed_warn,
- 256, 1, 50000, 0);
+ 100, 0, UINT_MAX32, 0);
static MYSQL_SYSVAR_BOOL(encrypt_log, srv_encrypt_log,
PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_READONLY,
@@ -19951,26 +19685,6 @@ static MYSQL_SYSVAR_BOOL(immediate_scrub_data_uncompressed,
"Enable scrubbing of data",
NULL, NULL, FALSE);
-static MYSQL_SYSVAR_BOOL(background_scrub_data_uncompressed,
- deprecated::innodb_background_scrub_data_uncompressed,
- PLUGIN_VAR_OPCMDARG, innodb_deprecated_ignored, NULL,
- innodb_background_scrub_data_uncompressed_warn, FALSE);
-
-static MYSQL_SYSVAR_BOOL(background_scrub_data_compressed,
- deprecated::innodb_background_scrub_data_compressed,
- PLUGIN_VAR_OPCMDARG, innodb_deprecated_ignored, NULL,
- innodb_background_scrub_data_compressed_warn, FALSE);
-
-static MYSQL_SYSVAR_UINT(background_scrub_data_check_interval,
- deprecated::innodb_background_scrub_data_check_interval,
- 0, innodb_deprecated_ignored, NULL,
- innodb_background_scrub_data_check_interval_warn, 0, 0, 0, 0);
-
-static MYSQL_SYSVAR_UINT(background_scrub_data_interval,
- deprecated::innodb_background_scrub_data_interval,
- 0, innodb_deprecated_ignored, NULL,
- innodb_background_scrub_data_interval_warn, 0, 0, 0, 0);
-
static MYSQL_SYSVAR_BOOL(encrypt_temporary_tables, innodb_encrypt_temporary_tables,
PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_READONLY,
"Enrypt the temporary table data.",
@@ -19980,7 +19694,6 @@ static struct st_mysql_sys_var* innobase_system_variables[]= {
MYSQL_SYSVAR(autoextend_increment),
MYSQL_SYSVAR(buffer_pool_size),
MYSQL_SYSVAR(buffer_pool_chunk_size),
- MYSQL_SYSVAR(buffer_pool_instances),
MYSQL_SYSVAR(buffer_pool_filename),
MYSQL_SYSVAR(buffer_pool_dump_now),
MYSQL_SYSVAR(buffer_pool_dump_at_shutdown),
@@ -20004,9 +19717,6 @@ static struct st_mysql_sys_var* innobase_system_variables[]= {
MYSQL_SYSVAR(lru_flush_size),
MYSQL_SYSVAR(flush_neighbors),
MYSQL_SYSVAR(checksum_algorithm),
- MYSQL_SYSVAR(log_checksums),
- MYSQL_SYSVAR(commit_concurrency),
- MYSQL_SYSVAR(concurrency_tickets),
MYSQL_SYSVAR(compression_level),
MYSQL_SYSVAR(data_file_path),
MYSQL_SYSVAR(temp_data_file_path),
@@ -20018,7 +19728,6 @@ static struct st_mysql_sys_var* innobase_system_variables[]= {
MYSQL_SYSVAR(read_io_threads),
MYSQL_SYSVAR(write_io_threads),
MYSQL_SYSVAR(file_per_table),
- MYSQL_SYSVAR(file_format), /* deprecated in MariaDB 10.2; no effect */
MYSQL_SYSVAR(flush_log_at_timeout),
MYSQL_SYSVAR(flush_log_at_trx_commit),
MYSQL_SYSVAR(flush_method),
@@ -20032,19 +19741,14 @@ static struct st_mysql_sys_var* innobase_system_variables[]= {
MYSQL_SYSVAR(ft_min_token_size),
MYSQL_SYSVAR(ft_num_word_optimize),
MYSQL_SYSVAR(ft_sort_pll_degree),
- MYSQL_SYSVAR(large_prefix), /* deprecated in MariaDB 10.2; no effect */
- MYSQL_SYSVAR(force_load_corrupted),
- MYSQL_SYSVAR(lock_schedule_algorithm),
MYSQL_SYSVAR(lock_wait_timeout),
MYSQL_SYSVAR(deadlock_detect),
+ MYSQL_SYSVAR(deadlock_report),
MYSQL_SYSVAR(page_size),
MYSQL_SYSVAR(log_buffer_size),
MYSQL_SYSVAR(log_file_size),
- MYSQL_SYSVAR(log_files_in_group),
MYSQL_SYSVAR(log_write_ahead_size),
MYSQL_SYSVAR(log_group_home_dir),
- MYSQL_SYSVAR(log_compressed_pages),
- MYSQL_SYSVAR(log_optimize_ddl),
MYSQL_SYSVAR(max_dirty_pages_pct),
MYSQL_SYSVAR(max_dirty_pages_pct_lwm),
MYSQL_SYSVAR(adaptive_flushing_lwm),
@@ -20076,7 +19780,6 @@ static struct st_mysql_sys_var* innobase_system_variables[]= {
MYSQL_SYSVAR(adaptive_hash_index_parts),
#endif /* BTR_CUR_HASH_ADAPT */
MYSQL_SYSVAR(stats_method),
- MYSQL_SYSVAR(replication_delay),
MYSQL_SYSVAR(status_file),
MYSQL_SYSVAR(strict_mode),
MYSQL_SYSVAR(sort_buffer_size),
@@ -20084,10 +19787,7 @@ static struct st_mysql_sys_var* innobase_system_variables[]= {
MYSQL_SYSVAR(sync_spin_loops),
MYSQL_SYSVAR(spin_wait_delay),
MYSQL_SYSVAR(table_locks),
- MYSQL_SYSVAR(thread_concurrency),
- MYSQL_SYSVAR(adaptive_max_sleep_delay),
MYSQL_SYSVAR(prefix_index_cluster_optimization),
- MYSQL_SYSVAR(thread_sleep_delay),
MYSQL_SYSVAR(tmpdir),
MYSQL_SYSVAR(autoinc_lock_mode),
MYSQL_SYSVAR(version),
@@ -20104,10 +19804,10 @@ static struct st_mysql_sys_var* innobase_system_variables[]= {
MYSQL_SYSVAR(random_read_ahead),
MYSQL_SYSVAR(read_ahead_threshold),
MYSQL_SYSVAR(read_only),
+ MYSQL_SYSVAR(read_only_compressed),
MYSQL_SYSVAR(instant_alter_column_allowed),
MYSQL_SYSVAR(io_capacity),
MYSQL_SYSVAR(io_capacity_max),
- MYSQL_SYSVAR(page_cleaners),
MYSQL_SYSVAR(monitor_enable),
MYSQL_SYSVAR(monitor_disable),
MYSQL_SYSVAR(monitor_reset),
@@ -20115,7 +19815,6 @@ static struct st_mysql_sys_var* innobase_system_variables[]= {
MYSQL_SYSVAR(purge_threads),
MYSQL_SYSVAR(purge_batch_size),
#ifdef UNIV_DEBUG
- MYSQL_SYSVAR(background_drop_list_empty),
MYSQL_SYSVAR(log_checkpoint_now),
MYSQL_SYSVAR(buf_flush_list_now),
MYSQL_SYSVAR(merge_threshold_set_all_debug),
@@ -20124,13 +19823,11 @@ static struct st_mysql_sys_var* innobase_system_variables[]= {
MYSQL_SYSVAR(status_output_locks),
MYSQL_SYSVAR(print_all_deadlocks),
MYSQL_SYSVAR(cmp_per_index_enabled),
- MYSQL_SYSVAR(undo_logs),
MYSQL_SYSVAR(max_undo_log_size),
MYSQL_SYSVAR(purge_rseg_truncate_frequency),
MYSQL_SYSVAR(undo_log_truncate),
MYSQL_SYSVAR(undo_directory),
MYSQL_SYSVAR(undo_tablespaces),
- MYSQL_SYSVAR(sync_array_size),
MYSQL_SYSVAR(compression_failure_threshold_pct),
MYSQL_SYSVAR(compression_pad_pct_max),
MYSQL_SYSVAR(default_row_format),
@@ -20142,7 +19839,6 @@ static struct st_mysql_sys_var* innobase_system_variables[]= {
MYSQL_SYSVAR(data_file_size_debug),
MYSQL_SYSVAR(fil_make_page_dirty_debug),
MYSQL_SYSVAR(saved_page_number_debug),
- MYSQL_SYSVAR(sync_debug),
#endif /* UNIV_DEBUG */
MYSQL_SYSVAR(force_primary_key),
MYSQL_SYSVAR(fatal_semaphore_wait_threshold),
@@ -20154,16 +19850,9 @@ static struct st_mysql_sys_var* innobase_system_variables[]= {
MYSQL_SYSVAR(encryption_threads),
MYSQL_SYSVAR(encryption_rotate_key_age),
MYSQL_SYSVAR(encryption_rotation_iops),
- MYSQL_SYSVAR(scrub_log),
- MYSQL_SYSVAR(scrub_log_speed),
MYSQL_SYSVAR(encrypt_log),
MYSQL_SYSVAR(default_encryption_key_id),
- /* Scrubing feature */
MYSQL_SYSVAR(immediate_scrub_data_uncompressed),
- MYSQL_SYSVAR(background_scrub_data_uncompressed),
- MYSQL_SYSVAR(background_scrub_data_compressed),
- MYSQL_SYSVAR(background_scrub_data_interval),
- MYSQL_SYSVAR(background_scrub_data_check_interval),
MYSQL_SYSVAR(buf_dump_status_frequency),
MYSQL_SYSVAR(background_thread),
MYSQL_SYSVAR(encrypt_temporary_tables),
@@ -20214,10 +19903,7 @@ i_s_innodb_sys_fields,
i_s_innodb_sys_foreign,
i_s_innodb_sys_foreign_cols,
i_s_innodb_sys_tablespaces,
-i_s_innodb_sys_datafiles,
i_s_innodb_sys_virtual,
-i_s_innodb_mutexes,
-i_s_innodb_sys_semaphore_waits,
i_s_innodb_tablespaces_encryption
maria_declare_plugin_end;
@@ -20227,11 +19913,6 @@ static
void
innodb_params_adjust()
{
- /* The default value and the max value of
- innodb_undo_logs must be equal to the available undo logs. */
- MYSQL_SYSVAR_NAME(undo_logs).max_val
- = MYSQL_SYSVAR_NAME(undo_logs).def_val
- = srv_available_undo_logs;
MYSQL_SYSVAR_NAME(max_undo_log_size).max_val
= 1ULL << (32U + srv_page_size_shift);
MYSQL_SYSVAR_NAME(max_undo_log_size).min_val
@@ -20374,10 +20055,10 @@ TABLE* innobase_init_vc_templ(dict_table_t* table)
dict_vcol_templ_t* vc_templ = UT_NEW_NOKEY(dict_vcol_templ_t());
- mutex_enter(&dict_sys.mutex);
+ dict_sys.lock(SRW_LOCK_CALL);
table->vc_templ = vc_templ;
innobase_build_v_templ(mysql_table, table, vc_templ, nullptr, true);
- mutex_exit(&dict_sys.mutex);
+ dict_sys.unlock();
DBUG_RETURN(mysql_table);
}
@@ -20436,7 +20117,7 @@ innobase_rename_vc_templ(
for purge thread.
*/
-bool innobase_allocate_row_for_vcol(THD *thd, dict_index_t *index,
+bool innobase_allocate_row_for_vcol(THD *thd, const dict_index_t *index,
mem_heap_t **heap, TABLE **table,
VCOL_STORAGE *storage)
{
@@ -21340,7 +21021,6 @@ static const size_t MAX_BUF_SIZE = 4 * 1024;
/********************************************************************//**
Helper function to push warnings from InnoDB internals to SQL-layer. */
-UNIV_INTERN
void
ib_push_warning(
trx_t* trx, /*!< in: trx */
@@ -21368,7 +21048,6 @@ ib_push_warning(
/********************************************************************//**
Helper function to push warnings from InnoDB internals to SQL-layer. */
-UNIV_INTERN
void
ib_push_warning(
void* ithd, /*!< in: thd */
@@ -21404,7 +21083,6 @@ ib_push_warning(
@param[in] table_name Table name
@param[in] format Warning message
@param[in] ... Message arguments */
-UNIV_INTERN
void
ib_foreign_warn(trx_t* trx, /*!< in: trx */
dberr_t error, /*!< in: error code to push as warning */
@@ -21425,13 +21103,13 @@ ib_foreign_warn(trx_t* trx, /*!< in: trx */
vsprintf(buf, format, args);
va_end(args);
- mutex_enter(&dict_foreign_err_mutex);
+ mysql_mutex_lock(&dict_foreign_err_mutex);
rewind(ef);
ut_print_timestamp(ef);
fprintf(ef, " Error in foreign key constraint of table %s:\n",
table_name);
fputs(buf, ef);
- mutex_exit(&dict_foreign_err_mutex);
+ mysql_mutex_unlock(&dict_foreign_err_mutex);
if (trx && trx->mysql_thd) {
THD* thd = (THD*)trx->mysql_thd;
@@ -21447,10 +21125,8 @@ ib_foreign_warn(trx_t* trx, /*!< in: trx */
/********************************************************************//**
Helper function to push frm mismatch error to error log and
if needed to sql-layer. */
-UNIV_INTERN
void
ib_push_frm_error(
-/*==============*/
THD* thd, /*!< in: MySQL thd */
dict_table_t* ib_table, /*!< in: InnoDB table */
TABLE* table, /*!< in: MySQL table */
@@ -21504,10 +21180,7 @@ ib_push_frm_error(
sql_print_error("InnoDB: Table %s contains " ULINTPF " "
"indexes inside InnoDB, which "
"is different from the number of "
- "indexes %u defined in the MariaDB "
- " Have you mixed up "
- ".frm files from different "
- "installations? See "
+ "indexes %u defined in the .frm file. See "
"https://mariadb.com/kb/en/innodb-troubleshooting/\n",
ib_table->name.m_name, n_keys,
table->s->keys);
diff --git a/storage/innobase/handler/ha_innodb.h b/storage/innobase/handler/ha_innodb.h
index 13d36e71e40..5f1b67bb249 100644
--- a/storage/innobase/handler/ha_innodb.h
+++ b/storage/innobase/handler/ha_innodb.h
@@ -63,8 +63,6 @@ public:
ROW_TYPE_NOT_USED, the information in HA_CREATE_INFO should be used. */
enum row_type get_row_type() const override;
- const char* table_type() const;
-
const char* index_type(uint key_number) override;
Table_flags table_flags() const override;
@@ -186,26 +184,24 @@ public:
void update_create_info(HA_CREATE_INFO* create_info) override;
- inline int create(
+ int create(
const char* name,
TABLE* form,
HA_CREATE_INFO* create_info,
bool file_per_table,
- trx_t* trx = NULL);
+ trx_t* trx);
int create(
const char* name,
TABLE* form,
HA_CREATE_INFO* create_info) override;
- inline int delete_table(const char* name, enum_sql_command sqlcom);
-
int truncate() override;
int delete_table(const char *name) override;
int rename_table(const char* from, const char* to) override;
- inline int defragment_table(const char* name);
+ inline int defragment_table();
int check(THD* thd, HA_CHECK_OPT* check_opt) override;
inline void reload_statistics();
@@ -223,7 +219,7 @@ public:
uint referenced_by_foreign_key() override;
- void free_foreign_key_create_info(char* str) override;
+ void free_foreign_key_create_info(char* str) override { my_free(str); }
uint lock_count(void) const override;
@@ -456,6 +452,9 @@ protected:
@see build_template() */
void reset_template();
+ /** @return whether the table is read-only */
+ bool is_read_only(bool altering_to_supported= false) const;
+
inline void update_thd(THD* thd);
void update_thd();
@@ -529,26 +528,12 @@ the definitions are bracketed with #ifdef INNODB_COMPATIBILITY_HOOKS */
extern "C" {
-/** Check if a user thread is a replication slave thread
-@param thd user thread
-@retval 0 the user thread is not a replication slave thread
-@retval 1 the user thread is a replication slave thread */
-int thd_slave_thread(const MYSQL_THD thd);
-
/** Check if a user thread is running a non-transactional update
@param thd user thread
@retval 0 the user thread is not running a non-transactional update
@retval 1 the user thread is running a non-transactional update */
int thd_non_transactional_update(const MYSQL_THD thd);
-/** Get high resolution timestamp for the current query start time.
-The timestamp is not anchored to any specific point in time,
-but can be used for comparison.
-@param thd user thread
-@retval timestamp in microseconds precision
-*/
-unsigned long long thd_start_utime(const MYSQL_THD thd);
-
/** Get the user thread's binary logging format
@param thd user thread
@return Value to be used as index into the binlog_format_names array */
@@ -652,8 +637,9 @@ public:
@param create_fk whether to add FOREIGN KEY constraints */
int create_table(bool create_fk = true);
- /** Update the internal data dictionary. */
- int create_table_update_dict();
+ static void create_table_update_dict(dict_table_t* table, THD* thd,
+ const HA_CREATE_INFO& info,
+ const TABLE& t);
/** Validates the create options. Checks that the options
KEY_BLOCK_SIZE, ROW_FORMAT, DATA DIRECTORY, TEMPORARY & TABLESPACE
@@ -713,29 +699,13 @@ public:
trx_t* trx() const
{ return(m_trx); }
- /** Return table name. */
- const char* table_name() const
- { return(m_table_name); }
-
- /** @return whether the table needs to be dropped on rollback */
- bool drop_before_rollback() const { return m_drop_before_rollback; }
-
- THD* thd() const
- { return(m_thd); }
-
- /** Normalizes a table name string.
- A normalized name consists of the database name catenated to '/' and
- table name. An example: test/mytable. On Windows normalization puts
- both the database name and the table name always to lower case if
- "set_lower_case" is set to true.
- @param[in,out] norm_name Buffer to return the normalized name in.
- @param[in] name Table name string.
- @param[in] set_lower_case True if we want to set name to lower
- case. */
- static void normalize_table_name_low(
- char* norm_name,
- const char* name,
- ibool set_lower_case);
+ /** @return table name */
+ const char* table_name() const { return(m_table_name); }
+
+ /** @return the created table */
+ dict_table_t *table() const { return m_table; }
+
+ THD* thd() const { return(m_thd); }
private:
/** Parses the table name into normal name and either temp path or
@@ -766,8 +736,6 @@ private:
char* m_table_name;
/** Table */
dict_table_t* m_table;
- /** Whether the table needs to be dropped before rollback */
- bool m_drop_before_rollback;
/** Remote path (DATA DIRECTORY) or zero length-string */
char* m_remote_path;
@@ -870,15 +838,6 @@ innodb_base_col_setup_for_stored(
/** whether this is a stored generated column */
#define innobase_is_s_fld(field) ((field)->vcol_info && (field)->stored_in_db())
-/** Always normalize table name to lower case on Windows */
-#ifdef _WIN32
-#define normalize_table_name(norm_name, name) \
- create_table_info_t::normalize_table_name_low(norm_name, name, TRUE)
-#else
-#define normalize_table_name(norm_name, name) \
- create_table_info_t::normalize_table_name_low(norm_name, name, FALSE)
-#endif /* _WIN32 */
-
/** Converts a search mode flag understood by MySQL to a flag understood
by InnoDB.
@param[in] find_flag MySQL search mode flag.
@@ -947,10 +906,8 @@ innodb_col_no(const Field* field)
/********************************************************************//**
Helper function to push frm mismatch error to error log and
if needed to sql-layer. */
-UNIV_INTERN
void
ib_push_frm_error(
-/*==============*/
THD* thd, /*!< in: MySQL thd */
dict_table_t* ib_table, /*!< in: InnoDB table */
TABLE* table, /*!< in: MySQL table */
diff --git a/storage/innobase/handler/handler0alter.cc b/storage/innobase/handler/handler0alter.cc
index e3c802a7b46..12dfc2f5a98 100644
--- a/storage/innobase/handler/handler0alter.cc
+++ b/storage/innobase/handler/handler0alter.cc
@@ -35,7 +35,7 @@ Smart ALTER TABLE
#include "btr0sea.h"
#include "dict0crea.h"
#include "dict0dict.h"
-#include "dict0priv.h"
+#include "dict0load.h"
#include "dict0stats.h"
#include "dict0stats_bg.h"
#include "log0log.h"
@@ -46,6 +46,7 @@ Smart ALTER TABLE
#include "row0row.h"
#include "row0upd.h"
#include "trx0trx.h"
+#include "trx0purge.h"
#include "handler0alter.h"
#include "srv0mon.h"
#include "srv0srv.h"
@@ -55,11 +56,9 @@ Smart ALTER TABLE
#include "row0sel.h"
#include "ha_innodb.h"
#include "ut0stage.h"
-#include "span.h"
#include <thread>
#include <sstream>
-using st_::span;
/** File format constraint for ALTER TABLE */
extern ulong innodb_instant_alter_column_allowed;
@@ -104,6 +103,8 @@ static const alter_table_operations INNOBASE_ALTER_REBUILD
static const alter_table_operations INNOBASE_ALTER_DATA
= INNOBASE_ONLINE_CREATE | INNOBASE_ALTER_REBUILD;
+#define ALTER_INDEX_IGNORABILITY 0
+
/** Operations for altering a table that InnoDB does not care about */
static const alter_table_operations INNOBASE_INPLACE_IGNORE
= ALTER_COLUMN_DEFAULT
@@ -116,7 +117,8 @@ static const alter_table_operations INNOBASE_INPLACE_IGNORE
| ALTER_RENAME
| ALTER_INDEX_ORDER
| ALTER_COLUMN_INDEX_LENGTH
- | ALTER_CHANGE_INDEX_COMMENT;
+ | ALTER_CHANGE_INDEX_COMMENT
+ | ALTER_INDEX_IGNORABILITY;
/** Operations on foreign key definitions (changing the schema only) */
static const alter_table_operations INNOBASE_FOREIGN_OPERATIONS
@@ -496,7 +498,7 @@ inline bool dict_table_t::instant_column(const dict_table_t& table,
DBUG_ASSERT(table.n_cols + table.n_dropped() >= n_cols + n_dropped());
DBUG_ASSERT(!table.persistent_autoinc
|| persistent_autoinc == table.persistent_autoinc);
- ut_ad(mutex_own(&dict_sys.mutex));
+ ut_ad(dict_sys.locked());
{
const char* end = table.col_names;
@@ -741,7 +743,7 @@ inline void dict_table_t::rollback_instant(
const char* old_v_col_names,
const ulint* col_map)
{
- ut_d(dict_sys.assert_locked());
+ ut_ad(dict_sys.locked());
if (cols == old_cols) {
/* Alter fails before instant operation happens.
@@ -845,10 +847,156 @@ inline void dict_table_t::rollback_instant(
}
}
+/* Report an InnoDB error to the client by invoking my_error(). */
+static ATTRIBUTE_COLD __attribute__((nonnull))
+void
+my_error_innodb(
+/*============*/
+ dberr_t error, /*!< in: InnoDB error code */
+ const char* table, /*!< in: table name */
+ ulint flags) /*!< in: table flags */
+{
+ switch (error) {
+ case DB_MISSING_HISTORY:
+ my_error(ER_TABLE_DEF_CHANGED, MYF(0));
+ break;
+ case DB_RECORD_NOT_FOUND:
+ my_error(ER_KEY_NOT_FOUND, MYF(0), table);
+ break;
+ case DB_DEADLOCK:
+ my_error(ER_LOCK_DEADLOCK, MYF(0));
+ break;
+ case DB_LOCK_WAIT_TIMEOUT:
+ my_error(ER_LOCK_WAIT_TIMEOUT, MYF(0));
+ break;
+ case DB_INTERRUPTED:
+ my_error(ER_QUERY_INTERRUPTED, MYF(0));
+ break;
+ case DB_OUT_OF_MEMORY:
+ my_error(ER_OUT_OF_RESOURCES, MYF(0));
+ break;
+ case DB_OUT_OF_FILE_SPACE:
+ my_error(ER_RECORD_FILE_FULL, MYF(0), table);
+ break;
+ case DB_TEMP_FILE_WRITE_FAIL:
+ my_error(ER_TEMP_FILE_WRITE_FAILURE, MYF(0));
+ break;
+ case DB_TOO_BIG_INDEX_COL:
+ my_error(ER_INDEX_COLUMN_TOO_LONG, MYF(0),
+ (ulong) DICT_MAX_FIELD_LEN_BY_FORMAT_FLAG(flags));
+ break;
+ case DB_TOO_MANY_CONCURRENT_TRXS:
+ my_error(ER_TOO_MANY_CONCURRENT_TRXS, MYF(0));
+ break;
+ case DB_LOCK_TABLE_FULL:
+ my_error(ER_LOCK_TABLE_FULL, MYF(0));
+ break;
+ case DB_UNDO_RECORD_TOO_BIG:
+ my_error(ER_UNDO_RECORD_TOO_BIG, MYF(0));
+ break;
+ case DB_CORRUPTION:
+ my_error(ER_NOT_KEYFILE, MYF(0), table);
+ break;
+ case DB_TOO_BIG_RECORD: {
+ /* Note that in page0zip.ic page_zip_rec_needs_ext() rec_size
+ is limited to COMPRESSED_REC_MAX_DATA_SIZE (16K) or
+ REDUNDANT_REC_MAX_DATA_SIZE (16K-1). */
+ bool comp = !!(flags & DICT_TF_COMPACT);
+ ulint free_space = page_get_free_space_of_empty(comp) / 2;
+
+ if (free_space >= ulint(comp ? COMPRESSED_REC_MAX_DATA_SIZE :
+ REDUNDANT_REC_MAX_DATA_SIZE)) {
+ free_space = (comp ? COMPRESSED_REC_MAX_DATA_SIZE :
+ REDUNDANT_REC_MAX_DATA_SIZE) - 1;
+ }
+
+ my_error(ER_TOO_BIG_ROWSIZE, MYF(0), free_space);
+ break;
+ }
+ case DB_INVALID_NULL:
+ /* TODO: report the row, as we do for DB_DUPLICATE_KEY */
+ my_error(ER_INVALID_USE_OF_NULL, MYF(0));
+ break;
+ case DB_CANT_CREATE_GEOMETRY_OBJECT:
+ my_error(ER_CANT_CREATE_GEOMETRY_OBJECT, MYF(0));
+ break;
+ case DB_TABLESPACE_EXISTS:
+ my_error(ER_TABLESPACE_EXISTS, MYF(0), table);
+ break;
+
+#ifdef UNIV_DEBUG
+ case DB_SUCCESS:
+ case DB_DUPLICATE_KEY:
+ case DB_ONLINE_LOG_TOO_BIG:
+ /* These codes should not be passed here. */
+ ut_error;
+#endif /* UNIV_DEBUG */
+ default:
+ my_error(ER_GET_ERRNO, MYF(0), error, "InnoDB");
+ break;
+ }
+}
+
+/** Get the name of an erroneous key.
+@param[in]	error_key_num	InnoDB number of the erroneous key
+@param[in] ha_alter_info changes that were being performed
+@param[in] table InnoDB table
+@return the name of the erroneous key */
+static
+const char*
+get_error_key_name(
+ ulint error_key_num,
+ const Alter_inplace_info* ha_alter_info,
+ const dict_table_t* table)
+{
+ if (error_key_num == ULINT_UNDEFINED) {
+ return(FTS_DOC_ID_INDEX_NAME);
+ } else if (ha_alter_info->key_count == 0) {
+ return(dict_table_get_first_index(table)->name);
+ } else {
+ return(ha_alter_info->key_info_buffer[error_key_num].name.str);
+ }
+}
+
+/** Convert field type and length to InnoDB format */
+static void get_type(const Field &f, uint &prtype, uint8_t &mtype,
+ uint16_t &len)
+{
+ mtype= get_innobase_type_from_mysql_type(&prtype, &f);
+ len= static_cast<uint16_t>(f.pack_length());
+ prtype|= f.type();
+ if (f.type() == MYSQL_TYPE_VARCHAR)
+ {
+ auto l= static_cast<const Field_varstring&>(f).length_bytes;
+ len= static_cast<uint16_t>(len - l);
+ if (l == 2)
+ prtype|= DATA_LONG_TRUE_VARCHAR;
+ }
+ if (!f.real_maybe_null())
+ prtype |= DATA_NOT_NULL;
+ if (f.binary())
+ prtype |= DATA_BINARY_TYPE;
+ if (f.table->versioned())
+ {
+ if (&f == f.table->field[f.table->s->vers.start_fieldno])
+ prtype|= DATA_VERS_START;
+ else if (&f == f.table->field[f.table->s->vers.end_fieldno])
+ prtype|= DATA_VERS_END;
+ else if (!(f.flags & VERS_UPDATE_UNVERSIONED_FLAG))
+ prtype|= DATA_VERSIONED;
+ }
+
+ if (!f.stored_in_db())
+ prtype|= DATA_VIRTUAL;
+
+ if (dtype_is_string_type(mtype))
+ prtype|= f.charset()->number << 16;
+}
+
struct ha_innobase_inplace_ctx : public inplace_alter_handler_ctx
{
/** Dummy query graph */
- que_thr_t* thr;
+ que_thr_t*const thr;
/** The prebuilt struct of the creating instance */
row_prebuilt_t*& prebuilt;
/** InnoDB indexes being created */
@@ -870,9 +1018,9 @@ struct ha_innobase_inplace_ctx : public inplace_alter_handler_ctx
/** number of InnoDB foreign key constraints being dropped */
const ulint num_to_add_fk;
/** whether to create the indexes online */
- bool online;
+ const bool online;
/** memory heap */
- mem_heap_t* heap;
+ mem_heap_t* const heap;
/** dictionary transaction */
trx_t* trx;
/** original table (if rebuilt, differs from indexed_table) */
@@ -939,6 +1087,10 @@ struct ha_innobase_inplace_ctx : public inplace_alter_handler_ctx
/** The page_compression_level attribute, or 0 */
const uint page_compression_level;
+ /** Indexed columns whose charset-collation is changing
+ in a way that does not require the table to be rebuilt */
+ col_collations change_col_collate;
+
ha_innobase_inplace_ctx(row_prebuilt_t*& prebuilt_arg,
dict_index_t** drop_arg,
ulint num_to_drop_arg,
@@ -957,12 +1109,15 @@ struct ha_innobase_inplace_ctx : public inplace_alter_handler_ctx
bool page_compressed,
ulonglong page_compression_level_arg) :
inplace_alter_handler_ctx(),
+ thr (pars_complete_graph_for_exec(nullptr, prebuilt_arg->trx,
+ heap_arg, prebuilt_arg)),
prebuilt (prebuilt_arg),
add_index (0), add_key_numbers (0), num_to_add_index (0),
drop_index (drop_arg), num_to_drop_index (num_to_drop_arg),
drop_fk (drop_fk_arg), num_to_drop_fk (num_to_drop_fk_arg),
add_fk (add_fk_arg), num_to_add_fk (num_to_add_fk_arg),
- online (online_arg), heap (heap_arg), trx (0),
+ online (online_arg), heap (heap_arg),
+ trx (innobase_trx_allocate(prebuilt_arg->trx->mysql_thd)),
old_table (prebuilt_arg->table),
new_table (new_table_arg), instant_table (0),
col_map (0), col_names (col_names_arg),
@@ -1009,8 +1164,7 @@ struct ha_innobase_inplace_ctx : public inplace_alter_handler_ctx
}
#endif /* UNIV_DEBUG */
- thr = pars_complete_graph_for_exec(NULL, prebuilt->trx, heap,
- prebuilt);
+ trx_start_for_ddl(trx);
}
~ha_innobase_inplace_ctx()
@@ -1021,7 +1175,7 @@ struct ha_innobase_inplace_ctx : public inplace_alter_handler_ctx
while (dict_index_t* index
= UT_LIST_GET_LAST(instant_table->indexes)) {
UT_LIST_REMOVE(instant_table->indexes, index);
- rw_lock_free(&index->lock);
+ index->lock.free();
dict_mem_index_free(index);
}
for (unsigned i = old_n_v_cols; i--; ) {
@@ -1173,110 +1327,142 @@ struct ha_innobase_inplace_ctx : public inplace_alter_handler_ctx
}
}
-private:
- // Disable copying
- ha_innobase_inplace_ctx(const ha_innobase_inplace_ctx&);
- ha_innobase_inplace_ctx& operator=(const ha_innobase_inplace_ctx&);
-};
+ /** @return whether a FULLTEXT INDEX is being added */
+ bool adding_fulltext_index() const
+ {
+ for (ulint a= 0; a < num_to_add_index; a++)
+ if (add_index[a]->type & DICT_FTS)
+ return true;
+ return false;
+ }
-/********************************************************************//**
-Get the upper limit of the MySQL integral and floating-point type.
-@return maximum allowed value for the field */
-UNIV_INTERN
-ulonglong
-innobase_get_int_col_max_value(
-/*===========================*/
- const Field* field); /*!< in: MySQL field */
+ /** Handle the apply log failure for online DDL operation.
+ @param ha_alter_info handler alter inplace info
+ @param altered_table MySQL table that is being altered
+ @param error error code
+  @retval false if error value is DB_SUCCESS
+  @retval true  in case of error
+ bool log_failure(Alter_inplace_info *ha_alter_info,
+ TABLE *altered_table, dberr_t error)
+ {
+ ulint err_key= thr_get_trx(thr)->error_key_num;
+ switch (error) {
+ KEY *dup_key;
+ case DB_SUCCESS:
+ return false;
+ case DB_DUPLICATE_KEY:
+ if (err_key == ULINT_UNDEFINED)
+ /* This should be the hidden index on FTS_DOC_ID */
+ dup_key= nullptr;
+ else
+ {
+ DBUG_ASSERT(err_key < ha_alter_info->key_count);
+ dup_key= &ha_alter_info->key_info_buffer[err_key];
+ }
+ print_keydup_error(altered_table, dup_key, MYF(0));
+ break;
+ case DB_ONLINE_LOG_TOO_BIG:
+ my_error(ER_INNODB_ONLINE_LOG_TOO_BIG, MYF(0),
+ get_error_key_name(err_key, ha_alter_info, new_table));
+ break;
+ case DB_INDEX_CORRUPT:
+ my_error(ER_INDEX_CORRUPT, MYF(0),
+ get_error_key_name(err_key, ha_alter_info, new_table));
+ break;
+ default:
+ my_error_innodb(error, old_table->name.m_name, old_table->flags);
+ }
+ return true;
+ }
-/* Report an InnoDB error to the client by invoking my_error(). */
-static ATTRIBUTE_COLD __attribute__((nonnull))
-void
-my_error_innodb(
-/*============*/
- dberr_t error, /*!< in: InnoDB error code */
- const char* table, /*!< in: table name */
- ulint flags) /*!< in: table flags */
-{
- switch (error) {
- case DB_MISSING_HISTORY:
- my_error(ER_TABLE_DEF_CHANGED, MYF(0));
- break;
- case DB_RECORD_NOT_FOUND:
- my_error(ER_KEY_NOT_FOUND, MYF(0), table);
- break;
- case DB_DEADLOCK:
- my_error(ER_LOCK_DEADLOCK, MYF(0));
- break;
- case DB_LOCK_WAIT_TIMEOUT:
- my_error(ER_LOCK_WAIT_TIMEOUT, MYF(0));
- break;
- case DB_INTERRUPTED:
- my_error(ER_QUERY_INTERRUPTED, MYF(0));
- break;
- case DB_OUT_OF_MEMORY:
- my_error(ER_OUT_OF_RESOURCES, MYF(0));
- break;
- case DB_OUT_OF_FILE_SPACE:
- my_error(ER_RECORD_FILE_FULL, MYF(0), table);
- break;
- case DB_TEMP_FILE_WRITE_FAIL:
- my_error(ER_TEMP_FILE_WRITE_FAILURE, MYF(0));
- break;
- case DB_TOO_BIG_INDEX_COL:
- my_error(ER_INDEX_COLUMN_TOO_LONG, MYF(0),
- (ulong) DICT_MAX_FIELD_LEN_BY_FORMAT_FLAG(flags));
- break;
- case DB_TOO_MANY_CONCURRENT_TRXS:
- my_error(ER_TOO_MANY_CONCURRENT_TRXS, MYF(0));
- break;
- case DB_LOCK_TABLE_FULL:
- my_error(ER_LOCK_TABLE_FULL, MYF(0));
- break;
- case DB_UNDO_RECORD_TOO_BIG:
- my_error(ER_UNDO_RECORD_TOO_BIG, MYF(0));
- break;
- case DB_CORRUPTION:
- my_error(ER_NOT_KEYFILE, MYF(0), table);
- break;
- case DB_TOO_BIG_RECORD: {
- /* Note that in page0zip.ic page_zip_rec_needs_ext() rec_size
- is limited to COMPRESSED_REC_MAX_DATA_SIZE (16K) or
- REDUNDANT_REC_MAX_DATA_SIZE (16K-1). */
- bool comp = !!(flags & DICT_TF_COMPACT);
- ulint free_space = page_get_free_space_of_empty(comp) / 2;
+ /** Check whether the column has any change in collation type.
+  If so, store the column information in the heap
+ @param index index being added (or rebuilt)
+ @param altered_table altered table definition */
+ void change_col_collation(dict_index_t *index, const TABLE &altered_table)
+ {
+ ut_ad(!need_rebuild());
+ ut_ad(!index->is_primary());
+ ut_ad(!index->is_committed());
- if (free_space >= ulint(comp ? COMPRESSED_REC_MAX_DATA_SIZE :
- REDUNDANT_REC_MAX_DATA_SIZE)) {
- free_space = (comp ? COMPRESSED_REC_MAX_DATA_SIZE :
- REDUNDANT_REC_MAX_DATA_SIZE) - 1;
- }
+ unsigned n_cols= 0;
+ for (unsigned i= 0; i < index->n_fields; i++)
+ {
+ const char *field_name= index->fields[i].name();
+ if (!field_name || !dtype_is_string_type(index->fields[i].col->mtype))
+ continue;
+ for (uint j= 0; j < altered_table.s->fields; j++)
+ {
+ const Field *altered_field= altered_table.field[j];
- my_error(ER_TOO_BIG_ROWSIZE, MYF(0), free_space);
- break;
- }
- case DB_INVALID_NULL:
- /* TODO: report the row, as we do for DB_DUPLICATE_KEY */
- my_error(ER_INVALID_USE_OF_NULL, MYF(0));
- break;
- case DB_CANT_CREATE_GEOMETRY_OBJECT:
- my_error(ER_CANT_CREATE_GEOMETRY_OBJECT, MYF(0));
- break;
- case DB_TABLESPACE_EXISTS:
- my_error(ER_TABLESPACE_EXISTS, MYF(0), table);
- break;
+ if (my_strcasecmp(system_charset_info, field_name,
+ altered_field->field_name.str))
+ continue;
-#ifdef UNIV_DEBUG
- case DB_SUCCESS:
- case DB_DUPLICATE_KEY:
- case DB_ONLINE_LOG_TOO_BIG:
- /* These codes should not be passed here. */
- ut_error;
-#endif /* UNIV_DEBUG */
- default:
- my_error(ER_GET_ERRNO, MYF(0), error, "InnoDB");
- break;
- }
-}
+ unsigned prtype;
+ uint8_t mtype;
+ uint16_t len;
+ get_type(*altered_field, prtype, mtype, len);
+
+ if (prtype == index->fields[i].col->prtype)
+ continue;
+ auto it= change_col_collate.find(index->fields[i].col->ind);
+ if (it != change_col_collate.end())
+ {
+ n_cols++;
+ index->fields[i].col= it->second;
+ continue;
+ }
+
+ const CHARSET_INFO *cs= altered_field->charset();
+
+ dict_col_t *col=
+ static_cast<dict_col_t*>(mem_heap_alloc(heap, sizeof *col));
+ *col= *index->fields[i].col;
+ col->prtype= prtype;
+ col->mtype= mtype;
+ col->mbminlen= cs->mbminlen & 7;
+ col->mbmaxlen= cs->mbmaxlen & 7;
+ col->len= len;
+ index->fields[i].col= col;
+ n_cols++;
+ change_col_collate[col->ind]= col;
+ }
+ }
+
+ index->init_change_cols(n_cols);
+ }
+
+ void cleanup_col_collation()
+ {
+ ut_ad(old_table == new_table);
+ if (change_col_collate.empty())
+ return;
+ const dict_index_t *index= dict_table_get_first_index(old_table);
+ while ((index= dict_table_get_next_index(index)) != nullptr)
+ {
+ if (index->is_committed())
+ continue;
+ auto collate_end= change_col_collate.end();
+ for (unsigned i= 0, j= 0; i < index->n_fields; i++)
+ {
+ const dict_col_t *col= index->fields[i].col;
+ auto it= change_col_collate.find(col->ind);
+ if (it != collate_end)
+ {
+ ut_ad(it->second == col);
+ index->fields[i].col=
+ index->change_col_info->add(index->heap, *col, j++);
+ }
+ }
+ }
+ }
+};
+
+/********************************************************************//**
+Get the upper limit of the MySQL integral and floating-point type.
+@return maximum allowed value for the field */
+ulonglong innobase_get_int_col_max_value(const Field *field);
/** Determine if fulltext indexes exist in a given table.
@param table MySQL table
@@ -1911,7 +2097,7 @@ innobase_fts_check_doc_id_col(
col = dict_table_get_nth_col(table, i);
/* Because the FTS_DOC_ID does not exist in
- the MySQL data dictionary, this must be the
+ the .frm file or TABLE_SHARE, this must be the
internally created FTS_DOC_ID column. */
ut_ad(col->mtype == DATA_INT);
ut_ad(col->len == 8);
@@ -1941,18 +2127,25 @@ static bool innobase_table_is_empty(const dict_table_t *table,
btr_pcur_t pcur;
buf_block_t *block;
page_cur_t *cur;
- const rec_t *rec;
+ rec_t *rec;
bool next_page= false;
mtr.start();
- btr_pcur_open_at_index_side(true, clust_index, BTR_SEARCH_LEAF,
- &pcur, true, 0, &mtr);
- btr_pcur_move_to_next_user_rec(&pcur, &mtr);
- if (!rec_is_metadata(btr_pcur_get_rec(&pcur), *clust_index))
- btr_pcur_move_to_prev_on_page(&pcur);
+ if (pcur.open_leaf(true, clust_index, BTR_SEARCH_LEAF, &mtr) != DB_SUCCESS)
+ {
+non_empty:
+ mtr.commit();
+ return false;
+ }
+ rec= page_rec_get_next(btr_pcur_get_rec(&pcur));
+ if (UNIV_UNLIKELY(!rec))
+ goto non_empty;
+ if (rec_is_metadata(rec, *clust_index))
+ btr_pcur_get_page_cur(&pcur)->rec= rec;
scan_leaf:
cur= btr_pcur_get_page_cur(&pcur);
- page_cur_move_to_next(cur);
+ if (UNIV_UNLIKELY(!page_cur_move_to_next(cur)))
+ goto non_empty;
next_page:
if (next_page)
{
@@ -1964,12 +2157,15 @@ next_page:
}
next_page= false;
- block= page_cur_get_block(cur);
block= btr_block_get(*clust_index, next_page_no, BTR_SEARCH_LEAF, false,
&mtr);
- btr_leaf_page_release(page_cur_get_block(cur), BTR_SEARCH_LEAF, &mtr);
+ if (!block)
+ goto non_empty;
page_cur_set_before_first(block, cur);
- page_cur_move_to_next(cur);
+ if (UNIV_UNLIKELY(!page_cur_move_to_next(cur)))
+ goto non_empty;
+ const auto s= mtr.get_savepoint();
+ mtr.rollback_to_savepoint(s - 2, s - 1);
}
rec= page_cur_get_rec(cur);
@@ -1977,9 +2173,7 @@ next_page:
{
if (ignore_delete_marked)
goto scan_leaf;
-non_empty:
- mtr.commit();
- return false;
+ goto non_empty;
}
else if (!page_rec_is_supremum(rec))
goto non_empty;
@@ -2036,13 +2230,6 @@ ha_innobase::check_if_supported_inplace_alter(
DBUG_RETURN(HA_ALTER_INPLACE_NOT_SUPPORTED);
}
- if (high_level_read_only) {
- ha_alter_info->unsupported_reason =
- my_get_err_msg(ER_READ_ONLY_MODE);
-
- DBUG_RETURN(HA_ALTER_INPLACE_NOT_SUPPORTED);
- }
-
if (altered_table->s->fields > REC_MAX_N_USER_FIELDS) {
/* Deny the inplace ALTER TABLE. MySQL will try to
re-create the table and ha_innobase::create() will
@@ -2061,11 +2248,23 @@ ha_innobase::check_if_supported_inplace_alter(
table->s->table_name.str);
}
+ if (is_read_only(!high_level_read_only
+ && (ha_alter_info->handler_flags & ALTER_OPTIONS)
+ && ha_alter_info->create_info->key_block_size == 0
+ && ha_alter_info->create_info->row_type
+ != ROW_TYPE_COMPRESSED)) {
+ ha_alter_info->unsupported_reason =
+ my_get_err_msg(ER_READ_ONLY_MODE);
+
+ DBUG_RETURN(HA_ALTER_INPLACE_NOT_SUPPORTED);
+ }
+
if (ha_alter_info->handler_flags
& ~(INNOBASE_INPLACE_IGNORE
| INNOBASE_ALTER_INSTANT
| INNOBASE_ALTER_NOREBUILD
- | INNOBASE_ALTER_REBUILD)) {
+ | INNOBASE_ALTER_REBUILD
+ | ALTER_INDEX_IGNORABILITY)) {
if (ha_alter_info->handler_flags
& ALTER_STORED_COLUMN_TYPE) {
@@ -2076,6 +2275,8 @@ ha_innobase::check_if_supported_inplace_alter(
DBUG_RETURN(HA_ALTER_INPLACE_NOT_SUPPORTED);
}
+ ut_ad(dict_sys.sys_tables_exist());
+
/* Only support online add foreign key constraint when
check_foreigns is turned off */
if ((ha_alter_info->handler_flags & ALTER_ADD_FOREIGN_KEY)
@@ -2279,16 +2480,7 @@ innodb_instant_alter_column_allowed_reason:
if (new_field->field) {
/* This is an existing column. */
-
- if (new_field->field->charset()
- == key_part->field->charset()) {
- continue;
- }
-
- ha_alter_info->unsupported_reason =
- "Collation change on"
- " an indexed column";
- DBUG_RETURN(HA_ALTER_INPLACE_NOT_SUPPORTED);
+ continue;
}
/* This is an added column. */
@@ -2307,7 +2499,7 @@ innodb_instant_alter_column_allowed_reason:
DBUG_RETURN(HA_ALTER_INPLACE_NOT_SUPPORTED);
}
- DBUG_ASSERT((MTYP_TYPENR(key_part->field->unireg_check)
+ DBUG_ASSERT((key_part->field->unireg_check
== Field::NEXT_NUMBER)
== !!(key_part->field->flags
& AUTO_INCREMENT_FLAG));
@@ -2418,10 +2610,8 @@ innodb_instant_alter_column_allowed_reason:
/* An AUTO_INCREMENT attribute can only
be added to an existing column by ALGORITHM=COPY,
but we can remove the attribute. */
- ut_ad((MTYP_TYPENR((*af)->unireg_check)
- != Field::NEXT_NUMBER)
- || (MTYP_TYPENR(f->unireg_check)
- == Field::NEXT_NUMBER));
+ ut_ad((*af)->unireg_check != Field::NEXT_NUMBER
+ || f->unireg_check == Field::NEXT_NUMBER);
if (!f->real_maybe_null() || (*af)->real_maybe_null())
goto next_column;
/* We are changing an existing column
@@ -2674,7 +2864,7 @@ innobase_init_foreign(
ulint referenced_num_field) /*!< in: number of referenced
columns */
{
- ut_ad(mutex_own(&dict_sys.mutex));
+ ut_ad(dict_sys.locked());
if (constraint_name) {
ulint db_len;
@@ -3078,7 +3268,7 @@ innobase_get_foreign_key_info(
add_fk[num_fk] = dict_mem_foreign_create();
- mutex_enter(&dict_sys.mutex);
+ dict_sys.lock(SRW_LOCK_CALL);
referenced_table_name = dict_get_referenced_table(
table->name.m_name,
@@ -3094,11 +3284,9 @@ innobase_get_foreign_key_info(
referenced_table = NULL;);
if (!referenced_table && trx->check_foreigns) {
- mutex_exit(&dict_sys.mutex);
my_error(ER_FK_CANNOT_OPEN_PARENT,
MYF(0), fk_key->ref_table.str);
-
- goto err_exit;
+ goto err_exit_unlock;
}
if (fk_key->ref_columns.elements > 0) {
@@ -3127,12 +3315,11 @@ innobase_get_foreign_key_info(
/* Check whether there exist such
index in the the index create clause */
if (!referenced_index) {
- mutex_exit(&dict_sys.mutex);
my_error(ER_FK_NO_INDEX_PARENT, MYF(0),
fk_key->name.str
? fk_key->name.str : "",
fk_key->ref_table.str);
- goto err_exit;
+ goto err_exit_unlock;
}
} else {
ut_a(!trx->check_foreigns);
@@ -3142,10 +3329,9 @@ innobase_get_foreign_key_info(
} else {
/* Not possible to add a foreign key without a
referenced column */
- mutex_exit(&dict_sys.mutex);
my_error(ER_CANNOT_ADD_FOREIGN, MYF(0),
fk_key->ref_table.str);
- goto err_exit;
+ goto err_exit_unlock;
}
if (!innobase_init_foreign(
@@ -3154,15 +3340,14 @@ innobase_get_foreign_key_info(
num_col, referenced_table_name,
referenced_table, referenced_index,
referenced_column_names, referenced_num_col)) {
- mutex_exit(&dict_sys.mutex);
my_error(
ER_DUP_CONSTRAINT_NAME,
MYF(0),
"FOREIGN KEY", add_fk[num_fk]->id);
- goto err_exit;
+ goto err_exit_unlock;
}
- mutex_exit(&dict_sys.mutex);
+ dict_sys.unlock();
correct_option = innobase_set_foreign_key_option(
add_fk[num_fk], fk_key);
@@ -3193,6 +3378,8 @@ innobase_get_foreign_key_info(
*n_add_fk = num_fk;
DBUG_RETURN(true);
+err_exit_unlock:
+ dict_sys.unlock();
err_exit:
for (ulint i = 0; i <= num_fk; i++) {
if (add_fk[i]) {
@@ -4120,9 +4307,9 @@ online_retry_drop_indexes_low(
dict_table_t* table, /*!< in/out: table */
trx_t* trx) /*!< in/out: transaction */
{
- ut_ad(mutex_own(&dict_sys.mutex));
- ut_ad(trx->dict_operation_lock_mode == RW_X_LATCH);
- ut_ad(trx_get_dict_operation(trx) == TRX_DICT_OP_INDEX);
+ ut_ad(dict_sys.locked());
+ ut_ad(trx->dict_operation_lock_mode);
+ ut_ad(trx->dict_operation);
/* We can have table->n_ref_count > 1, because other threads
may have prebuilt->table pointing to the table. However, these
@@ -4135,60 +4322,54 @@ online_retry_drop_indexes_low(
}
}
-/********************************************************************//**
-Drop any indexes that we were not able to free previously due to
-open table handles. */
-static MY_ATTRIBUTE((nonnull))
-void
-online_retry_drop_indexes(
-/*======================*/
- dict_table_t* table, /*!< in/out: table */
- THD* user_thd) /*!< in/out: MySQL connection */
+/** After commit, unlock the data dictionary and close any deleted files.
+@param deleted handles of deleted files
+@param trx committed transaction */
+static void unlock_and_close_files(const std::vector<pfs_os_file_t> &deleted,
+ trx_t *trx)
{
- if (table->drop_aborted) {
- trx_t* trx = innobase_trx_allocate(user_thd);
-
- trx_start_for_ddl(trx, TRX_DICT_OP_INDEX);
-
- row_mysql_lock_data_dictionary(trx);
- online_retry_drop_indexes_low(table, trx);
- trx_commit_for_mysql(trx);
- row_mysql_unlock_data_dictionary(trx);
- trx->free();
- }
-
- ut_d(mutex_enter(&dict_sys.mutex));
- ut_d(dict_table_check_for_dup_indexes(table, CHECK_ALL_COMPLETE));
- ut_d(mutex_exit(&dict_sys.mutex));
- ut_ad(!table->drop_aborted);
+ row_mysql_unlock_data_dictionary(trx);
+ for (pfs_os_file_t d : deleted)
+ os_file_close(d);
+ log_write_up_to(trx->commit_lsn, true);
}
-/********************************************************************//**
-Commit a dictionary transaction and drop any indexes that we were not
-able to free previously due to open table handles. */
-static MY_ATTRIBUTE((nonnull))
-void
-online_retry_drop_indexes_with_trx(
-/*===============================*/
- dict_table_t* table, /*!< in/out: table */
- trx_t* trx) /*!< in/out: transaction */
+/** Commit a DDL transaction and unlink any deleted files. */
+static void commit_unlock_and_unlink(trx_t *trx)
{
- ut_ad(trx_state_eq(trx, TRX_STATE_NOT_STARTED));
-
- ut_ad(trx->dict_operation_lock_mode == RW_X_LATCH);
-
- /* Now that the dictionary is being locked, check if we can
- drop any incompletely created indexes that may have been left
- behind in rollback_inplace_alter_table() earlier. */
- if (table->drop_aborted) {
+ std::vector<pfs_os_file_t> deleted;
+ trx->commit(deleted);
+ unlock_and_close_files(deleted, trx);
+}
- trx->table_id = 0;
+/**
+Drop any indexes that we were not able to free previously due to
+open table handles.
+@param table InnoDB table
+@param thd connection handle
+*/
+static void online_retry_drop_indexes(dict_table_t *table, THD *thd)
+{
+ if (table->drop_aborted)
+ {
+ trx_t *trx= innobase_trx_allocate(thd);
- trx_start_for_ddl(trx, TRX_DICT_OP_INDEX);
+ trx_start_for_ddl(trx);
+ if (lock_sys_tables(trx) == DB_SUCCESS)
+ {
+ row_mysql_lock_data_dictionary(trx);
+ online_retry_drop_indexes_low(table, trx);
+ commit_unlock_and_unlink(trx);
+ }
+ else
+ trx->commit();
+ trx->free();
+ }
- online_retry_drop_indexes_low(table, trx);
- trx_commit_for_mysql(trx);
- }
+ ut_d(dict_sys.freeze(SRW_LOCK_CALL));
+ ut_d(dict_table_check_for_dup_indexes(table, CHECK_ALL_COMPLETE));
+ ut_d(dict_sys.unfreeze());
+ ut_ad(!table->drop_aborted);
}
/** Determines if InnoDB is dropping a foreign key constraint.
@@ -4234,7 +4415,7 @@ innobase_check_foreigns_low(
bool drop)
{
dict_foreign_t* foreign;
- ut_ad(mutex_own(&dict_sys.mutex));
+ ut_ad(dict_sys.locked());
/* Check if any FOREIGN KEY constraints are defined on this
column. */
@@ -4567,38 +4748,6 @@ found_col:
DBUG_RETURN(col_map);
}
-/** Drop newly create FTS index related auxiliary table during
-FIC create index process, before fts_add_index is called
-@param table table that was being rebuilt online
-@param trx transaction
-@return DB_SUCCESS if successful, otherwise last error code
-*/
-static
-dberr_t
-innobase_drop_fts_index_table(
-/*==========================*/
- dict_table_t* table,
- trx_t* trx)
-{
- dberr_t ret_err = DB_SUCCESS;
-
- for (dict_index_t* index = dict_table_get_first_index(table);
- index != NULL;
- index = dict_table_get_next_index(index)) {
- if (index->type & DICT_FTS) {
- dberr_t err;
-
- err = fts_drop_index_tables(trx, index);
-
- if (err != DB_SUCCESS) {
- ret_err = err;
- }
- }
- }
-
- return(ret_err);
-}
-
/** Get the new non-virtual column names if any columns were renamed
@param ha_alter_info Data used during in-place alter
@param altered_table MySQL table that is being altered
@@ -4863,9 +5012,9 @@ innobase_update_gis_column_type(
DBUG_ENTER("innobase_update_gis_column_type");
- DBUG_ASSERT(trx_get_dict_operation(trx) == TRX_DICT_OP_INDEX);
- ut_ad(trx->dict_operation_lock_mode == RW_X_LATCH);
- ut_d(dict_sys.assert_locked());
+ DBUG_ASSERT(trx->dict_operation);
+ ut_ad(trx->dict_operation_lock_mode);
+ ut_ad(dict_sys.locked());
info = pars_info_create();
@@ -4881,8 +5030,7 @@ innobase_update_gis_column_type(
"BEGIN\n"
"UPDATE SYS_COLUMNS SET MTYPE=:mtype\n"
"WHERE TABLE_ID=:tableid AND NAME=:name;\n"
- "END;\n",
- false, trx);
+ "END;\n", trx);
trx->error_state = DB_SUCCESS;
trx->op_info = "";
@@ -5201,8 +5349,7 @@ static bool innobase_insert_sys_virtual(
"PROCEDURE P () IS\n"
"BEGIN\n"
"INSERT INTO SYS_VIRTUAL VALUES (:id, :pos, :base_pos);\n"
- "END;\n",
- FALSE, trx)) {
+ "END;\n", trx)) {
my_error(ER_INTERNAL_ERROR, MYF(0),
"InnoDB: ADD COLUMN...VIRTUAL");
return true;
@@ -5251,7 +5398,7 @@ static bool innodb_insert_sys_columns(
"NAME=:name, MTYPE=:mtype, PRTYPE=:prtype, "
"LEN=:len, PREC=:base\n"
"WHERE TABLE_ID=:id AND POS=:pos;\n"
- "END;\n", FALSE, trx)) {
+ "END;\n", trx)) {
my_error(ER_INTERNAL_ERROR, MYF(0),
"InnoDB: Updating SYS_COLUMNS failed");
return true;
@@ -5266,7 +5413,7 @@ static bool innodb_insert_sys_columns(
"BEGIN\n"
"INSERT INTO SYS_COLUMNS VALUES"
"(:id,:pos,:name,:mtype,:prtype,:len,:base);\n"
- "END;\n", FALSE, trx)) {
+ "END;\n", trx)) {
my_error(ER_INTERNAL_ERROR, MYF(0),
"InnoDB: Insert into SYS_COLUMNS failed");
return true;
@@ -5324,7 +5471,7 @@ static bool innodb_update_cols(const dict_table_t* table, ulint n, trx_t* trx)
"BEGIN\n"
"UPDATE SYS_TABLES SET N_COLS = :n"
" WHERE ID = :id;\n"
- "END;\n", FALSE, trx)) {
+ "END;\n", trx)) {
my_error(ER_INTERNAL_ERROR, MYF(0),
"InnoDB: Updating SYS_TABLES.N_COLS failed");
return true;
@@ -5379,7 +5526,7 @@ static bool innobase_instant_drop_cols(table_id_t id, ulint pos, trx_t* trx)
"DELETE FROM SYS_COLUMNS WHERE\n"
"TABLE_ID = :id AND POS >= :pos;\n"
"DELETE FROM SYS_VIRTUAL WHERE TABLE_ID = :id;\n"
- "END;\n", FALSE, trx);
+ "END;\n", trx);
if (err != DB_SUCCESS) {
my_error(ER_INTERNAL_ERROR, MYF(0),
"InnoDB: DELETE from SYS_COLUMNS/SYS_VIRTUAL failed");
@@ -5417,8 +5564,7 @@ innobase_update_v_pos_sys_columns(
"SET POS = :val\n"
"WHERE POS = :pos\n"
"AND TABLE_ID = :id;\n"
- "END;\n",
- FALSE, trx);
+ "END;\n", trx);
return(error);
}
@@ -5451,8 +5597,7 @@ innobase_update_v_pos_sys_virtual(
"SET POS = :val\n"
"WHERE POS = :pos\n"
"AND TABLE_ID = :id;\n"
- "END;\n",
- FALSE, trx);
+ "END;\n", trx);
return(error);
}
@@ -5486,8 +5631,7 @@ innobase_drop_one_virtual_sys_columns(
"DELETE FROM SYS_COLUMNS\n"
"WHERE TABLE_ID = :id\n"
"AND NAME = :name;\n"
- "END;\n",
- FALSE, trx);
+ "END;\n", trx);
if (error != DB_SUCCESS) {
return(error);
@@ -5545,8 +5689,7 @@ innobase_drop_one_virtual_sys_virtual(
"DELETE FROM SYS_VIRTUAL\n"
"WHERE TABLE_ID = :id\n"
"AND POS = :pos;\n"
- "END;\n",
- FALSE, trx);
+ "END;\n", trx);
return(error);
}
@@ -5733,12 +5876,12 @@ static bool innobase_instant_try(
#ifdef BTR_CUR_HASH_ADAPT
/* Acquire the ahi latch to avoid a race condition
between ahi access and instant alter table */
- rw_lock_t* ahi_latch = btr_search_sys.get_latch(*index);
- rw_lock_x_lock(ahi_latch);
+ srw_spin_lock* ahi_latch = btr_search_sys.get_latch(*index);
+ ahi_latch->wr_lock(SRW_LOCK_CALL);
#endif /* BTR_CUR_HASH_ADAPT */
const bool metadata_changed = ctx->instant_column();
#ifdef BTR_CUR_HASH_ADAPT
- rw_lock_x_unlock(ahi_latch);
+ ahi_latch->wr_unlock();
#endif /* BTR_CUR_HASH_ADAPT */
DBUG_ASSERT(index->n_fields >= n_old_fields);
@@ -5915,28 +6058,42 @@ add_all_virtual:
mtr.start();
index->set_modified(mtr);
btr_pcur_t pcur;
- btr_pcur_open_at_index_side(true, index, BTR_MODIFY_TREE, &pcur, true,
- 0, &mtr);
+ dberr_t err= pcur.open_leaf(true, index, BTR_MODIFY_TREE, &mtr);
+ if (err != DB_SUCCESS) {
+func_exit:
+ mtr.commit();
+
+ if (err != DB_SUCCESS) {
+ my_error_innodb(err, table->s->table_name.str,
+ user_table->flags);
+ return true;
+ }
+ return false;
+ }
ut_ad(btr_pcur_is_before_first_on_page(&pcur));
- btr_pcur_move_to_next_on_page(&pcur);
buf_block_t* block = btr_pcur_get_block(&pcur);
- ut_ad(page_is_leaf(block->frame));
- ut_ad(!page_has_prev(block->frame));
+ ut_ad(page_is_leaf(block->page.frame));
+ ut_ad(!page_has_prev(block->page.frame));
ut_ad(!buf_block_get_page_zip(block));
- const rec_t* rec = btr_pcur_get_rec(&pcur);
+ const rec_t* rec = btr_pcur_move_to_next_on_page(&pcur);
+ if (UNIV_UNLIKELY(!rec)) {
+ err = DB_CORRUPTION;
+ goto func_exit;
+ }
+
que_thr_t* thr = pars_complete_graph_for_exec(
NULL, trx, ctx->heap, NULL);
- const bool is_root = block->page.id().page_no() == index->page;
+ page_id_t id{block->page.id()};
+ const bool is_root = id.page_no() == index->page;
- dberr_t err = DB_SUCCESS;
if (rec_is_metadata(rec, *index)) {
ut_ad(page_rec_is_user_rec(rec));
if (is_root
&& !rec_is_alter_metadata(rec, *index)
&& !index->table->instant
- && !page_has_next(block->frame)
- && page_rec_is_last(rec, block->frame)) {
+ && !page_has_next(block->page.frame)
+ && page_rec_is_last(rec, block->page.frame)) {
goto empty_table;
}
@@ -5945,10 +6102,17 @@ add_all_virtual:
}
/* Ensure that the root page is in the correct format. */
- buf_block_t* root = btr_root_block_get(index, RW_X_LATCH,
- &mtr);
- DBUG_ASSERT(root);
- if (fil_page_get_type(root->frame) != FIL_PAGE_TYPE_INSTANT) {
+ id.set_page_no(index->page);
+ buf_block_t* root = mtr.get_already_latched(
+ id, MTR_MEMO_PAGE_SX_FIX);
+
+ if (UNIV_UNLIKELY(!root)) {
+ err = DB_CORRUPTION;
+ goto func_exit;
+ }
+
+ if (fil_page_get_type(root->page.frame)
+ != FIL_PAGE_TYPE_INSTANT) {
DBUG_ASSERT("wrong page type" == 0);
err = DB_CORRUPTION;
goto func_exit;
@@ -5997,10 +6161,13 @@ add_all_virtual:
&offsets, &offsets_heap, ctx->heap,
&big_rec, update, UPD_NODE_NO_ORD_CHANGE,
thr, trx->id, &mtr);
+ if (err == DB_SUCCESS) {
+ offsets = rec_get_offsets(
+ btr_pcur_get_rec(&pcur), index, offsets,
+ index->n_core_fields, ULINT_UNDEFINED,
+ &offsets_heap);
+ }
- offsets = rec_get_offsets(
- btr_pcur_get_rec(&pcur), index, offsets,
- index->n_core_fields, ULINT_UNDEFINED, &offsets_heap);
if (big_rec) {
if (err == DB_SUCCESS) {
err = btr_store_big_rec_extern_fields(
@@ -6013,14 +6180,14 @@ add_all_virtual:
if (offsets_heap) {
mem_heap_free(offsets_heap);
}
- btr_pcur_close(&pcur);
+ ut_free(pcur.old_rec_buf);
goto func_exit;
} else if (is_root && page_rec_is_supremum(rec)
&& !index->table->instant) {
empty_table:
/* The table is empty. */
- ut_ad(fil_page_index_page_check(block->frame));
- ut_ad(!page_has_siblings(block->frame));
+ ut_ad(fil_page_index_page_check(block->page.frame));
+ ut_ad(!page_has_siblings(block->page.frame));
ut_ad(block->page.id().page_no() == index->page);
/* MDEV-17383: free metadata BLOBs! */
btr_page_empty(block, NULL, index, 0, &mtr);
@@ -6037,10 +6204,12 @@ empty_table:
mtr.commit();
mtr.start();
index->set_modified(mtr);
- if (buf_block_t* root = btr_root_block_get(index, RW_SX_LATCH, &mtr)) {
- if (fil_page_get_type(root->frame) != FIL_PAGE_INDEX) {
+ if (buf_block_t* root = btr_root_block_get(index, RW_SX_LATCH, &mtr,
+ &err)) {
+ if (fil_page_get_type(root->page.frame) != FIL_PAGE_INDEX) {
DBUG_ASSERT("wrong page type" == 0);
- goto err_exit;
+ err = DB_CORRUPTION;
+ goto func_exit;
}
btr_set_instant(root, *index, &mtr);
@@ -6050,21 +6219,9 @@ empty_table:
err = row_ins_clust_index_entry_low(
BTR_NO_LOCKING_FLAG, BTR_MODIFY_TREE, index,
index->n_uniq, entry, 0, thr);
- } else {
-err_exit:
- err = DB_CORRUPTION;
}
-func_exit:
- mtr.commit();
-
- if (err != DB_SUCCESS) {
- my_error_innodb(err, table->s->table_name.str,
- user_table->flags);
- return true;
- }
-
- return false;
+ goto func_exit;
}
/** Adjust the create index column number from "New table" to
@@ -6148,6 +6305,8 @@ innodb_v_adjust_idx_col(
/** Create index metadata in the data dictionary.
@param[in,out] trx dictionary transaction
@param[in,out] index index being created
+@param[in] mode encryption mode (for creating a table)
+@param[in] key_id encryption key identifier (for creating a table)
@param[in] add_v virtual columns that are being added, or NULL
@return the created index */
MY_ATTRIBUTE((nonnull(1,2), warn_unused_result))
@@ -6156,13 +6315,15 @@ dict_index_t*
create_index_dict(
trx_t* trx,
dict_index_t* index,
+ fil_encryption_t mode,
+ uint32_t key_id,
const dict_add_v_col_t* add_v)
{
DBUG_ENTER("create_index_dict");
mem_heap_t* heap = mem_heap_create(512);
ind_node_t* node = ind_create_graph_create(
- index, index->table->name.m_name, heap, add_v);
+ index, index->table->name.m_name, heap, mode, key_id, add_v);
que_thr_t* thr = pars_complete_graph_for_exec(node, trx, heap, NULL);
que_fork_start_command(
@@ -6215,7 +6376,7 @@ prepare_inplace_alter_table_dict(
dict_table_t* user_table;
dict_index_t* fts_index = NULL;
bool new_clustered = false;
- dberr_t error;
+ dberr_t error = DB_SUCCESS;
ulint num_fts_index;
dict_add_v_col_t* add_v = NULL;
ha_innobase_inplace_ctx*ctx;
@@ -6317,7 +6478,6 @@ prepare_inplace_alter_table_dict(
create_table_info_t info(ctx->prebuilt->trx->mysql_thd, altered_table,
ha_alter_info->create_info, NULL, NULL,
srv_file_per_table);
- ut_d(bool stats_wait = false);
/* The primary index would be rebuilt if a FTS Doc ID
column is to be added, and the primary index definition
@@ -6336,41 +6496,55 @@ prepare_inplace_alter_table_dict(
mem_heap_alloc(ctx->heap, ctx->num_to_add_index
* sizeof *ctx->add_key_numbers));
+ const bool fts_exist = ctx->new_table->flags2
+ & (DICT_TF2_FTS_HAS_DOC_ID | DICT_TF2_FTS);
/* Acquire a lock on the table before creating any indexes. */
+ bool table_lock_failed = false;
- if (ctx->online) {
- error = DB_SUCCESS;
- } else {
- error = row_merge_lock_table(
- ctx->prebuilt->trx, ctx->new_table, LOCK_S);
-
- if (error != DB_SUCCESS) {
+ if (!ctx->online) {
+acquire_lock:
+ ctx->prebuilt->trx->op_info = "acquiring table lock";
+ error = lock_table_for_trx(user_table, ctx->trx, LOCK_S);
+ } else if (add_key_nums) {
+ /* FIXME: trx_resurrect_table_locks() will not resurrect
+ MDL for any recovered transactions that may hold locks on
+ the table. We will prevent race conditions by "unnecessarily"
+ acquiring an InnoDB table lock even for online operation,
+ to ensure that the rollback of recovered transactions will
+ not run concurrently with online ADD INDEX. */
+ user_table->lock_mutex_lock();
+ for (lock_t *lock = UT_LIST_GET_FIRST(user_table->locks);
+ lock;
+ lock = UT_LIST_GET_NEXT(un_member.tab_lock.locks, lock)) {
+ if (lock->trx->is_recovered) {
+ user_table->lock_mutex_unlock();
+ goto acquire_lock;
+ }
+ }
+ user_table->lock_mutex_unlock();
+ }
- goto error_handling;
+ if (fts_exist) {
+ purge_sys.stop_FTS(*ctx->new_table);
+ if (error == DB_SUCCESS) {
+ error = fts_lock_tables(ctx->trx, *ctx->new_table);
}
}
- /* Create a background transaction for the operations on
- the data dictionary tables. */
- ctx->trx = innobase_trx_allocate(ctx->prebuilt->trx->mysql_thd);
+ if (error == DB_SUCCESS) {
+ error = lock_sys_tables(ctx->trx);
+ }
- trx_start_for_ddl(ctx->trx, TRX_DICT_OP_INDEX);
+ if (error != DB_SUCCESS) {
+ table_lock_failed = true;
+ goto error_handling;
+ }
/* Latch the InnoDB data dictionary exclusively so that no deadlocks
or lock waits can happen in it during an index create operation. */
row_mysql_lock_data_dictionary(ctx->trx);
dict_locked = true;
-
- /* Wait for background stats processing to stop using the table that
- we are going to alter. We know bg stats will not start using it again
- until we are holding the data dict locked and we are holding it here
- at least until checking ut_ad(user_table->n_ref_count == 1) below.
- XXX what may happen if bg stats opens the table after we
- have unlocked data dictionary below? */
- dict_stats_wait_bg_to_stop_using_table(user_table, ctx->trx);
- ut_d(stats_wait = true);
-
online_retry_drop_indexes_low(ctx->new_table, ctx->trx);
ut_d(dict_table_check_for_dup_indexes(
@@ -6393,8 +6567,20 @@ new_clustered_failed:
ut_ad(user_table->get_ref_count() == 1);
- online_retry_drop_indexes_with_trx(
- user_table, ctx->trx);
+ if (user_table->drop_aborted) {
+ row_mysql_unlock_data_dictionary(ctx->trx);
+ trx_start_for_ddl(ctx->trx);
+ if (lock_sys_tables(ctx->trx) == DB_SUCCESS) {
+ row_mysql_lock_data_dictionary(
+ ctx->trx);
+ online_retry_drop_indexes_low(
+ user_table, ctx->trx);
+ commit_unlock_and_unlink(ctx->trx);
+ } else {
+ ctx->trx->commit();
+ }
+ row_mysql_lock_data_dictionary(ctx->trx);
+ }
if (ctx->need_rebuild()) {
if (ctx->new_table) {
@@ -6460,9 +6646,9 @@ new_clustered_failed:
DBUG_ASSERT(!add_fts_doc_id_idx || (flags2 & DICT_TF2_FTS));
- ctx->new_table = dict_mem_table_create(
- new_table_name, NULL, n_cols + n_v_cols, n_v_cols,
- flags, flags2);
+ ctx->new_table = dict_table_t::create(
+ {new_table_name, tablen + partlen}, nullptr,
+ n_cols + n_v_cols, n_v_cols, flags, flags2);
/* The rebuilt indexed_table will use the renamed
column names. */
@@ -6667,8 +6853,8 @@ wrong_column_name:
ha_alter_info, ctx->new_table, ctx->trx);
if (error != DB_SUCCESS) {
ut_ad(error == DB_ERROR);
- error = DB_UNSUPPORTED;
- goto error_handling;
+ my_error(ER_TABLE_CANT_HANDLE_SPKEYS, MYF(0), "SYS_COLUMNS");
+ goto error_handled;
}
}
@@ -6860,42 +7046,24 @@ wrong_column_name:
}
}
- if (dict_table_get_low(ctx->new_table->name.m_name)) {
+ if (dict_sys.find_table(
+ {ctx->new_table->name.m_name,
+ strlen(ctx->new_table->name.m_name)})) {
my_error(ER_TABLE_EXISTS_ERROR, MYF(0),
ctx->new_table->name.m_name);
goto new_clustered_failed;
}
/* Create the table. */
- trx_set_dict_operation(ctx->trx, TRX_DICT_OP_TABLE);
+ ctx->trx->dict_operation = true;
- error = row_create_table_for_mysql(
- ctx->new_table, ctx->trx, mode, key_id);
+ error = row_create_table_for_mysql(ctx->new_table, ctx->trx);
switch (error) {
- dict_table_t* temp_table;
case DB_SUCCESS:
- /* We need to bump up the table ref count and
- before we can use it we need to open the
- table. The new_table must be in the data
- dictionary cache, because we are still holding
- the dict_sys.mutex. */
- ut_ad(mutex_own(&dict_sys.mutex));
- temp_table = dict_table_open_on_name(
- ctx->new_table->name.m_name, TRUE, FALSE,
- DICT_ERR_IGNORE_NONE);
- ut_a(ctx->new_table == temp_table);
- /* n_ref_count must be 1, because purge cannot
- be executing on this very table as we are
- holding dict_sys.latch X-latch. */
- DBUG_ASSERT(ctx->new_table->get_ref_count() == 1);
+ DBUG_ASSERT(ctx->new_table->get_ref_count() == 0);
DBUG_ASSERT(ctx->new_table->id != 0);
- DBUG_ASSERT(ctx->new_table->id == ctx->trx->table_id);
break;
- case DB_TABLESPACE_EXISTS:
- my_error(ER_TABLESPACE_EXISTS, MYF(0),
- altered_table->s->table_name.str);
- goto new_table_failed;
case DB_DUPLICATE_KEY:
my_error(HA_ERR_TABLE_EXIST, MYF(0),
altered_table->s->table_name.str);
@@ -6915,7 +7083,8 @@ new_table_failed:
for (ulint a = 0; a < ctx->num_to_add_index; a++) {
dict_index_t* index = ctx->add_index[a];
const ulint n_v_col = index->get_new_n_vcol();
- index = create_index_dict(ctx->trx, index, add_v);
+ index = create_index_dict(ctx->trx, index,
+ mode, key_id, add_v);
error = ctx->trx->error_state;
if (error != DB_SUCCESS) {
if (index) {
@@ -6995,7 +7164,7 @@ error_handling_drop_uncached_1:
if (ctx->online) {
/* Allocate a log for online table rebuild. */
- rw_lock_x_lock(&clust_index->lock);
+ clust_index->lock.x_lock(SRW_LOCK_CALL);
bool ok = row_log_allocate(
ctx->prebuilt->trx,
clust_index, ctx->new_table,
@@ -7004,7 +7173,7 @@ error_handling_drop_uncached_1:
ctx->defaults, ctx->col_map, path,
old_table,
ctx->allow_not_null);
- rw_lock_x_unlock(&clust_index->lock);
+ clust_index->lock.x_unlock();
if (!ok) {
error = DB_OUT_OF_MEMORY;
@@ -7013,7 +7182,6 @@ error_handling_drop_uncached_1:
}
} else if (ctx->num_to_add_index) {
ut_ad(!ctx->is_instant());
- ctx->trx->table_id = user_table->id;
for (ulint a = 0; a < ctx->num_to_add_index; a++) {
dict_index_t* index = ctx->add_index[a];
@@ -7025,7 +7193,10 @@ error_handling_drop_uncached_1:
DB_OUT_OF_FILE_SPACE;
goto index_created;
});
- index = create_index_dict(ctx->trx, index, add_v);
+ index = create_index_dict(ctx->trx, index,
+ FIL_ENCRYPTION_DEFAULT,
+ FIL_DEFAULT_ENCRYPTION_KEY,
+ add_v);
#ifndef DBUG_OFF
index_created:
#endif
@@ -7052,6 +7223,8 @@ error_handling_drop_uncached:
if (n_v_col) {
index->assign_new_v_col(n_v_col);
}
+
+ ctx->change_col_collation(index, *altered_table);
/* Note the id of the transaction that created this
index, we use it to restrict readers from accessing
this index, to ensure read consistency. */
@@ -7072,7 +7245,7 @@ error_handling_drop_uncached:
/* No need to allocate a modification log. */
DBUG_ASSERT(!index->online_log);
} else {
- rw_lock_x_lock(&ctx->add_index[a]->lock);
+ index->lock.x_lock(SRW_LOCK_CALL);
bool ok = row_log_allocate(
ctx->prebuilt->trx,
@@ -7081,7 +7254,7 @@ error_handling_drop_uncached:
path, old_table,
ctx->allow_not_null);
- rw_lock_x_unlock(&index->lock);
+ index->lock.x_unlock();
DBUG_EXECUTE_IF(
"innodb_OOM_prepare_add_index",
@@ -7089,6 +7262,8 @@ error_handling_drop_uncached:
row_log_free(
index->online_log);
index->online_log = NULL;
+ ctx->old_table->indexes.start
+ ->online_log = nullptr;
ok = false;
});
@@ -7111,23 +7286,9 @@ error_handling_drop_uncached:
}
if (fts_index) {
- /* Ensure that the dictionary operation mode will
- not change while creating the auxiliary tables. */
- trx_dict_op_t op = trx_get_dict_operation(ctx->trx);
-
-#ifdef UNIV_DEBUG
- switch (op) {
- case TRX_DICT_OP_NONE:
- break;
- case TRX_DICT_OP_TABLE:
- case TRX_DICT_OP_INDEX:
- goto op_ok;
- }
- ut_error;
-op_ok:
-#endif /* UNIV_DEBUG */
- ut_ad(ctx->trx->dict_operation_lock_mode == RW_X_LATCH);
- ut_d(dict_sys.assert_locked());
+ ut_ad(ctx->trx->dict_operation);
+ ut_ad(ctx->trx->dict_operation_lock_mode);
+ ut_ad(dict_sys.locked());
DICT_TF2_FLAG_SET(ctx->new_table, DICT_TF2_FTS);
if (ctx->need_rebuild()) {
@@ -7150,9 +7311,6 @@ op_ok:
goto error_handling;
}
- ctx->trx->commit();
- trx_start_for_ddl(ctx->trx, op);
-
if (!ctx->new_table->fts
|| ib_vector_size(ctx->new_table->fts->indexes) == 0) {
error = fts_create_common_tables(
@@ -7166,122 +7324,110 @@ op_ok:
goto error_handling;
}
- ctx->new_table->fts->dict_locked = true;
-
error = innobase_fts_load_stopword(
ctx->new_table, ctx->trx,
ctx->prebuilt->trx->mysql_thd)
? DB_SUCCESS : DB_ERROR;
- ctx->new_table->fts->dict_locked = false;
if (error != DB_SUCCESS) {
goto error_handling;
}
}
-
- ut_ad(trx_get_dict_operation(ctx->trx) == op);
}
DBUG_ASSERT(error == DB_SUCCESS);
- /* Commit the data dictionary transaction in order to release
- the table locks on the system tables. This means that if
- MySQL crashes while creating a new primary key inside
- row_merge_build_indexes(), ctx->new_table will not be dropped
- by trx_rollback_active(). It will have to be recovered or
- dropped by the database administrator. */
- trx_commit_for_mysql(ctx->trx);
+ {
+ /* Commit the data dictionary transaction in order to release
+ the table locks on the system tables. This means that if
+ MariaDB is killed while rebuilding the table inside
+ row_merge_build_indexes(), ctx->new_table will not be dropped
+ by trx_rollback_active(). */
+ ut_d(dict_table_check_for_dup_indexes(user_table,
+ CHECK_PARTIAL_OK));
+ if (ctx->need_rebuild()) {
+ ctx->new_table->acquire();
+ }
- row_mysql_unlock_data_dictionary(ctx->trx);
- dict_locked = false;
+ /* fts_create_common_tables() may drop old common tables,
+ whose files would be deleted here. */
+ commit_unlock_and_unlink(ctx->trx);
+ if (fts_exist) {
+ purge_sys.resume_FTS();
+ }
- ut_ad(!ctx->trx->lock.n_active_thrs);
+ trx_start_for_ddl(ctx->trx);
+ ctx->prebuilt->trx_id = ctx->trx->id;
+ }
if (ctx->old_table->fts) {
fts_sync_during_ddl(ctx->old_table);
}
+ DBUG_RETURN(false);
+
error_handling:
/* After an error, remove all those index definitions from the
dictionary which were defined. */
switch (error) {
- case DB_SUCCESS:
- ut_a(!dict_locked);
-
- ut_d(mutex_enter(&dict_sys.mutex));
- ut_d(dict_table_check_for_dup_indexes(
- user_table, CHECK_PARTIAL_OK));
- ut_d(mutex_exit(&dict_sys.mutex));
- DBUG_RETURN(false);
case DB_TABLESPACE_EXISTS:
my_error(ER_TABLESPACE_EXISTS, MYF(0), "(unknown)");
break;
case DB_DUPLICATE_KEY:
my_error(ER_DUP_KEY, MYF(0), "SYS_INDEXES");
break;
- case DB_UNSUPPORTED:
- my_error(ER_TABLE_CANT_HANDLE_SPKEYS, MYF(0), "SYS_COLUMNS");
- break;
default:
my_error_innodb(error, table_name, user_table->flags);
}
-error_handled:
+ ctx->trx->rollback();
- ctx->prebuilt->trx->error_info = NULL;
-
- if (!ctx->trx) {
- goto err_exit;
- }
+ ut_ad(!ctx->need_rebuild()
+ || !user_table->indexes.start->online_log);
+ ctx->prebuilt->trx->error_info = NULL;
ctx->trx->error_state = DB_SUCCESS;
- if (!dict_locked) {
+ if (false) {
+error_handled:
+ ut_ad(!table_lock_failed);
+ ut_ad(ctx->trx->state == TRX_STATE_ACTIVE);
+ ut_ad(!ctx->trx->undo_no);
+ ut_ad(dict_locked);
+ } else if (table_lock_failed) {
+ if (!dict_locked) {
+ row_mysql_lock_data_dictionary(ctx->trx);
+ }
+ goto err_exit;
+ } else {
+ ut_ad(ctx->trx->state == TRX_STATE_NOT_STARTED);
+ if (new_clustered && !user_table->drop_aborted) {
+ goto err_exit;
+ }
+ if (dict_locked) {
+ row_mysql_unlock_data_dictionary(ctx->trx);
+ }
+ trx_start_for_ddl(ctx->trx);
+ dberr_t err= lock_sys_tables(ctx->trx);
row_mysql_lock_data_dictionary(ctx->trx);
- }
-
- if (new_clustered) {
- if (ctx->need_rebuild()) {
-
- if (DICT_TF2_FLAG_IS_SET(
- ctx->new_table, DICT_TF2_FTS)) {
- innobase_drop_fts_index_table(
- ctx->new_table, ctx->trx);
- }
-
- dict_table_close_and_drop(ctx->trx, ctx->new_table);
-
- /* Free the log for online table rebuild, if
- one was allocated. */
-
- dict_index_t* clust_index = dict_table_get_first_index(
- user_table);
-
- rw_lock_x_lock(&clust_index->lock);
-
- if (clust_index->online_log) {
- ut_ad(ctx->online);
- row_log_abort_sec(clust_index);
- clust_index->online_status
- = ONLINE_INDEX_COMPLETE;
- }
-
- rw_lock_x_unlock(&clust_index->lock);
+ if (err != DB_SUCCESS) {
+ goto err_exit;
}
+ }
- trx_commit_for_mysql(ctx->trx);
- /* n_ref_count must be 1, because purge cannot
- be executing on this very table as we are
- holding dict_sys.latch X-latch. */
- ut_ad(!stats_wait || ctx->online
- || user_table->get_ref_count() == 1);
+ /* n_ref_count must be 1, because background threads cannot
+ be executing on this very table as we are
+ holding MDL_EXCLUSIVE. */
+ ut_ad(ctx->online || user_table->get_ref_count() == 1);
- online_retry_drop_indexes_with_trx(user_table, ctx->trx);
+ if (new_clustered) {
+ online_retry_drop_indexes_low(user_table, ctx->trx);
+ commit_unlock_and_unlink(ctx->trx);
+ row_mysql_lock_data_dictionary(ctx->trx);
} else {
- ut_ad(!ctx->need_rebuild());
row_merge_drop_indexes(ctx->trx, user_table, true);
- trx_commit_for_mysql(ctx->trx);
+ ctx->trx->commit();
}
ut_d(dict_table_check_for_dup_indexes(user_table, CHECK_ALL_COMPLETE));
@@ -7297,10 +7443,13 @@ err_exit:
if (ctx->trx) {
row_mysql_unlock_data_dictionary(ctx->trx);
-
+ ctx->trx->rollback();
ctx->trx->free();
}
trx_commit_for_mysql(ctx->prebuilt->trx);
+ if (fts_exist) {
+ purge_sys.resume_FTS();
+ }
for (uint i = 0; i < ctx->num_to_add_fk; i++) {
if (ctx->add_fk[i]) {
@@ -7422,8 +7571,8 @@ rename_index_try(
trx_t* trx)
{
DBUG_ENTER("rename_index_try");
- ut_d(dict_sys.assert_locked());
- ut_ad(trx->dict_operation_lock_mode == RW_X_LATCH);
+ ut_ad(dict_sys.locked());
+ ut_ad(trx->dict_operation_lock_mode);
pars_info_t* pinfo;
dberr_t err;
@@ -7450,8 +7599,7 @@ rename_index_try(
"WHERE\n"
"ID = :index_id AND\n"
"TABLE_ID = :table_id;\n"
- "END;\n",
- FALSE, trx); /* pinfo is freed by que_eval_sql() */
+ "END;\n", trx); /* pinfo is freed by que_eval_sql() */
DBUG_EXECUTE_IF(
"ib_rename_index_fail1",
@@ -7480,7 +7628,7 @@ void
innobase_rename_index_cache(dict_index_t* index, const char* new_name)
{
DBUG_ENTER("innobase_rename_index_cache");
- ut_d(dict_sys.assert_locked());
+ ut_ad(dict_sys.locked());
size_t old_name_len = strlen(index->name);
size_t new_name_len = strlen(new_name);
@@ -7639,14 +7787,15 @@ ha_innobase::prepare_inplace_alter_table(
}
#endif /* UNIV_DEBUG */
- ut_d(mutex_enter(&dict_sys.mutex));
+ ut_d(dict_sys.freeze(SRW_LOCK_CALL));
ut_d(dict_table_check_for_dup_indexes(
m_prebuilt->table, CHECK_ABORTED_OK));
- ut_d(mutex_exit(&dict_sys.mutex));
+ ut_d(dict_sys.unfreeze());
if (!(ha_alter_info->handler_flags & ~INNOBASE_INPLACE_IGNORE)) {
/* Nothing to do */
- DBUG_ASSERT(m_prebuilt->trx->dict_operation_lock_mode == 0);
+ DBUG_ASSERT(!m_prebuilt->trx->dict_operation_lock_mode);
+ m_prebuilt->trx_id = 0;
DBUG_RETURN(false);
}
@@ -7740,7 +7889,7 @@ ha_innobase::prepare_inplace_alter_table(
ha_alter_info->key_info_buffer,
ha_alter_info->key_count)) {
err_exit_no_heap:
- DBUG_ASSERT(m_prebuilt->trx->dict_operation_lock_mode == 0);
+ DBUG_ASSERT(!m_prebuilt->trx->dict_operation_lock_mode);
online_retry_drop_indexes(m_prebuilt->table, m_user_thd);
DBUG_RETURN(true);
}
@@ -8192,6 +8341,8 @@ err_exit:
const ha_table_option_struct& alt_opt=
*ha_alter_info->create_info->option_struct;
+ ha_innobase_inplace_ctx *ctx = NULL;
+
if (!(ha_alter_info->handler_flags & INNOBASE_ALTER_DATA)
|| ((ha_alter_info->handler_flags & ~(INNOBASE_INPLACE_IGNORE
| INNOBASE_ALTER_NOCREATE
@@ -8199,7 +8350,9 @@ err_exit:
== ALTER_OPTIONS
&& !alter_options_need_rebuild(ha_alter_info, table))) {
- ha_innobase_inplace_ctx *ctx = NULL;
+ DBUG_ASSERT(!m_prebuilt->trx->dict_operation_lock_mode);
+ online_retry_drop_indexes(m_prebuilt->table, m_user_thd);
+
if (heap) {
ctx = new ha_innobase_inplace_ctx(
m_prebuilt,
@@ -8216,9 +8369,6 @@ err_exit:
ha_alter_info->handler_ctx = ctx;
}
- DBUG_ASSERT(m_prebuilt->trx->dict_operation_lock_mode == 0);
- online_retry_drop_indexes(m_prebuilt->table, m_user_thd);
-
if ((ha_alter_info->handler_flags
& ALTER_DROP_VIRTUAL_COLUMN)
&& prepare_inplace_drop_virtual(ha_alter_info, table)) {
@@ -8250,6 +8400,16 @@ err_exit:
ctx->new_table->vc_templ = NULL;
}
+
+success:
+ /* Memorize the future transaction ID for committing
+ the data dictionary change, to be reported by
+ ha_innobase::table_version(). */
+ m_prebuilt->trx_id = (ha_alter_info->handler_flags
+ & ~INNOBASE_INPLACE_IGNORE)
+ ? static_cast<ha_innobase_inplace_ctx*>
+ (ha_alter_info->handler_ctx)->trx->id
+ : 0;
DBUG_RETURN(false);
}
@@ -8315,7 +8475,7 @@ err_exit:
field = altered_table->field[i];
- DBUG_ASSERT((MTYP_TYPENR(field->unireg_check)
+ DBUG_ASSERT((field->unireg_check
== Field::NEXT_NUMBER)
== !!(field->flags & AUTO_INCREMENT_FLAG));
@@ -8353,12 +8513,16 @@ found_col:
ha_alter_info->ignore || !thd_is_strict_mode(m_user_thd),
alt_opt.page_compressed, alt_opt.page_compression_level);
- DBUG_RETURN(prepare_inplace_alter_table_dict(
- ha_alter_info, altered_table, table,
- table_share->table_name.str,
- info.flags(), info.flags2(),
- fts_doc_col_no, add_fts_doc_id,
- add_fts_doc_id_idx));
+ if (!prepare_inplace_alter_table_dict(
+ ha_alter_info, altered_table, table,
+ table_share->table_name.str,
+ info.flags(), info.flags2(),
+ fts_doc_col_no, add_fts_doc_id,
+ add_fts_doc_id_idx)) {
+ goto success;
+ }
+
+ DBUG_RETURN(true);
}
/* Check whether a columnn length change alter operation requires
@@ -8394,27 +8558,6 @@ alter_templ_needs_rebuild(
return(false);
}
-/** Get the name of an erroneous key.
-@param[in] error_key_num InnoDB number of the erroneus key
-@param[in] ha_alter_info changes that were being performed
-@param[in] table InnoDB table
-@return the name of the erroneous key */
-static
-const char*
-get_error_key_name(
- ulint error_key_num,
- const Alter_inplace_info* ha_alter_info,
- const dict_table_t* table)
-{
- if (error_key_num == ULINT_UNDEFINED) {
- return(FTS_DOC_ID_INDEX_NAME);
- } else if (ha_alter_info->key_count == 0) {
- return(dict_table_get_first_index(table)->name);
- } else {
- return(ha_alter_info->key_info_buffer[error_key_num].name.str);
- }
-}
-
/** Alter the table structure in-place with operations
specified using Alter_inplace_info.
The level of concurrency allowed during this operation depends
@@ -8442,9 +8585,6 @@ ha_innobase::inplace_alter_table(
bool rebuild_templ = false;
DBUG_ENTER("inplace_alter_table");
DBUG_ASSERT(!srv_read_only_mode);
- ut_ad(!sync_check_iterate(sync_check()));
- ut_ad(!rw_lock_own_flagged(&dict_sys.latch,
- RW_LOCK_FLAG_X | RW_LOCK_FLAG_S));
DEBUG_SYNC(m_user_thd, "innodb_inplace_alter_table_enter");
@@ -8554,7 +8694,9 @@ ok_exit:
ctx->add_index, ctx->add_key_numbers, ctx->num_to_add_index,
altered_table, ctx->defaults, ctx->col_map,
ctx->add_autoinc, ctx->sequence, ctx->skip_pk_sort,
- ctx->m_stage, add_v, eval_table, ctx->allow_not_null);
+ ctx->m_stage, add_v, eval_table, ctx->allow_not_null,
+ ctx->change_col_collate.empty()
+ ? nullptr : &ctx->change_col_collate);
#ifndef DBUG_OFF
oom:
@@ -8591,12 +8733,17 @@ oom:
switch (error) {
KEY* dup_key;
+ default:
+ my_error_innodb(error,
+ table_share->table_name.str,
+ m_prebuilt->table->flags);
+ break;
all_done:
case DB_SUCCESS:
- ut_d(mutex_enter(&dict_sys.mutex));
+ ut_d(dict_sys.freeze(SRW_LOCK_CALL));
ut_d(dict_table_check_for_dup_indexes(
m_prebuilt->table, CHECK_PARTIAL_OK));
- ut_d(mutex_exit(&dict_sys.mutex));
+ ut_d(dict_sys.unfreeze());
/* prebuilt->table->n_ref_count can be anything here,
given that we hold at most a shared lock on the table. */
goto ok_exit;
@@ -8627,18 +8774,14 @@ oom:
get_error_key_name(m_prebuilt->trx->error_key_num,
ha_alter_info, m_prebuilt->table));
break;
- case DB_DECRYPTION_FAILED: {
+ case DB_DECRYPTION_FAILED:
String str;
const char* engine= table_type();
get_error_message(HA_ERR_DECRYPTION_FAILED, &str);
- my_error(ER_GET_ERRMSG, MYF(0), HA_ERR_DECRYPTION_FAILED, str.c_ptr(), engine);
+ my_error(ER_GET_ERRMSG, MYF(0), HA_ERR_DECRYPTION_FAILED,
+ str.c_ptr(), engine);
break;
}
- default:
- my_error_innodb(error,
- table_share->table_name.str,
- m_prebuilt->table->flags);
- }
/* prebuilt->table->n_ref_count can be anything here, given
that we hold at most a shared lock on the table. */
@@ -8657,8 +8800,8 @@ innobase_online_rebuild_log_free(
dict_table_t* table)
{
dict_index_t* clust_index = dict_table_get_first_index(table);
- ut_d(dict_sys.assert_locked());
- rw_lock_x_lock(&clust_index->lock);
+ ut_ad(dict_sys.locked());
+ clust_index->lock.x_lock(SRW_LOCK_CALL);
if (clust_index->online_log) {
ut_ad(dict_index_get_online_status(clust_index)
@@ -8671,7 +8814,7 @@ innobase_online_rebuild_log_free(
DBUG_ASSERT(dict_index_get_online_status(clust_index)
== ONLINE_INDEX_COMPLETE);
- rw_lock_x_unlock(&clust_index->lock);
+ clust_index->lock.x_unlock();
}
/** For each user column, which is part of an index which is not going to be
@@ -8759,26 +8902,7 @@ innobase_rollback_sec_index(
}
}
-/* Get the number of uncommitted fts index during rollback
-operation.
-@param[in] table table which undergoes rollback for alter
-@return number of uncommitted fts indexes. */
-static
-ulint innobase_get_uncommitted_fts_indexes(const dict_table_t* table)
-{
- ut_ad(mutex_own(&dict_sys.mutex));
- dict_index_t* index = dict_table_get_first_index(table);
- ulint n_uncommitted_fts = 0;
-
- for (; index ; index = dict_table_get_next_index(index))
- {
- if (index->type & DICT_FTS && !index->is_committed())
- n_uncommitted_fts++;
- }
-
- return n_uncommitted_fts;
-}
-
+MY_ATTRIBUTE((nonnull, warn_unused_result))
/** Roll back the changes made during prepare_inplace_alter_table()
and inplace_alter_table() inside the storage engine. Note that the
allowed level of concurrency during this operation will be the same as
@@ -8792,174 +8916,194 @@ during prepare, but might not be during commit).
@retval true Failure
@retval false Success
*/
-inline MY_ATTRIBUTE((nonnull, warn_unused_result))
-bool
-rollback_inplace_alter_table(
-/*=========================*/
- Alter_inplace_info* ha_alter_info,
- const TABLE* table,
- row_prebuilt_t* prebuilt)
+inline bool rollback_inplace_alter_table(Alter_inplace_info *ha_alter_info,
+ const TABLE *table,
+ row_prebuilt_t *prebuilt)
{
- bool fail = false;
-
- ha_innobase_inplace_ctx* ctx
- = static_cast<ha_innobase_inplace_ctx*>
- (ha_alter_info->handler_ctx);
-
- DBUG_ENTER("rollback_inplace_alter_table");
-
- if (!ctx || !ctx->trx) {
- /* If we have not started a transaction yet,
- (almost) nothing has been or needs to be done. */
- goto func_exit;
- }
-
- trx_start_for_ddl(ctx->trx, ctx->need_rebuild()
- ? TRX_DICT_OP_TABLE : TRX_DICT_OP_INDEX);
- row_mysql_lock_data_dictionary(ctx->trx);
-
- if (ctx->need_rebuild()) {
- /* DML threads can access ctx->new_table via the
- online rebuild log. Free it first. */
- innobase_online_rebuild_log_free(prebuilt->table);
- }
-
- if (!ctx->new_table) {
- ut_ad(ctx->need_rebuild());
- } else if (ctx->need_rebuild()) {
- dberr_t err= DB_SUCCESS;
- ulint flags = ctx->new_table->flags;
-
- /* Since the FTS index specific auxiliary tables has
- not yet registered with "table->fts" by fts_add_index(),
- we will need explicitly delete them here */
- if (dict_table_has_fts_index(ctx->new_table)) {
-
- err = innobase_drop_fts_index_table(
- ctx->new_table, ctx->trx);
-
- if (err != DB_SUCCESS) {
- my_error_innodb(
- err, table->s->table_name.str,
- flags);
- fail = true;
- }
- }
-
- dict_table_close_and_drop(ctx->trx, ctx->new_table);
-
- switch (err) {
- case DB_SUCCESS:
- break;
- default:
- my_error_innodb(err, table->s->table_name.str,
- flags);
- fail = true;
- }
- } else {
- DBUG_ASSERT(!(ha_alter_info->handler_flags
- & ALTER_ADD_PK_INDEX));
- DBUG_ASSERT(ctx->new_table == prebuilt->table);
-
- /* Remove the fts table from fts_optimize_wq if
- there is only one fts index exist. */
- if (prebuilt->table->fts
- && innobase_get_uncommitted_fts_indexes(
- prebuilt->table) == 1
- && (ib_vector_is_empty(prebuilt->table->fts->indexes)
- || ib_vector_size(prebuilt->table->fts->indexes)
- == 1)) {
- row_mysql_unlock_data_dictionary(ctx->trx);
- fts_optimize_remove_table(prebuilt->table);
- row_mysql_lock_data_dictionary(ctx->trx);
- }
-
- innobase_rollback_sec_index(
- prebuilt->table, table,
- (ha_alter_info->alter_info->requested_lock
- == Alter_info::ALTER_TABLE_LOCK_EXCLUSIVE),
- ctx->trx, prebuilt->trx);
-
- ctx->clean_new_vcol_index();
- }
-
- trx_commit_for_mysql(ctx->trx);
- row_mysql_unlock_data_dictionary(ctx->trx);
- ctx->trx->free();
- ctx->trx = NULL;
+ bool fail= false;
+ ha_innobase_inplace_ctx *ctx= static_cast<ha_innobase_inplace_ctx*>
+ (ha_alter_info->handler_ctx);
+
+ DBUG_ENTER("rollback_inplace_alter_table");
+
+ DEBUG_SYNC_C("innodb_rollback_inplace_alter_table");
+ if (!ctx)
+ /* If we have not started a transaction yet,
+ (almost) nothing has been or needs to be done. */
+ dict_sys.lock(SRW_LOCK_CALL);
+ else if (ctx->trx->state == TRX_STATE_NOT_STARTED)
+ goto free_and_exit;
+ else if (ctx->new_table)
+ {
+ ut_ad(ctx->trx->state == TRX_STATE_ACTIVE);
+ const bool fts_exist= (ctx->new_table->flags2 &
+ (DICT_TF2_FTS_HAS_DOC_ID | DICT_TF2_FTS)) ||
+ ctx->adding_fulltext_index();
+ if (ctx->need_rebuild())
+ {
+ if (fts_exist)
+ {
+ fts_optimize_remove_table(ctx->new_table);
+ purge_sys.stop_FTS(*ctx->new_table);
+ }
-func_exit:
-#ifndef DBUG_OFF
- dict_index_t* clust_index = dict_table_get_first_index(
- prebuilt->table);
- DBUG_ASSERT(!clust_index->online_log);
- DBUG_ASSERT(dict_index_get_online_status(clust_index)
- == ONLINE_INDEX_COMPLETE);
-#endif /* !DBUG_OFF */
+ dberr_t err= lock_table_for_trx(ctx->new_table, ctx->trx, LOCK_X);
+ if (fts_exist)
+ {
+ if (err == DB_SUCCESS)
+ err= fts_lock_common_tables(ctx->trx, *ctx->new_table);
+ for (const dict_index_t* index= ctx->new_table->indexes.start;
+ err == DB_SUCCESS && index; index= index->indexes.next)
+ if (index->type & DICT_FTS)
+ err= fts_lock_index_tables(ctx->trx, *index);
+ }
+ if (err == DB_SUCCESS)
+ err= lock_sys_tables(ctx->trx);
+
+ row_mysql_lock_data_dictionary(ctx->trx);
+ /* Detach ctx->new_table from dict_index_t::online_log. */
+ innobase_online_rebuild_log_free(ctx->old_table);
+
+ ut_d(const bool last_handle=) ctx->new_table->release();
+ ut_ad(last_handle);
+ if (err == DB_SUCCESS)
+ err= ctx->trx->drop_table(*ctx->new_table);
+
+ if (err == DB_SUCCESS)
+ for (const dict_index_t* index= ctx->new_table->indexes.start; index;
+ index= index->indexes.next)
+ if (index->type & DICT_FTS)
+ if (dberr_t err2= fts_drop_index_tables(ctx->trx, *index))
+ err= err2;
+
+ if (err != DB_SUCCESS)
+ {
+ my_error_innodb(err, table->s->table_name.str, ctx->new_table->flags);
+ fail= true;
+ }
+ }
+ else
+ {
+ DBUG_ASSERT(!(ha_alter_info->handler_flags & ALTER_ADD_PK_INDEX));
+ DBUG_ASSERT(ctx->old_table == prebuilt->table);
+ uint &innodb_lock_wait_timeout=
+ thd_lock_wait_timeout(ctx->trx->mysql_thd);
+ const uint save_timeout= innodb_lock_wait_timeout;
+ innodb_lock_wait_timeout= ~0U; /* infinite */
+ dict_index_t *old_clust_index= ctx->old_table->indexes.start;
+ old_clust_index->lock.x_lock(SRW_LOCK_CALL);
+ old_clust_index->online_log= nullptr;
+ old_clust_index->lock.x_unlock();
+ if (fts_exist)
+ {
+ const dict_index_t *fts_index= nullptr;
+ for (ulint a= 0; a < ctx->num_to_add_index; a++)
+ {
+ const dict_index_t *index = ctx->add_index[a];
+ if (index->type & DICT_FTS)
+ fts_index= index;
+ }
- if (ctx) {
- DBUG_ASSERT(ctx->prebuilt == prebuilt);
+ /* Remove the fts table from fts_optimize_wq if there are
+ no FTS secondary index exist other than newly added one */
+ if (fts_index &&
+ (ib_vector_is_empty(prebuilt->table->fts->indexes) ||
+ (ib_vector_size(prebuilt->table->fts->indexes) == 1 &&
+ fts_index == static_cast<dict_index_t*>(
+ ib_vector_getp(prebuilt->table->fts->indexes, 0)))))
+ fts_optimize_remove_table(prebuilt->table);
+
+ purge_sys.stop_FTS(*prebuilt->table);
+ ut_a(!fts_index || !fts_lock_index_tables(ctx->trx, *fts_index));
+ ut_a(!fts_lock_common_tables(ctx->trx, *ctx->new_table));
+ ut_a(!lock_sys_tables(ctx->trx));
+ }
+ else
+ {
+ ut_a(!lock_table_for_trx(dict_sys.sys_indexes, ctx->trx, LOCK_X));
+ ut_a(!lock_table_for_trx(dict_sys.sys_fields, ctx->trx, LOCK_X));
+ }
+ innodb_lock_wait_timeout= save_timeout;
+ DEBUG_SYNC_C("innodb_rollback_after_fts_lock");
+ row_mysql_lock_data_dictionary(ctx->trx);
+ ctx->rollback_instant();
+ innobase_rollback_sec_index(ctx->old_table, table,
+ ha_alter_info->alter_info->requested_lock ==
+ Alter_info::ALTER_TABLE_LOCK_EXCLUSIVE,
+ ctx->trx, prebuilt->trx);
+ ctx->clean_new_vcol_index();
+ ctx->cleanup_col_collation();
+ ut_d(dict_table_check_for_dup_indexes(ctx->old_table, CHECK_ABORTED_OK));
+ }
- if (ctx->num_to_add_fk) {
- for (ulint i = 0; i < ctx->num_to_add_fk; i++) {
- dict_foreign_free(ctx->add_fk[i]);
- }
- }
+ DEBUG_SYNC(ctx->trx->mysql_thd, "before_commit_rollback_inplace");
+ commit_unlock_and_unlink(ctx->trx);
+ if (fts_exist)
+ purge_sys.resume_FTS();
+ if (ctx->old_table->fts)
+ {
+ dict_sys.lock(SRW_LOCK_CALL);
+ ut_ad(fts_check_cached_index(ctx->old_table));
+ fts_optimize_add_table(ctx->old_table);
+ dict_sys.unlock();
+ }
+ goto free_and_exit;
+ }
+ else
+ {
+free_and_exit:
+ DBUG_ASSERT(ctx->prebuilt == prebuilt);
+ ctx->trx->free();
+ ctx->trx= nullptr;
- if (ctx->num_to_drop_index) {
- row_mysql_lock_data_dictionary(prebuilt->trx);
+ dict_sys.lock(SRW_LOCK_CALL);
- /* Clear the to_be_dropped flags
- in the data dictionary cache.
- The flags may already have been cleared,
- in case an error was detected in
- commit_inplace_alter_table(). */
- for (ulint i = 0; i < ctx->num_to_drop_index; i++) {
- dict_index_t* index = ctx->drop_index[i];
- DBUG_ASSERT(index->is_committed());
- index->to_be_dropped = 0;
- }
+ if (ctx->add_vcol)
+ {
+ for (ulint i = 0; i < ctx->num_to_add_vcol; i++)
+ ctx->add_vcol[i].~dict_v_col_t();
+ ctx->num_to_add_vcol= 0;
+ ctx->add_vcol= nullptr;
+ }
- row_mysql_unlock_data_dictionary(prebuilt->trx);
- }
+ for (ulint i= 0; i < ctx->num_to_add_fk; i++)
+ dict_foreign_free(ctx->add_fk[i]);
+ /* Clear the to_be_dropped flags in the data dictionary cache.
+ The flags may already have been cleared, in case an error was
+ detected in commit_inplace_alter_table(). */
+ for (ulint i= 0; i < ctx->num_to_drop_index; i++)
+ {
+ dict_index_t *index= ctx->drop_index[i];
+ DBUG_ASSERT(index->is_committed());
+ index->to_be_dropped= 0;
+ }
+ }
- if (ctx->add_vcol) {
- for (ulint i = 0; i < ctx->num_to_add_vcol; i++) {
- ctx->add_vcol[i].~dict_v_col_t();
- }
- ctx->num_to_add_vcol = 0;
- ctx->add_vcol = nullptr;
- }
- }
+ DBUG_ASSERT(!prebuilt->table->indexes.start->online_log);
+ DBUG_ASSERT(prebuilt->table->indexes.start->online_status ==
+ ONLINE_INDEX_COMPLETE);
- /* Reset dict_col_t::ord_part for those columns fail to be indexed,
- we do this by checking every existing column, if any current
- index would index them */
- for (ulint i = 0; i < dict_table_get_n_cols(prebuilt->table); i++) {
- dict_col_t& col = prebuilt->table->cols[i];
- if (!col.ord_part) {
- continue;
- }
- if (!check_col_exists_in_indexes(prebuilt->table, i, false,
- true)) {
- col.ord_part = 0;
- }
- }
-
- for (ulint i = 0; i < dict_table_get_n_v_cols(prebuilt->table); i++) {
- dict_col_t& col = prebuilt->table->v_cols[i].m_col;
- if (!col.ord_part) {
- continue;
- }
- if (!check_col_exists_in_indexes(prebuilt->table, i, true,
- true)) {
- col.ord_part = 0;
- }
- }
+ /* Reset dict_col_t::ord_part for unindexed columns */
+ for (ulint i= 0; i < dict_table_get_n_cols(prebuilt->table); i++)
+ {
+ dict_col_t &col= prebuilt->table->cols[i];
+ if (col.ord_part && !check_col_exists_in_indexes(prebuilt->table, i, false,
+ true))
+ col.ord_part= 0;
+ }
- trx_commit_for_mysql(prebuilt->trx);
- MONITOR_ATOMIC_DEC(MONITOR_PENDING_ALTER_TABLE);
- DBUG_RETURN(fail);
+ for (ulint i = 0; i < dict_table_get_n_v_cols(prebuilt->table); i++)
+ {
+ dict_col_t &col = prebuilt->table->v_cols[i].m_col;
+ if (col.ord_part && !check_col_exists_in_indexes(prebuilt->table, i, true,
+ true))
+ col.ord_part= 0;
+ }
+ dict_sys.unlock();
+ trx_commit_for_mysql(prebuilt->trx);
+ prebuilt->trx_id = 0;
+ MONITOR_ATOMIC_DEC(MONITOR_PENDING_ALTER_TABLE);
+ DBUG_RETURN(fail);
}
/** Drop a FOREIGN KEY constraint from the data dictionary tables.
@@ -8978,9 +9122,9 @@ innobase_drop_foreign_try(
{
DBUG_ENTER("innobase_drop_foreign_try");
- DBUG_ASSERT(trx_get_dict_operation(trx) == TRX_DICT_OP_INDEX);
- ut_ad(trx->dict_operation_lock_mode == RW_X_LATCH);
- ut_d(dict_sys.assert_locked());
+ DBUG_ASSERT(trx->dict_operation);
+ ut_ad(trx->dict_operation_lock_mode);
+ ut_ad(dict_sys.locked());
/* Drop the constraint from the data dictionary. */
static const char sql[] =
@@ -8997,7 +9141,7 @@ innobase_drop_foreign_try(
pars_info_add_str_literal(info, "id", foreign_id);
trx->op_info = "dropping foreign key constraint from dictionary";
- error = que_eval_sql(info, sql, FALSE, trx);
+ error = que_eval_sql(info, sql, trx);
trx->op_info = "";
DBUG_EXECUTE_IF("ib_drop_foreign_error",
@@ -9034,9 +9178,9 @@ innobase_rename_column_try(
DBUG_ENTER("innobase_rename_column_try");
- DBUG_ASSERT(trx_get_dict_operation(trx) == TRX_DICT_OP_INDEX);
- ut_ad(trx->dict_operation_lock_mode == RW_X_LATCH);
- ut_d(dict_sys.assert_locked());
+ DBUG_ASSERT(trx->dict_operation);
+ ut_ad(trx->dict_operation_lock_mode);
+ ut_ad(dict_sys.locked());
if (ctx.need_rebuild()) {
goto rename_foreign;
@@ -9082,8 +9226,7 @@ innobase_rename_column_try(
"UPDATE SYS_FIELDS SET COL_NAME=:new\n"
"WHERE INDEX_ID=:indexid\n"
"AND POS=:nth;\n"
- "END;\n",
- FALSE, trx);
+ "END;\n", trx);
DBUG_EXECUTE_IF("ib_rename_column_error",
error = DB_OUT_OF_FILE_SPACE;);
@@ -9113,8 +9256,7 @@ innobase_rename_column_try(
"UPDATE SYS_FIELDS SET COL_NAME=:new\n"
"WHERE INDEX_ID=:indexid\n"
"AND POS=:nth;\n"
- "END;\n",
- FALSE, trx);
+ "END;\n", trx);
if (error != DB_SUCCESS) {
goto err_exit;
@@ -9175,8 +9317,7 @@ rename_foreign:
"UPDATE SYS_FOREIGN_COLS\n"
"SET FOR_COL_NAME=:new\n"
"WHERE ID=:id AND POS=:nth;\n"
- "END;\n",
- FALSE, trx);
+ "END;\n", trx);
if (error != DB_SUCCESS) {
goto err_exit;
@@ -9217,8 +9358,7 @@ rename_foreign:
"UPDATE SYS_FOREIGN_COLS\n"
"SET REF_COL_NAME=:new\n"
"WHERE ID=:id AND POS=:nth;\n"
- "END;\n",
- FALSE, trx);
+ "END;\n", trx);
if (error != DB_SUCCESS) {
goto err_exit;
@@ -9291,36 +9431,6 @@ processed_field:
return(false);
}
-/** Convert field type and length to InnoDB format */
-static void get_type(const Field& f, uint& prtype, uint8_t& mtype,
- uint16_t& len)
-{
- mtype = get_innobase_type_from_mysql_type(&prtype, &f);
- len = static_cast<uint16_t>(f.pack_length());
- prtype |= f.type();
- if (f.type() == MYSQL_TYPE_VARCHAR) {
- auto l = static_cast<const Field_varstring&>(f).length_bytes;
- len = static_cast<uint16_t>(len - l);
- if (l == 2) prtype |= DATA_LONG_TRUE_VARCHAR;
- }
- if (!f.real_maybe_null()) prtype |= DATA_NOT_NULL;
- if (f.binary()) prtype |= DATA_BINARY_TYPE;
- if (f.table->versioned()) {
- if (&f == f.table->field[f.table->s->vers.start_fieldno]) {
- prtype |= DATA_VERS_START;
- } else if (&f == f.table->field[f.table->s->vers.end_fieldno]) {
- prtype |= DATA_VERS_END;
- } else if (!(f.flags & VERS_UPDATE_UNVERSIONED_FLAG)) {
- prtype |= DATA_VERSIONED;
- }
- }
- if (!f.stored_in_db()) prtype |= DATA_VIRTUAL;
-
- if (dtype_is_string_type(mtype)) {
- prtype |= f.charset()->number << 16;
- }
-}
-
/** Enlarge a column in the data dictionary tables.
@param ctx In-place ALTER TABLE context
@param trx data dictionary transaction
@@ -9346,9 +9456,9 @@ innobase_rename_or_enlarge_column_try(
DBUG_ENTER("innobase_rename_or_enlarge_column_try");
DBUG_ASSERT(!ctx->need_rebuild());
- DBUG_ASSERT(trx_get_dict_operation(trx) == TRX_DICT_OP_INDEX);
- ut_ad(trx->dict_operation_lock_mode == RW_X_LATCH);
- ut_d(dict_sys.assert_locked());
+ DBUG_ASSERT(trx->dict_operation);
+ ut_ad(trx->dict_operation_lock_mode);
+ ut_ad(dict_sys.locked());
ulint n_base;
@@ -9774,7 +9884,7 @@ innobase_update_foreign_cache(
DBUG_ENTER("innobase_update_foreign_cache");
- ut_ad(mutex_own(&dict_sys.mutex));
+ ut_ad(dict_sys.locked());
user_table = ctx->old_table;
@@ -9790,8 +9900,6 @@ innobase_update_foreign_cache(
column names. No need to pass col_names or to drop
constraints from the data dictionary cache. */
DBUG_ASSERT(!ctx->col_names);
- DBUG_ASSERT(user_table->foreign_set.empty());
- DBUG_ASSERT(user_table->referenced_set.empty());
user_table = ctx->new_table;
} else {
/* Drop the foreign key constraints if the
@@ -9810,8 +9918,8 @@ innobase_update_foreign_cache(
dict_names_t fk_tables;
err = dict_load_foreigns(user_table->name.m_name,
- ctx->col_names, false, true,
- DICT_ERR_IGNORE_NONE,
+ ctx->col_names, 1, true,
+ DICT_ERR_IGNORE_FK_NOKEY,
fk_tables);
if (err == DB_CANNOT_ADD_CONSTRAINT) {
@@ -9821,7 +9929,7 @@ innobase_update_foreign_cache(
loaded with "foreign_key checks" off,
so let's retry the loading with charset_check is off */
err = dict_load_foreigns(user_table->name.m_name,
- ctx->col_names, false, false,
+ ctx->col_names, 1, false,
DICT_ERR_IGNORE_NONE,
fk_tables);
@@ -9842,17 +9950,14 @@ innobase_update_foreign_cache(
/* For complete loading of foreign keys, all associated tables must
also be loaded. */
while (err == DB_SUCCESS && !fk_tables.empty()) {
- dict_table_t* table = dict_load_table(
- fk_tables.front(), DICT_ERR_IGNORE_NONE);
-
- if (table == NULL) {
+ const char *f = fk_tables.front();
+ if (!dict_sys.load_table({f, strlen(f)})) {
err = DB_TABLE_NOT_FOUND;
ib::error()
- << "Failed to load table '"
- << table_name_t(const_cast<char*>
- (fk_tables.front()))
- << "' which has a foreign key constraint with"
- << " table '" << user_table->name << "'.";
+ << "Failed to load table "
+ << table_name_t(const_cast<char*>(f))
+ << " which has a foreign key constraint with"
+ << user_table->name;
break;
}
@@ -9893,8 +9998,7 @@ vers_change_field_try(
"BEGIN\n"
"UPDATE SYS_COLUMNS SET PRTYPE=:prtype\n"
"WHERE TABLE_ID=:tableid AND POS=:pos;\n"
- "END;\n",
- false, trx);
+ "END;\n", trx);
if (error != DB_SUCCESS) {
my_error_innodb(error, table_name, 0);
@@ -10023,6 +10127,7 @@ commit_try_rebuild(
ha_innobase_inplace_ctx*ctx,
TABLE* altered_table,
const TABLE* old_table,
+ bool statistics_exist,
trx_t* trx,
const char* table_name)
{
@@ -10031,13 +10136,15 @@ commit_try_rebuild(
DBUG_ENTER("commit_try_rebuild");
DBUG_ASSERT(ctx->need_rebuild());
- DBUG_ASSERT(trx->dict_operation_lock_mode == RW_X_LATCH);
+ DBUG_ASSERT(trx->dict_operation_lock_mode);
DBUG_ASSERT(!(ha_alter_info->handler_flags
& ALTER_DROP_FOREIGN_KEY)
|| ctx->num_to_drop_fk > 0);
DBUG_ASSERT(ctx->num_to_drop_fk
<= ha_alter_info->alter_info->drop_list.elements);
+ innobase_online_rebuild_log_free(user_table);
+
for (dict_index_t* index = dict_table_get_first_index(rebuilt_table);
index;
index = dict_table_get_next_index(index)) {
@@ -10054,8 +10161,6 @@ commit_try_rebuild(
DBUG_RETURN(true);
}
- dberr_t error;
-
/* Clear the to_be_dropped flag in the data dictionary cache
of user_table. */
for (ulint i = 0; i < ctx->num_to_drop_index; i++) {
@@ -10073,8 +10178,6 @@ commit_try_rebuild(
DBUG_RETURN(true);
}
- DBUG_EXECUTE_IF("ib_ddl_crash_before_rename", DBUG_SUICIDE(););
-
/* The new table must inherit the flag from the
"parent" table. */
if (!user_table->space) {
@@ -10087,18 +10190,25 @@ commit_try_rebuild(
old table. */
char* old_name= mem_heap_strdup(ctx->heap, user_table->name.m_name);
- error = row_rename_table_for_mysql(user_table->name.m_name,
- ctx->tmp_name, trx, false, false);
+ dberr_t error = row_rename_table_for_mysql(user_table->name.m_name,
+ ctx->tmp_name, trx, false);
if (error == DB_SUCCESS) {
- error = row_rename_table_for_mysql(rebuilt_table->name.m_name,
- old_name, trx,
- false, false);
+ error = row_rename_table_for_mysql(
+ rebuilt_table->name.m_name, old_name, trx, false);
+ if (error == DB_SUCCESS) {
+ /* The statistics for the surviving indexes will be
+ re-inserted in alter_stats_rebuild(). */
+ if (statistics_exist) {
+ error = trx->drop_table_statistics(old_name);
+ }
+ if (error == DB_SUCCESS) {
+ error = trx->drop_table(*user_table);
+ }
+ }
}
/* We must be still holding a table handle. */
DBUG_ASSERT(user_table->get_ref_count() == 1);
-
- DBUG_EXECUTE_IF("ib_ddl_crash_after_rename", DBUG_SUICIDE(););
DBUG_EXECUTE_IF("ib_rebuild_cannot_rename", error = DB_ERROR;);
switch (error) {
@@ -10234,8 +10344,7 @@ innobase_page_compression_try(
"BEGIN\n"
"UPDATE SYS_TABLES SET TYPE=:type\n"
"WHERE ID=:id;\n"
- "END;\n",
- false, trx);
+ "END;\n", trx);
if (error != DB_SUCCESS) {
my_error_innodb(error, table_name, 0);
@@ -10247,22 +10356,6 @@ innobase_page_compression_try(
DBUG_RETURN(false);
}
-static
-void
-dict_stats_try_drop_table(THD *thd, const table_name_t &name,
- const LEX_CSTRING &table_name)
-{
- char errstr[1024];
- if (dict_stats_drop_table(name.m_name, errstr, sizeof(errstr)) != DB_SUCCESS)
- {
- push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, ER_ALTER_INFO,
- "Deleting persistent statistics"
- " for table '%s' in InnoDB failed: %s",
- table_name.str,
- errstr);
- }
-}
-
/** Evict the table from cache and reopen it. Drop outdated statistics.
@param thd mariadb THD entity
@param table innodb table
@@ -10273,26 +10366,19 @@ static dict_table_t *innobase_reload_table(THD *thd, dict_table_t *table,
const LEX_CSTRING &table_name,
ha_innobase_inplace_ctx &ctx)
{
- char *tb_name= strdup(table->name.m_name);
- dict_table_close(table, true, false);
-
if (ctx.is_instant())
{
- for (auto i = ctx.old_n_v_cols; i--; )
+ for (auto i= ctx.old_n_v_cols; i--; )
{
ctx.old_v_cols[i].~dict_v_col_t();
- const_cast<unsigned&>(ctx.old_n_v_cols) = 0;
+ const_cast<unsigned&>(ctx.old_n_v_cols)= 0;
}
}
+ const table_id_t id= table->id;
+ table->release();
dict_sys.remove(table);
- table= dict_table_open_on_name(tb_name, TRUE, TRUE,
- DICT_ERR_IGNORE_FK_NOKEY);
-
- /* Drop outdated table stats. */
- dict_stats_try_drop_table(thd, table->name, table_name);
- free(tb_name);
- return table;
+ return dict_table_open_on_id(id, true, DICT_TABLE_OP_NORMAL);
}
/** Commit the changes made during prepare_inplace_alter_table()
@@ -10319,7 +10405,7 @@ commit_try_norebuild(
{
DBUG_ENTER("commit_try_norebuild");
DBUG_ASSERT(!ctx->need_rebuild());
- DBUG_ASSERT(trx->dict_operation_lock_mode == RW_X_LATCH);
+ DBUG_ASSERT(trx->dict_operation_lock_mode);
DBUG_ASSERT(!(ha_alter_info->handler_flags
& ALTER_DROP_FOREIGN_KEY)
|| ctx->num_to_drop_fk > 0);
@@ -10367,57 +10453,137 @@ commit_try_norebuild(
DBUG_RETURN(true);
}
- dberr_t error;
+ dberr_t error = DB_SUCCESS;
+ dict_index_t* index;
+ const char *op = "rename index to add";
+ ulint num_fts_index = 0;
/* We altered the table in place. Mark the indexes as committed. */
for (ulint i = 0; i < ctx->num_to_add_index; i++) {
- dict_index_t* index = ctx->add_index[i];
+ index = ctx->add_index[i];
DBUG_ASSERT(dict_index_get_online_status(index)
== ONLINE_INDEX_COMPLETE);
DBUG_ASSERT(!index->is_committed());
error = row_merge_rename_index_to_add(
trx, ctx->new_table->id, index->id);
- switch (error) {
- case DB_SUCCESS:
- break;
- case DB_TOO_MANY_CONCURRENT_TRXS:
- /* If we wrote some undo log here, then the
- persistent data dictionary for this table may
- probably be corrupted. This is because a
- 'trigger' on SYS_INDEXES could already have invoked
- btr_free_if_exists(), which cannot be rolled back. */
- DBUG_ASSERT(trx->undo_no == 0);
- my_error(ER_TOO_MANY_CONCURRENT_TRXS, MYF(0));
- DBUG_RETURN(true);
- default:
- sql_print_error(
- "InnoDB: rename index to add: %lu\n",
- (ulong) error);
- DBUG_ASSERT(0);
- my_error(ER_INTERNAL_ERROR, MYF(0),
- "rename index to add");
- DBUG_RETURN(true);
+ if (error) {
+ goto handle_error;
}
}
- /* Drop any indexes that were requested to be dropped.
- Flag them in the data dictionary first. */
+ for (dict_index_t *index = UT_LIST_GET_FIRST(ctx->old_table->indexes);
+ index; index = UT_LIST_GET_NEXT(indexes, index)) {
+ if (index->type & DICT_FTS) {
+ num_fts_index++;
+ }
+ }
+
+ char db[MAX_DB_UTF8_LEN], table[MAX_TABLE_UTF8_LEN];
+ if (ctx->num_to_drop_index) {
+ dict_fs2utf8(ctx->old_table->name.m_name,
+ db, sizeof db, table, sizeof table);
+ }
for (ulint i = 0; i < ctx->num_to_drop_index; i++) {
- dict_index_t* index = ctx->drop_index[i];
+ index = ctx->drop_index[i];
DBUG_ASSERT(index->is_committed());
DBUG_ASSERT(index->table == ctx->new_table);
DBUG_ASSERT(index->to_be_dropped);
+ op = "DROP INDEX";
+
+ static const char drop_index[] =
+ "PROCEDURE DROP_INDEX_PROC () IS\n"
+ "BEGIN\n"
+ "DELETE FROM SYS_FIELDS WHERE INDEX_ID=:indexid;\n"
+ "DELETE FROM SYS_INDEXES WHERE ID=:indexid;\n"
+ "END;\n";
+
+ pars_info_t* info = pars_info_create();
+ pars_info_add_ull_literal(info, "indexid", index->id);
+ error = que_eval_sql(info, drop_index, trx);
+
+ if (error == DB_SUCCESS && index->type & DICT_FTS) {
+ DBUG_ASSERT(index->table->fts);
+ DEBUG_SYNC_C("norebuild_fts_drop");
+ error = fts_drop_index(index->table, index, trx);
+ ut_ad(num_fts_index);
+ num_fts_index--;
+ }
- error = row_merge_rename_index_to_drop(
- trx, index->table->id, index->id);
if (error != DB_SUCCESS) {
- sql_print_error(
- "InnoDB: rename index to drop: %lu\n",
- (ulong) error);
- DBUG_ASSERT(0);
- my_error(ER_INTERNAL_ERROR, MYF(0),
- "rename index to drop");
+ goto handle_error;
+ }
+
+ error = dict_stats_delete_from_index_stats(db, table,
+ index->name, trx);
+ switch (error) {
+ case DB_SUCCESS:
+ case DB_STATS_DO_NOT_EXIST:
+ continue;
+ default:
+ goto handle_error;
+ }
+ }
+
+ if (const size_t size = ha_alter_info->rename_keys.size()) {
+ char tmp_name[5];
+ char db[MAX_DB_UTF8_LEN], table[MAX_TABLE_UTF8_LEN];
+
+ dict_fs2utf8(ctx->new_table->name.m_name, db, sizeof db,
+ table, sizeof table);
+ tmp_name[0]= (char)0xff;
+ for (size_t i = 0; error == DB_SUCCESS && i < size; i++) {
+ snprintf(tmp_name+1, sizeof(tmp_name)-1, "%zu", i);
+ error = dict_stats_rename_index(db, table,
+ ha_alter_info->
+ rename_keys[i].
+ old_key->name.str,
+ tmp_name, trx);
+ }
+ for (size_t i = 0; error == DB_SUCCESS && i < size; i++) {
+ snprintf(tmp_name+1, sizeof(tmp_name)-1, "%zu", i);
+ error = dict_stats_rename_index(db, table, tmp_name,
+ ha_alter_info
+ ->rename_keys[i].
+ new_key->name.str,
+ trx);
+ }
+
+ switch (error) {
+ case DB_SUCCESS:
+ case DB_STATS_DO_NOT_EXIST:
+ break;
+ case DB_DUPLICATE_KEY:
+ my_error(ER_DUP_KEY, MYF(0),
+ "mysql.innodb_index_stats");
+ DBUG_RETURN(true);
+ default:
+ goto handle_error;
+ }
+ }
+
+ if ((ctx->old_table->flags2 & DICT_TF2_FTS) && !num_fts_index) {
+ error = fts_drop_tables(trx, *ctx->old_table);
+ if (error != DB_SUCCESS) {
+handle_error:
+ switch (error) {
+ case DB_TOO_MANY_CONCURRENT_TRXS:
+ my_error(ER_TOO_MANY_CONCURRENT_TRXS, MYF(0));
+ break;
+ case DB_LOCK_WAIT_TIMEOUT:
+ my_error(ER_LOCK_WAIT_TIMEOUT, MYF(0));
+ break;
+ default:
+ sql_print_error("InnoDB: %s: %s\n", op,
+ ut_strerr(error));
+ DBUG_ASSERT(error == DB_IO_ERROR
+ || error == DB_LOCK_TABLE_FULL
+ || error == DB_DECRYPTION_FAILED
+ || error == DB_PAGE_CORRUPTED
+ || error == DB_CORRUPTION);
+ my_error(ER_INTERNAL_ERROR, MYF(0), op);
+ }
+
DBUG_RETURN(true);
}
}
@@ -10518,7 +10684,7 @@ commit_cache_norebuild(
if (fil_space_t* space = ctx->new_table->space) {
bool update = !(space->flags
& FSP_FLAGS_MASK_PAGE_COMPRESSION);
- mutex_enter(&fil_system.mutex);
+ mysql_mutex_lock(&fil_system.mutex);
space->flags &= ~FSP_FLAGS_MASK_MEM_COMPRESSION_LEVEL;
space->flags |= ctx->page_compression_level
<< FSP_FLAGS_MEM_COMPRESSION_LEVEL;
@@ -10530,7 +10696,7 @@ commit_cache_norebuild(
|= innodb_compression_algorithm
<< FSP_FLAGS_FCRC32_POS_COMPRESSED_ALGO;
}
- mutex_exit(&fil_system.mutex);
+ mysql_mutex_unlock(&fil_system.mutex);
if (update) {
/* Maybe we should introduce an undo
@@ -10550,7 +10716,8 @@ commit_cache_norebuild(
space->zip_size(),
RW_X_LATCH, &mtr)) {
byte* f = FSP_HEADER_OFFSET
- + FSP_SPACE_FLAGS + b->frame;
+ + FSP_SPACE_FLAGS
+ + b->page.frame;
const auto sf = space->flags
& ~FSP_FLAGS_MEM_MASK;
if (mach_read_from_4(f) != sf) {
@@ -10589,64 +10756,26 @@ commit_cache_norebuild(
DBUG_ASSERT(dict_index_get_online_status(index)
== ONLINE_INDEX_COMPLETE);
DBUG_ASSERT(!index->is_committed());
+ index->change_col_info = nullptr;
index->set_committed(true);
}
- if (ctx->num_to_drop_index) {
- /* Really drop the indexes that were dropped.
- The transaction had to be committed first
- (after renaming the indexes), so that in the
- event of a crash, crash recovery will drop the
- indexes, because it drops all indexes whose
- names start with TEMP_INDEX_PREFIX_STR. Once we
- have started dropping an index tree, there is
- no way to roll it back. */
-
- for (ulint i = 0; i < ctx->num_to_drop_index; i++) {
- dict_index_t* index = ctx->drop_index[i];
- DBUG_ASSERT(index->is_committed());
- DBUG_ASSERT(index->table == ctx->new_table);
- DBUG_ASSERT(index->to_be_dropped);
-
- /* Replace the indexes in foreign key
- constraints if needed. */
-
- if (!dict_foreign_replace_index(
- index->table, ctx->col_names, index)) {
- found = false;
- }
-
- /* Mark the index dropped
- in the data dictionary cache. */
- rw_lock_x_lock(dict_index_get_lock(index));
- index->page = FIL_NULL;
- rw_lock_x_unlock(dict_index_get_lock(index));
- }
-
- trx_start_for_ddl(trx, TRX_DICT_OP_INDEX);
- row_merge_drop_indexes_dict(trx, ctx->new_table->id);
-
- for (ulint i = 0; i < ctx->num_to_drop_index; i++) {
- dict_index_t* index = ctx->drop_index[i];
- DBUG_ASSERT(index->is_committed());
- DBUG_ASSERT(index->table == ctx->new_table);
-
- if (index->type & DICT_FTS) {
- DBUG_ASSERT(index->type == DICT_FTS
- || (index->type
- & DICT_CORRUPT));
- DBUG_ASSERT(index->table->fts);
- DEBUG_SYNC_C("norebuild_fts_drop");
- fts_drop_index(index->table, index, trx);
- }
+ for (ulint i = 0; i < ctx->num_to_drop_index; i++) {
+ dict_index_t* index = ctx->drop_index[i];
+ DBUG_ASSERT(index->is_committed());
+ DBUG_ASSERT(index->table == ctx->new_table);
+ DBUG_ASSERT(index->to_be_dropped);
- dict_index_remove_from_cache(index->table, index);
+ if (!dict_foreign_replace_index(index->table, ctx->col_names,
+ index)) {
+ found = false;
}
- fts_clear_all(ctx->old_table, trx);
- trx_commit_for_mysql(trx);
+ dict_index_remove_from_cache(index->table, index);
}
+ fts_clear_all(ctx->old_table);
+
if (!ctx->is_instant()) {
innobase_rename_or_enlarge_columns_cache(
ha_alter_info, altered_table, table, ctx->new_table);
@@ -10729,8 +10858,6 @@ alter_stats_norebuild(
ha_innobase_inplace_ctx* ctx,
THD* thd)
{
- ulint i;
-
DBUG_ENTER("alter_stats_norebuild");
DBUG_ASSERT(!ctx->need_rebuild());
@@ -10738,98 +10865,7 @@ alter_stats_norebuild(
DBUG_VOID_RETURN;
}
- /* Delete corresponding rows from the stats table. We do this
- in a separate transaction from trx, because lock waits are not
- allowed in a data dictionary transaction. (Lock waits are possible
- on the statistics table, because it is directly accessible by users,
- not covered by the dict_sys.latch.)
-
- Because the data dictionary changes were already committed, orphaned
- rows may be left in the statistics table if the system crashes.
-
- FIXME: each change to the statistics tables is being committed in a
- separate transaction, meaning that the operation is not atomic
-
- FIXME: This will not drop the (unused) statistics for
- FTS_DOC_ID_INDEX if it was a hidden index, dropped together
- with the last renamining FULLTEXT index. */
- for (i = 0; i < ha_alter_info->index_drop_count; i++) {
- const KEY* key = ha_alter_info->index_drop_buffer[i];
-
- if (key->flags & HA_FULLTEXT) {
- /* There are no index cardinality
- statistics for FULLTEXT indexes. */
- continue;
- }
-
- char errstr[1024];
-
- if (dict_stats_drop_index(
- ctx->new_table->name.m_name, key->name.str,
- errstr, sizeof errstr) != DB_SUCCESS) {
- push_warning(thd,
- Sql_condition::WARN_LEVEL_WARN,
- ER_LOCK_WAIT_TIMEOUT, errstr);
- }
- }
-
- for (size_t i = 0; i < ha_alter_info->rename_keys.size(); i++) {
- const Alter_inplace_info::Rename_key_pair& pair
- = ha_alter_info->rename_keys[i];
-
- std::stringstream ss;
- ss << TEMP_FILE_PREFIX_INNODB << std::this_thread::get_id()
- << i;
- auto tmp_name = ss.str();
-
- dberr_t err = dict_stats_rename_index(ctx->new_table,
- pair.old_key->name.str,
- tmp_name.c_str());
-
- if (err != DB_SUCCESS) {
- push_warning_printf(
- thd,
- Sql_condition::WARN_LEVEL_WARN,
- ER_ERROR_ON_RENAME,
- "Error renaming an index of table '%s'"
- " from '%s' to '%s' in InnoDB persistent"
- " statistics storage: %s",
- ctx->new_table->name.m_name,
- pair.old_key->name.str,
- tmp_name.c_str(),
- ut_strerr(err));
- }
- }
-
- for (size_t i = 0; i < ha_alter_info->rename_keys.size(); i++) {
- const Alter_inplace_info::Rename_key_pair& pair
- = ha_alter_info->rename_keys[i];
-
- std::stringstream ss;
- ss << TEMP_FILE_PREFIX_INNODB << std::this_thread::get_id()
- << i;
- auto tmp_name = ss.str();
-
- dberr_t err = dict_stats_rename_index(ctx->new_table,
- tmp_name.c_str(),
- pair.new_key->name.str);
-
- if (err != DB_SUCCESS) {
- push_warning_printf(
- thd,
- Sql_condition::WARN_LEVEL_WARN,
- ER_ERROR_ON_RENAME,
- "Error renaming an index of table '%s'"
- " from '%s' to '%s' in InnoDB persistent"
- " statistics storage: %s",
- ctx->new_table->name.m_name,
- tmp_name.c_str(),
- pair.new_key->name.str,
- ut_strerr(err));
- }
- }
-
- for (i = 0; i < ctx->num_to_add_index; i++) {
+ for (ulint i = 0; i < ctx->num_to_add_index; i++) {
dict_index_t* index = ctx->add_index[i];
DBUG_ASSERT(index->table == ctx->new_table);
@@ -10879,17 +10915,6 @@ alter_stats_rebuild(
DBUG_VOID_RETURN;
}
-#ifndef DBUG_OFF
-# define DBUG_INJECT_CRASH(prefix, count) \
-do { \
- char buf[32]; \
- snprintf(buf, sizeof buf, prefix "_%u", count); \
- DBUG_EXECUTE_IF(buf, DBUG_SUICIDE();); \
-} while (0)
-#else
-# define DBUG_INJECT_CRASH(prefix, count)
-#endif
-
/** Apply the log for the table rebuild operation.
@param[in] ctx Inplace Alter table context
@param[in] altered_table MySQL table that is being altered
@@ -10909,7 +10934,6 @@ static bool alter_rebuild_apply_log(
dropped were not created in the copy of the table. Apply any
last bit of the rebuild log and then rename the tables. */
dict_table_t* user_table = ctx->old_table;
- dict_table_t* rebuilt_table = ctx->new_table;
DEBUG_SYNC_C("row_log_table_apply2_before");
@@ -10938,41 +10962,8 @@ static bool alter_rebuild_apply_log(
ctx->new_table->vc_templ = NULL;
}
- ulint err_key = thr_get_trx(ctx->thr)->error_key_num;
-
- switch (error) {
- KEY* dup_key;
- case DB_SUCCESS:
- break;
- case DB_DUPLICATE_KEY:
- if (err_key == ULINT_UNDEFINED) {
- /* This should be the hidden index on
- FTS_DOC_ID. */
- dup_key = NULL;
- } else {
- DBUG_ASSERT(err_key < ha_alter_info->key_count);
- dup_key = &ha_alter_info->key_info_buffer[err_key];
- }
-
- print_keydup_error(altered_table, dup_key, MYF(0));
- DBUG_RETURN(true);
- case DB_ONLINE_LOG_TOO_BIG:
- my_error(ER_INNODB_ONLINE_LOG_TOO_BIG, MYF(0),
- get_error_key_name(err_key, ha_alter_info,
- rebuilt_table));
- DBUG_RETURN(true);
- case DB_INDEX_CORRUPT:
- my_error(ER_INDEX_CORRUPT, MYF(0),
- get_error_key_name(err_key, ha_alter_info,
- rebuilt_table));
- DBUG_RETURN(true);
- default:
- my_error_innodb(error, ctx->old_table->name.m_name,
- user_table->flags);
- DBUG_RETURN(true);
- }
-
- DBUG_RETURN(false);
+ DBUG_RETURN(ctx->log_failure(
+ ha_alter_info, altered_table, error));
}
/** Commit or rollback the changes made during
@@ -11003,8 +10994,6 @@ ha_innobase::commit_inplace_alter_table(
(ha_alter_info->handler_ctx);
#ifndef DBUG_OFF
- uint crash_inject_count = 1;
- uint crash_fail_inject_count = 1;
uint failure_inject_count = 1;
#endif /* DBUG_OFF */
@@ -11023,12 +11012,10 @@ ha_innobase::commit_inplace_alter_table(
if (!commit) {
/* A rollback is being requested. So far we may at
- most have created some indexes. If any indexes were to
- be dropped, they would actually be dropped in this
- method if commit=true. */
- const bool ret = rollback_inplace_alter_table(
- ha_alter_info, table, m_prebuilt);
- DBUG_RETURN(ret);
+ most have created stubs for ADD INDEX or a copy of the
+ table for rebuild. */
+ DBUG_RETURN(rollback_inplace_alter_table(
+ ha_alter_info, table, m_prebuilt));
}
if (!(ha_alter_info->handler_flags & ~INNOBASE_INPLACE_IGNORE)) {
@@ -11063,13 +11050,15 @@ ha_innobase::commit_inplace_alter_table(
ut_ad(m_prebuilt->table == ctx0->old_table);
ha_alter_info->group_commit_ctx = NULL;
- trx_start_if_not_started_xa(m_prebuilt->trx, true);
-
+ const bool new_clustered = ctx0->need_rebuild();
+ trx_t* const trx = ctx0->trx;
+ trx->op_info = "acquiring table lock";
+ bool fts_exist = false;
for (inplace_alter_handler_ctx** pctx = ctx_array; *pctx; pctx++) {
- ha_innobase_inplace_ctx* ctx
- = static_cast<ha_innobase_inplace_ctx*>(*pctx);
+ auto ctx = static_cast<ha_innobase_inplace_ctx*>(*pctx);
DBUG_ASSERT(ctx->prebuilt->trx == m_prebuilt->trx);
-
+ ut_ad(m_prebuilt != ctx->prebuilt || ctx == ctx0);
+ DBUG_ASSERT(new_clustered == ctx->need_rebuild());
/* If decryption failed for old table or new table
fail here. */
if ((!ctx->old_table->is_readable()
@@ -11082,45 +11071,37 @@ ha_innobase::commit_inplace_alter_table(
my_error(ER_GET_ERRMSG, MYF(0), HA_ERR_DECRYPTION_FAILED, str.c_ptr(), engine);
DBUG_RETURN(true);
}
-
- /* Exclusively lock the table, to ensure that no other
- transaction is holding locks on the table while we
- change the table definition. The MySQL meta-data lock
- should normally guarantee that no conflicting locks
- exist. However, FOREIGN KEY constraints checks and any
- transactions collected during crash recovery could be
- holding InnoDB locks only, not MySQL locks. */
-
- dberr_t error = row_merge_lock_table(
- m_prebuilt->trx, ctx->old_table, LOCK_X);
-
- if (error != DB_SUCCESS) {
- my_error_innodb(
- error, table_share->table_name.str, 0);
- DBUG_RETURN(true);
+ if ((ctx->old_table->flags2 | ctx->new_table->flags2)
+ & (DICT_TF2_FTS_HAS_DOC_ID | DICT_TF2_FTS)) {
+ fts_exist = true;
}
}
- DEBUG_SYNC(m_user_thd, "innodb_alter_commit_after_lock_table");
-
- const bool new_clustered = ctx0->need_rebuild();
- trx_t* trx = ctx0->trx;
- bool fail = false;
+ bool already_stopped= false;
+ for (inplace_alter_handler_ctx** pctx = ctx_array; *pctx; pctx++) {
+ auto ctx = static_cast<ha_innobase_inplace_ctx*>(*pctx);
+ dberr_t error = DB_SUCCESS;
- /* Stop background FTS operations. */
- for (inplace_alter_handler_ctx** pctx = ctx_array;
- *pctx; pctx++) {
- ha_innobase_inplace_ctx* ctx
- = static_cast<ha_innobase_inplace_ctx*>(*pctx);
+ if (fts_exist) {
+ purge_sys.stop_FTS(*ctx->old_table, already_stopped);
+ already_stopped = true;
+ }
- DBUG_ASSERT(new_clustered == ctx->need_rebuild());
+ if (new_clustered && ctx->old_table->fts) {
+ ut_ad(!ctx->old_table->fts->add_wq);
+ fts_optimize_remove_table(ctx->old_table);
+ }
- if (new_clustered) {
- if (ctx->old_table->fts) {
- ut_ad(!ctx->old_table->fts->add_wq);
- fts_optimize_remove_table(ctx->old_table);
+ dict_sys.freeze(SRW_LOCK_CALL);
+ for (auto f : ctx->old_table->referenced_set) {
+ if (dict_table_t* child = f->foreign_table) {
+ error = lock_table_for_trx(child, trx, LOCK_X);
+ if (error != DB_SUCCESS) {
+ break;
+ }
}
}
+ dict_sys.unfreeze();
if (ctx->new_table->fts) {
ut_ad(!ctx->new_table->fts->add_wq);
@@ -11128,93 +11109,265 @@ ha_innobase::commit_inplace_alter_table(
fts_sync_during_ddl(ctx->new_table);
}
- /* Apply the online log of the table before acquiring
- data dictionary latches. Here alter thread already acquired
- MDL_EXCLUSIVE on the table. So there can't be anymore DDLs, DMLs
- for the altered table. By applying the log here, InnoDB
- makes sure that concurrent DDLs, purge thread or any other
- background thread doesn't wait for the dict_operation_lock
- for longer time. */
- if (new_clustered && commit
- && alter_rebuild_apply_log(
- ctx, ha_alter_info, altered_table)) {
+ /* Exclusively lock the table, to ensure that no other
+ transaction is holding locks on the table while we
+ change the table definition. Any recovered incomplete
+ transactions would be holding InnoDB locks only, not MDL. */
+ if (error == DB_SUCCESS) {
+ error = lock_table_for_trx(ctx->new_table, trx,
+ LOCK_X);
+ }
+
+ DBUG_EXECUTE_IF("deadlock_table_fail",
+ {
+ error= DB_DEADLOCK;
+ trx_rollback_for_mysql(trx);
+ });
+
+ if (error != DB_SUCCESS) {
+lock_fail:
+ my_error_innodb(
+ error, table_share->table_name.str, 0);
+ if (fts_exist) {
+ purge_sys.resume_FTS();
+ }
+
+ /* Deadlock encountered and rollbacked the
+ transaction. So restart the transaction
+ to remove the newly created table or
+ index from data dictionary and table cache
+ in rollback_inplace_alter_table() */
+ if (trx->state == TRX_STATE_NOT_STARTED) {
+ trx_start_for_ddl(trx);
+ }
+
DBUG_RETURN(true);
+ } else if ((ctx->new_table->flags2
+ & (DICT_TF2_FTS_HAS_DOC_ID | DICT_TF2_FTS))
+ && (error = fts_lock_tables(trx, *ctx->new_table))
+ != DB_SUCCESS) {
+ goto lock_fail;
+ } else if (!new_clustered) {
+ } else if ((error = lock_table_for_trx(ctx->old_table, trx,
+ LOCK_X))
+ != DB_SUCCESS) {
+ goto lock_fail;
+ } else if ((ctx->old_table->flags2
+ & (DICT_TF2_FTS_HAS_DOC_ID | DICT_TF2_FTS))
+ && (error = fts_lock_tables(trx, *ctx->old_table))
+ != DB_SUCCESS) {
+ goto lock_fail;
}
}
- if (!trx) {
- DBUG_ASSERT(!new_clustered);
- trx = innobase_trx_allocate(m_user_thd);
- }
+ DEBUG_SYNC(m_user_thd, "innodb_alter_commit_after_lock_table");
- trx_start_for_ddl(trx, TRX_DICT_OP_INDEX);
- /* Latch the InnoDB data dictionary exclusively so that no deadlocks
- or lock waits can happen in it during the data dictionary operation. */
- row_mysql_lock_data_dictionary(trx);
+ if (new_clustered) {
+ /* We are holding MDL_EXCLUSIVE as well as exclusive
+ InnoDB table locks. Let us apply any table rebuild log
+ before locking dict_sys. */
+ for (inplace_alter_handler_ctx** pctx= ctx_array; *pctx;
+ pctx++) {
+ auto ctx= static_cast<ha_innobase_inplace_ctx*>(*pctx);
+ DBUG_ASSERT(ctx->need_rebuild());
+ if (alter_rebuild_apply_log(ctx, ha_alter_info,
+ altered_table)) {
+ if (fts_exist) {
+ purge_sys.resume_FTS();
+ }
+ DBUG_RETURN(true);
+ }
+ }
+ } else {
+ dberr_t error= DB_SUCCESS;
+ for (inplace_alter_handler_ctx** pctx= ctx_array; *pctx;
+ pctx++) {
+ auto ctx= static_cast<ha_innobase_inplace_ctx*>(*pctx);
- /* Prevent the background statistics collection from accessing
- the tables. */
- for (;;) {
- bool retry = false;
+ if (!ctx->online || !ctx->old_table->space
+ || !ctx->old_table->is_readable()) {
+ continue;
+ }
- for (inplace_alter_handler_ctx** pctx = ctx_array;
- *pctx; pctx++) {
- ha_innobase_inplace_ctx* ctx
- = static_cast<ha_innobase_inplace_ctx*>(*pctx);
+ for (ulint i = 0; i < ctx->num_to_add_index; i++) {
+ dict_index_t *index= ctx->add_index[i];
- DBUG_ASSERT(new_clustered == ctx->need_rebuild());
+ ut_ad(!(index->type &
+ (DICT_FTS | DICT_SPATIAL)));
- if (new_clustered
- && !dict_stats_stop_bg(ctx->old_table)) {
- retry = true;
- }
+ index->lock.x_lock(SRW_LOCK_CALL);
+ if (!index->online_log) {
+ /* online log would've cleared
+ when we detect the error in
+ other index */
+ index->lock.x_unlock();
+ continue;
+ }
+
+ if (index->is_corrupted()) {
+ /* Online index log has been
+ preserved to show the error
+ when it happened via
+ row_log_apply() by DML thread */
+ error= row_log_get_error(index);
+err_index:
+ ut_ad(error != DB_SUCCESS);
+ ctx->log_failure(
+ ha_alter_info,
+ altered_table, error);
+ row_log_free(index->online_log);
+ index->online_log= nullptr;
+ index->lock.x_unlock();
+
+ ctx->old_table->indexes.start
+ ->online_log= nullptr;
+ if (fts_exist) {
+ purge_sys.resume_FTS();
+ }
+ MONITOR_ATOMIC_INC(
+ MONITOR_BACKGROUND_DROP_INDEX);
+ DBUG_RETURN(true);
+ }
+
+ index->lock.x_unlock();
- if (!dict_stats_stop_bg(ctx->new_table)) {
- retry = true;
+ error = row_log_apply(
+ m_prebuilt->trx, index, altered_table,
+ ctx->m_stage);
+
+ index->lock.x_lock(SRW_LOCK_CALL);
+
+ if (error != DB_SUCCESS) {
+ goto err_index;
+ }
+
+ row_log_free(index->online_log);
+ index->online_log= nullptr;
+ index->lock.x_unlock();
}
+
+ ctx->old_table->indexes.start->online_log= nullptr;
}
+ }
- if (!retry) {
- break;
+ dict_table_t *table_stats = nullptr, *index_stats = nullptr;
+ MDL_ticket *mdl_table = nullptr, *mdl_index = nullptr;
+ dberr_t error = DB_SUCCESS;
+ if (!ctx0->old_table->is_stats_table() &&
+ !ctx0->new_table->is_stats_table()) {
+ table_stats = dict_table_open_on_name(
+ TABLE_STATS_NAME, false, DICT_ERR_IGNORE_NONE);
+ if (table_stats) {
+ dict_sys.freeze(SRW_LOCK_CALL);
+ table_stats = dict_acquire_mdl_shared<false>(
+ table_stats, m_user_thd, &mdl_table);
+ dict_sys.unfreeze();
+ }
+ index_stats = dict_table_open_on_name(
+ INDEX_STATS_NAME, false, DICT_ERR_IGNORE_NONE);
+ if (index_stats) {
+ dict_sys.freeze(SRW_LOCK_CALL);
+ index_stats = dict_acquire_mdl_shared<false>(
+ index_stats, m_user_thd, &mdl_index);
+ dict_sys.unfreeze();
}
- DICT_BG_YIELD(trx);
+ if (table_stats && index_stats
+ && !strcmp(table_stats->name.m_name, TABLE_STATS_NAME)
+ && !strcmp(index_stats->name.m_name, INDEX_STATS_NAME)
+ && !(error = lock_table_for_trx(table_stats,
+ trx, LOCK_X))) {
+ error = lock_table_for_trx(index_stats, trx, LOCK_X);
+ }
}
+ DBUG_EXECUTE_IF("stats_lock_fail",
+ error = DB_LOCK_WAIT_TIMEOUT;
+ trx_rollback_for_mysql(trx););
+
+ if (error == DB_SUCCESS) {
+ error = lock_sys_tables(trx);
+ }
+ if (error != DB_SUCCESS) {
+ if (table_stats) {
+ dict_table_close(table_stats, false, m_user_thd,
+ mdl_table);
+ }
+ if (index_stats) {
+ dict_table_close(index_stats, false, m_user_thd,
+ mdl_index);
+ }
+ my_error_innodb(error, table_share->table_name.str, 0);
+ if (fts_exist) {
+ purge_sys.resume_FTS();
+ }
+
+ if (trx->state == TRX_STATE_NOT_STARTED) {
+ /* Transaction may have been rolled back
+ due to a lock wait timeout, deadlock,
+ or a KILL statement. So restart the
+ transaction to remove the newly created
+ table or index stubs from data dictionary
+ and table cache in
+ rollback_inplace_alter_table() */
+ trx_start_for_ddl(trx);
+ }
+
+ DBUG_RETURN(true);
+ }
+
+ row_mysql_lock_data_dictionary(trx);
+
/* Apply the changes to the data dictionary tables, for all
partitions. */
-
- for (inplace_alter_handler_ctx** pctx = ctx_array;
- *pctx && !fail; pctx++) {
- ha_innobase_inplace_ctx* ctx
- = static_cast<ha_innobase_inplace_ctx*>(*pctx);
+ for (inplace_alter_handler_ctx** pctx = ctx_array; *pctx; pctx++) {
+ auto ctx = static_cast<ha_innobase_inplace_ctx*>(*pctx);
DBUG_ASSERT(new_clustered == ctx->need_rebuild());
if (ctx->need_rebuild() && !ctx->old_table->space) {
my_error(ER_TABLESPACE_DISCARDED, MYF(0),
table->s->table_name.str);
- fail = true;
- } else {
- fail = commit_set_autoinc(ha_alter_info, ctx,
- altered_table, table);
+fail:
+ trx->rollback();
+ ut_ad(!trx->fts_trx);
+ if (table_stats) {
+ dict_table_close(table_stats, true, m_user_thd,
+ mdl_table);
+ }
+ if (index_stats) {
+ dict_table_close(index_stats, true, m_user_thd,
+ mdl_index);
+ }
+ row_mysql_unlock_data_dictionary(trx);
+ if (fts_exist) {
+ purge_sys.resume_FTS();
+ }
+ trx_start_for_ddl(trx);
+ DBUG_RETURN(true);
+ }
+
+ if (commit_set_autoinc(ha_alter_info, ctx,
+ altered_table, table)) {
+ goto fail;
}
- if (fail) {
- } else if (ctx->need_rebuild()) {
+ if (ctx->need_rebuild()) {
ctx->tmp_name = dict_mem_create_temporary_tablename(
ctx->heap, ctx->new_table->name.m_name,
ctx->new_table->id);
- fail = commit_try_rebuild(
- ha_alter_info, ctx, altered_table, table,
- trx, table_share->table_name.str);
- } else {
- fail = commit_try_norebuild(
- ha_alter_info, ctx, altered_table, table, trx,
- table_share->table_name.str);
+ if (commit_try_rebuild(ha_alter_info, ctx,
+ altered_table, table,
+ table_stats && index_stats,
+ trx,
+ table_share->table_name.str)) {
+ goto fail;
+ }
+ } else if (commit_try_norebuild(ha_alter_info, ctx,
+ altered_table, table, trx,
+ table_share->table_name.str)) {
+ goto fail;
}
- DBUG_INJECT_CRASH("ib_commit_inplace_crash",
- crash_inject_count++);
#ifndef DBUG_OFF
{
/* Generate a dynamic dbug text. */
@@ -11227,58 +11380,72 @@ ha_innobase::commit_inplace_alter_table(
DBUG_EXECUTE_IF(buf,
my_error(ER_INTERNAL_ERROR, MYF(0),
"Injected error!");
- fail = true;
+ goto fail;
);
}
#endif
}
+ if (table_stats) {
+ dict_table_close(table_stats, true, m_user_thd, mdl_table);
+ }
+ if (index_stats) {
+ dict_table_close(index_stats, true, m_user_thd, mdl_index);
+ }
+
/* Commit or roll back the changes to the data dictionary. */
DEBUG_SYNC(m_user_thd, "innodb_alter_inplace_before_commit");
- if (fail) {
- trx_rollback_for_mysql(trx);
- for (inplace_alter_handler_ctx** pctx = ctx_array;
- *pctx; pctx++) {
- ha_innobase_inplace_ctx* ctx
- = static_cast<ha_innobase_inplace_ctx*>(*pctx);
- ctx->rollback_instant();
+ if (new_clustered) {
+ ut_ad(trx->has_logged());
+ for (inplace_alter_handler_ctx** pctx = ctx_array; *pctx;
+ pctx++) {
+ auto ctx= static_cast<ha_innobase_inplace_ctx*>(*pctx);
+ ut_ad(!strcmp(ctx->old_table->name.m_name,
+ ctx->tmp_name));
+ ut_ad(ctx->new_table->get_ref_count() == 1);
+ const bool own = m_prebuilt == ctx->prebuilt;
+ trx_t* const user_trx = m_prebuilt->trx;
+ ctx->prebuilt->table->release();
+ ctx->prebuilt->table = nullptr;
+ row_prebuilt_free(ctx->prebuilt);
+ /* Rebuild the prebuilt object. */
+ ctx->prebuilt = row_create_prebuilt(
+ ctx->new_table, altered_table->s->reclength);
+ if (own) {
+ m_prebuilt = ctx->prebuilt;
+ }
+ trx_start_if_not_started(user_trx, true);
+ m_prebuilt->trx = user_trx;
}
- } else if (!new_clustered) {
- trx_commit_for_mysql(trx);
- } else {
- /* Test what happens on crash if the redo logs
- are flushed to disk here. The log records
- about the rename should not be committed, and
- the data dictionary transaction should be
- rolled back, restoring the old table. */
- DBUG_EXECUTE_IF("innodb_alter_commit_crash_before_commit",
- log_buffer_flush_to_disk();
- DBUG_SUICIDE(););
- ut_ad(!trx->fts_trx);
+ }
- if (fail) {
- trx_rollback_for_mysql(trx);
- } else {
- ut_ad(trx_state_eq(trx, TRX_STATE_ACTIVE));
- ut_ad(trx->has_logged());
- trx->commit();
- }
+ ut_ad(!trx->fts_trx);
- /* If server crashes here, the dictionary in
- InnoDB and MySQL will differ. The .ibd files
- and the .frm files must be swapped manually by
- the administrator. No loss of data. */
- DBUG_EXECUTE_IF("innodb_alter_commit_crash_after_commit",
- log_buffer_flush_to_disk();
- DBUG_SUICIDE(););
- }
+ std::vector<pfs_os_file_t> deleted;
+ DBUG_EXECUTE_IF("innodb_alter_commit_crash_before_commit",
+ log_buffer_flush_to_disk(); DBUG_SUICIDE(););
+ /* The SQL layer recovery of ALTER TABLE will invoke
+ innodb_check_version() to know whether our trx->id, which we
+ reported via ha_innobase::table_version() after
+ ha_innobase::prepare_inplace_alter_table(), was committed.
- /* Flush the log to reduce probability that the .frm files and
- the InnoDB data dictionary get out-of-sync if the user runs
- with innodb_flush_log_at_trx_commit = 0 */
+ If this trx was committed (the log write below completed),
+ we will be able to recover our trx->id to
+ dict_table_t::def_trx_id from the data dictionary tables.
- log_buffer_flush_to_disk();
+ For this logic to work, purge_sys.stop_SYS() and
+ purge_sys.resume_SYS() will ensure that the DB_TRX_ID that we
+ wrote to the SYS_ tables will be preserved until the SQL layer
+ has durably marked the ALTER TABLE operation as completed.
+
+ During recovery, the purge of InnoDB transaction history will
+ not start until innodb_ddl_recovery_done(). */
+#if 0
+ ha_alter_info->inplace_alter_table_committed = purge_sys.resume_SYS;
+ purge_sys.stop_SYS();
+#endif
+ trx->commit(deleted);
/* At this point, the changes to the persistent storage have
been committed or rolled back. What remains to be done is to
@@ -11292,50 +11459,10 @@ ha_innobase::commit_inplace_alter_table(
DBUG_ASSERT(ctx->need_rebuild() == new_clustered);
- if (new_clustered) {
- innobase_online_rebuild_log_free(ctx->old_table);
- }
-
- if (fail) {
- if (new_clustered) {
- trx_start_for_ddl(trx, TRX_DICT_OP_TABLE);
-
- dict_table_close_and_drop(trx, ctx->new_table);
-
- trx_commit_for_mysql(trx);
- ctx->new_table = NULL;
- } else {
- /* We failed, but did not rebuild the table.
- Roll back any ADD INDEX, or get rid of garbage
- ADD INDEX that was left over from a previous
- ALTER TABLE statement. */
- trx_start_for_ddl(trx, TRX_DICT_OP_INDEX);
- innobase_rollback_sec_index(
- ctx->new_table, table, TRUE, trx);
- trx_commit_for_mysql(trx);
- }
- DBUG_INJECT_CRASH("ib_commit_inplace_crash_fail",
- crash_fail_inject_count++);
-
- continue;
- }
-
innobase_copy_frm_flags_from_table_share(
ctx->new_table, altered_table->s);
if (new_clustered) {
- /* We will reload and refresh the
- in-memory foreign key constraint
- metadata. This is a rename operation
- in preparing for dropping the old
- table. Set the table to_be_dropped bit
- here, so to make sure DML foreign key
- constraint check does not use the
- stale dict_foreign_t. This is done
- because WL#6049 (FK MDL) has not been
- implemented yet. */
- ctx->old_table->to_be_dropped = true;
-
DBUG_PRINT("to_be_dropped",
("table: %s", ctx->old_table->name.m_name));
@@ -11367,42 +11494,10 @@ foreign_fail:
dict_mem_table_free_foreign_vcol_set(ctx->new_table);
dict_mem_table_fill_foreign_vcol_set(ctx->new_table);
-
- DBUG_INJECT_CRASH("ib_commit_inplace_crash",
- crash_inject_count++);
}
- if (fail) {
- for (inplace_alter_handler_ctx** pctx = ctx_array;
- *pctx; pctx++) {
- ha_innobase_inplace_ctx* ctx
- = static_cast<ha_innobase_inplace_ctx*>
- (*pctx);
- DBUG_ASSERT(ctx->need_rebuild() == new_clustered);
-
- ut_d(dict_table_check_for_dup_indexes(
- ctx->old_table,
- CHECK_ABORTED_OK));
- ut_a(fts_check_cached_index(ctx->old_table));
- DBUG_INJECT_CRASH("ib_commit_inplace_crash_fail",
- crash_fail_inject_count++);
-
- /* Restart the FTS background operations. */
- if (ctx->old_table->fts) {
- fts_optimize_add_table(ctx->old_table);
- }
- }
-
- row_mysql_unlock_data_dictionary(trx);
- if (trx != ctx0->trx) {
- trx->free();
- }
- DBUG_RETURN(true);
- }
-
- if (trx == ctx0->trx) {
- ctx0->trx = NULL;
- }
+ ut_ad(trx == ctx0->trx);
+ ctx0->trx = nullptr;
/* Free the ctx->trx of other partitions, if any. We will only
use the ctx0->trx here. Others may have been allocated in
@@ -11414,6 +11509,7 @@ foreign_fail:
= static_cast<ha_innobase_inplace_ctx*>(*pctx);
if (ctx->trx) {
+ ctx->trx->rollback();
ctx->trx->free();
ctx->trx = NULL;
}
@@ -11428,10 +11524,10 @@ foreign_fail:
|| ha_alter_info->alter_info->create_list.elements))
|| (ctx0->is_instant()
&& m_prebuilt->table->n_v_cols
- && ha_alter_info->handler_flags & ALTER_STORED_COLUMN_ORDER)) {
+ && ha_alter_info->handler_flags & ALTER_STORED_COLUMN_ORDER)
+ || !ctx0->change_col_collate.empty()) {
DBUG_ASSERT(ctx0->old_table->get_ref_count() == 1);
ut_ad(ctx0->prebuilt == m_prebuilt);
- trx_commit_for_mysql(m_prebuilt->trx);
for (inplace_alter_handler_ctx** pctx = ctx_array; *pctx;
pctx++) {
@@ -11443,19 +11539,20 @@ foreign_fail:
ctx->prebuilt->table, altered_table->s);
}
- row_mysql_unlock_data_dictionary(trx);
+ unlock_and_close_files(deleted, trx);
+ log_write_up_to(trx->commit_lsn, true);
+ DBUG_EXECUTE_IF("innodb_alter_commit_crash_after_commit",
+ DBUG_SUICIDE(););
trx->free();
+ if (fts_exist) {
+ purge_sys.resume_FTS();
+ }
MONITOR_ATOMIC_DEC(MONITOR_PENDING_ALTER_TABLE);
/* There is no need to reset dict_table_t::persistent_autoinc
as the table is reloaded */
DBUG_RETURN(false);
}
- /* Release the table locks. */
- trx_commit_for_mysql(m_prebuilt->trx);
-
- DBUG_EXECUTE_IF("ib_ddl_crash_after_user_trx_commit", DBUG_SUICIDE(););
-
for (inplace_alter_handler_ctx** pctx = ctx_array;
*pctx; pctx++) {
ha_innobase_inplace_ctx* ctx
@@ -11491,69 +11588,18 @@ foreign_fail:
ut_d(dict_table_check_for_dup_indexes(
ctx->new_table, CHECK_ABORTED_OK));
-#ifdef UNIV_DEBUG
- if (!(ctx->new_table->fts != NULL
- && ctx->new_table->fts->cache->sync->in_progress)) {
- ut_a(fts_check_cached_index(ctx->new_table));
- }
-#endif
- if (new_clustered) {
- /* Since the table has been rebuilt, we remove
- all persistent statistics corresponding to the
- old copy of the table (which was renamed to
- ctx->tmp_name). */
-
- DBUG_ASSERT(0 == strcmp(ctx->old_table->name.m_name,
- ctx->tmp_name));
-
- dict_stats_try_drop_table(m_user_thd,
- ctx->new_table->name,
- table->s->table_name);
-
- DBUG_EXECUTE_IF("ib_ddl_crash_before_commit",
- DBUG_SUICIDE(););
-
- ut_ad(m_prebuilt != ctx->prebuilt
- || ctx == ctx0);
- bool update_own_prebuilt =
- (m_prebuilt == ctx->prebuilt);
- trx_t* const user_trx = m_prebuilt->trx;
-
- row_prebuilt_free(ctx->prebuilt, TRUE);
-
- /* Drop the copy of the old table, which was
- renamed to ctx->tmp_name at the atomic DDL
- transaction commit. If the system crashes
- before this is completed, some orphan tables
- with ctx->tmp_name may be recovered. */
- trx_start_for_ddl(trx, TRX_DICT_OP_TABLE);
- dberr_t error = row_merge_drop_table(trx, ctx->old_table);
-
- if (UNIV_UNLIKELY(error != DB_SUCCESS)) {
- ib::error() << "Inplace alter table " << ctx->old_table->name
- << " dropping copy of the old table failed error "
- << error
- << ". tmp_name " << (ctx->tmp_name ? ctx->tmp_name : "N/A")
- << " new_table " << ctx->new_table->name;
- }
-
- trx_commit_for_mysql(trx);
-
- /* Rebuild the prebuilt object. */
- ctx->prebuilt = row_create_prebuilt(
- ctx->new_table, altered_table->s->reclength);
- if (update_own_prebuilt) {
- m_prebuilt = ctx->prebuilt;
- }
- trx_start_if_not_started(user_trx, true);
- m_prebuilt->trx = user_trx;
- }
- DBUG_INJECT_CRASH("ib_commit_inplace_crash",
- crash_inject_count++);
+ ut_ad(!ctx->new_table->fts
+ || fts_check_cached_index(ctx->new_table));
}
- row_mysql_unlock_data_dictionary(trx);
+ unlock_and_close_files(deleted, trx);
+ log_write_up_to(trx->commit_lsn, true);
+ DBUG_EXECUTE_IF("innodb_alter_commit_crash_after_commit",
+ DBUG_SUICIDE(););
trx->free();
+ if (fts_exist) {
+ purge_sys.resume_FTS();
+ }
/* TODO: The following code could be executed
while allowing concurrent access to the table
@@ -11570,8 +11616,6 @@ foreign_fail:
alter_stats_rebuild(
ctx->new_table, table->s->table_name.str,
m_user_thd);
- DBUG_INJECT_CRASH("ib_commit_inplace_crash",
- crash_inject_count++);
}
} else {
for (inplace_alter_handler_ctx** pctx = ctx_array;
@@ -11582,8 +11626,6 @@ foreign_fail:
DBUG_ASSERT(!ctx->need_rebuild());
alter_stats_norebuild(ha_alter_info, ctx, m_user_thd);
- DBUG_INJECT_CRASH("ib_commit_inplace_crash",
- crash_inject_count++);
}
}
diff --git a/storage/innobase/handler/i_s.cc b/storage/innobase/handler/i_s.cc
index 3487f4ffc69..e393bed6d9f 100644
--- a/storage/innobase/handler/i_s.cc
+++ b/storage/innobase/handler/i_s.cc
@@ -22,7 +22,6 @@ this program; if not, write to the Free Software Foundation, Inc.,
InnoDB INFORMATION SCHEMA tables interface to MySQL.
Created July 18, 2007 Vasil Dimov
-Modified Dec 29, 2014 Jan Lindström (Added sys_semaphore_waits)
*******************************************************/
#include "univ.i"
@@ -47,22 +46,21 @@ Modified Dec 29, 2014 Jan Lindström (Added sys_semaphore_waits)
#include "trx0i_s.h"
#include "trx0trx.h"
#include "srv0mon.h"
-#include "fut0fut.h"
#include "pars0pars.h"
#include "fts0types.h"
#include "fts0opt.h"
#include "fts0priv.h"
#include "btr0btr.h"
#include "page0zip.h"
-#include "sync0arr.h"
#include "fil0fil.h"
#include "fil0crypt.h"
#include "dict0crea.h"
#include "fts0vlc.h"
+#include "scope.h"
#include "log.h"
/** The latest successfully looked up innodb_fts_aux_table */
-UNIV_INTERN table_id_t innodb_ft_aux_table_id;
+table_id_t innodb_ft_aux_table_id;
/** structure associates a name string with a file page type and/or buffer
page state. */
@@ -118,10 +116,8 @@ struct buf_page_info_t{
ulint block_id; /*!< Buffer Pool block ID */
/** page identifier */
page_id_t id;
- unsigned access_time:32; /*!< Time of first access */
- unsigned io_fix:2; /*!< type of pending I/O operation */
- uint32_t fix_count; /*!< Count of how manyfold this block
- is bufferfixed */
+ uint32_t access_time; /*!< Time of first access */
+ uint32_t state; /*!< buf_page_t::state() */
#ifdef BTR_CUR_HASH_ADAPT
unsigned hashed:1; /*!< Whether hash index has been
built on this page */
@@ -132,7 +128,7 @@ struct buf_page_info_t{
buf_pool.freed_page_clock */
unsigned zip_ssize:PAGE_ZIP_SSIZE_BITS;
/*!< Compressed page size */
- unsigned page_state:3; /*!< Page state */
+ unsigned compressed_only:1; /*!< ROW_FORMAT=COMPRESSED only */
unsigned page_type:I_S_PAGE_TYPE_BITS; /*!< Page type */
unsigned num_recs:UNIV_PAGE_SIZE_SHIFT_MAX-2;
/*!< Number of records on Page */
@@ -173,20 +169,6 @@ time_t MYSQL_TYPE_DATETIME
---------------------------------
*/
-/** Implemented on sync0arr.cc */
-/*******************************************************************//**
-Function to populate INFORMATION_SCHEMA.INNODB_SYS_SEMAPHORE_WAITS table.
-Loop through each item on sync array, and extract the column
-information and fill the INFORMATION_SCHEMA.INNODB_SYS_SEMAPHORE_WAITS table.
-@return 0 on success */
-UNIV_INTERN
-int
-sync_arr_fill_sys_semphore_waits_table(
-/*===================================*/
- THD* thd, /*!< in: thread */
- TABLE_LIST* tables, /*!< in/out: tables to fill */
- Item* ); /*!< in: condition (not used) */
-
/**
Common function to fill any of the dynamic tables:
INFORMATION_SCHEMA.innodb_trx
@@ -265,6 +247,7 @@ field_store_time_t(
/*******************************************************************//**
Auxiliary function to store char* value in MYSQL_TYPE_STRING field.
@return 0 on success */
+static
int
field_store_string(
/*===============*/
@@ -306,7 +289,7 @@ static ST_FIELD_INFO innodb_trx_fields_info[]=
Column("trx_id", ULonglong(), NOT_NULL),
#define IDX_TRX_STATE 1
- Column("trx_state", Varchar(TRX_QUE_STATE_STR_MAX_LEN + 1), NOT_NULL),
+ Column("trx_state", Varchar(13), NOT_NULL),
#define IDX_TRX_STARTED 2
Column("trx_started", Datetime(0), NOT_NULL),
@@ -551,7 +534,7 @@ static struct st_mysql_information_schema i_s_info =
MYSQL_INFORMATION_SCHEMA_INTERFACE_VERSION
};
-UNIV_INTERN struct st_maria_plugin i_s_innodb_trx =
+struct st_maria_plugin i_s_innodb_trx =
{
/* the plugin type (a MYSQL_XXX_PLUGIN value) */
/* int */
@@ -775,7 +758,7 @@ innodb_locks_init(
DBUG_RETURN(0);
}
-UNIV_INTERN struct st_maria_plugin i_s_innodb_locks =
+struct st_maria_plugin i_s_innodb_locks =
{
/* the plugin type (a MYSQL_XXX_PLUGIN value) */
/* int */
@@ -938,7 +921,7 @@ innodb_lock_waits_init(
DBUG_RETURN(0);
}
-UNIV_INTERN struct st_maria_plugin i_s_innodb_lock_waits =
+struct st_maria_plugin i_s_innodb_lock_waits =
{
/* the plugin type (a MYSQL_XXX_PLUGIN value) */
/* int */
@@ -1127,7 +1110,7 @@ i_s_cmp_reset_init(
DBUG_RETURN(0);
}
-UNIV_INTERN struct st_maria_plugin i_s_innodb_cmp =
+struct st_maria_plugin i_s_innodb_cmp =
{
/* the plugin type (a MYSQL_XXX_PLUGIN value) */
/* int */
@@ -1176,7 +1159,7 @@ UNIV_INTERN struct st_maria_plugin i_s_innodb_cmp =
MariaDB_PLUGIN_MATURITY_STABLE,
};
-UNIV_INTERN struct st_maria_plugin i_s_innodb_cmp_reset =
+struct st_maria_plugin i_s_innodb_cmp_reset =
{
/* the plugin type (a MYSQL_XXX_PLUGIN value) */
/* int */
@@ -1291,12 +1274,12 @@ i_s_cmp_per_index_fill_low(
RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name.str);
/* Create a snapshot of the stats so we do not bump into lock
- order violations with dict_sys.mutex below. */
- mutex_enter(&page_zip_stat_per_index_mutex);
+ order violations with dict_sys.latch below. */
+ mysql_mutex_lock(&page_zip_stat_per_index_mutex);
page_zip_stat_per_index_t snap (page_zip_stat_per_index);
- mutex_exit(&page_zip_stat_per_index_mutex);
+ mysql_mutex_unlock(&page_zip_stat_per_index_mutex);
- mutex_enter(&dict_sys.mutex);
+ dict_sys.freeze(SRW_LOCK_CALL);
page_zip_stat_per_index_t::iterator iter;
ulint i;
@@ -1348,18 +1331,18 @@ i_s_cmp_per_index_fill_low(
status = 1;
break;
}
- /* Release and reacquire the dict mutex to allow other
+ /* Release and reacquire the dict_sys.latch to allow other
threads to proceed. This could eventually result in the
contents of INFORMATION_SCHEMA.innodb_cmp_per_index being
inconsistent, but it is an acceptable compromise. */
if (i == 1000) {
- mutex_exit(&dict_sys.mutex);
+ dict_sys.unfreeze();
i = 0;
- mutex_enter(&dict_sys.mutex);
+ dict_sys.freeze(SRW_LOCK_CALL);
}
}
- mutex_exit(&dict_sys.mutex);
+ dict_sys.unfreeze();
if (reset) {
page_zip_reset_stat_per_index();
@@ -1432,7 +1415,7 @@ i_s_cmp_per_index_reset_init(
DBUG_RETURN(0);
}
-UNIV_INTERN struct st_maria_plugin i_s_innodb_cmp_per_index =
+struct st_maria_plugin i_s_innodb_cmp_per_index =
{
/* the plugin type (a MYSQL_XXX_PLUGIN value) */
/* int */
@@ -1481,7 +1464,7 @@ UNIV_INTERN struct st_maria_plugin i_s_innodb_cmp_per_index =
MariaDB_PLUGIN_MATURITY_STABLE,
};
-UNIV_INTERN struct st_maria_plugin i_s_innodb_cmp_per_index_reset =
+struct st_maria_plugin i_s_innodb_cmp_per_index_reset =
{
/* the plugin type (a MYSQL_XXX_PLUGIN value) */
/* int */
@@ -1677,7 +1660,7 @@ i_s_cmpmem_reset_init(
DBUG_RETURN(0);
}
-UNIV_INTERN struct st_maria_plugin i_s_innodb_cmpmem =
+struct st_maria_plugin i_s_innodb_cmpmem =
{
/* the plugin type (a MYSQL_XXX_PLUGIN value) */
/* int */
@@ -1726,7 +1709,7 @@ UNIV_INTERN struct st_maria_plugin i_s_innodb_cmpmem =
MariaDB_PLUGIN_MATURITY_STABLE,
};
-UNIV_INTERN struct st_maria_plugin i_s_innodb_cmpmem_reset =
+struct st_maria_plugin i_s_innodb_cmpmem_reset =
{
/* the plugin type (a MYSQL_XXX_PLUGIN value) */
/* int */
@@ -2159,7 +2142,7 @@ innodb_metrics_init(
DBUG_RETURN(0);
}
-UNIV_INTERN struct st_maria_plugin i_s_innodb_metrics =
+struct st_maria_plugin i_s_innodb_metrics =
{
/* the plugin type (a MYSQL_XXX_PLUGIN value) */
/* int */
@@ -2268,7 +2251,7 @@ i_s_stopword_init(
DBUG_RETURN(0);
}
-UNIV_INTERN struct st_maria_plugin i_s_innodb_ft_default_stopword =
+struct st_maria_plugin i_s_innodb_ft_default_stopword =
{
/* the plugin type (a MYSQL_XXX_PLUGIN value) */
/* int */
@@ -2356,20 +2339,16 @@ i_s_fts_deleted_generic_fill(
RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name.str);
- /* Prevent DROP of the internal tables for fulltext indexes.
- FIXME: acquire DDL-blocking MDL on the user table name! */
- rw_lock_s_lock(&dict_sys.latch);
-
+ MDL_ticket* mdl_ticket = nullptr;
user_table = dict_table_open_on_id(
- innodb_ft_aux_table_id, FALSE, DICT_TABLE_OP_NORMAL);
+ innodb_ft_aux_table_id, false, DICT_TABLE_OP_NORMAL,
+ thd, &mdl_ticket);
if (!user_table) {
- rw_lock_s_unlock(&dict_sys.latch);
DBUG_RETURN(0);
} else if (!dict_table_has_fts_index(user_table)
|| !user_table->is_readable()) {
- dict_table_close(user_table, FALSE, FALSE);
- rw_lock_s_unlock(&dict_sys.latch);
+ dict_table_close(user_table, false, thd, mdl_ticket);
DBUG_RETURN(0);
}
@@ -2384,9 +2363,7 @@ i_s_fts_deleted_generic_fill(
fts_table_fetch_doc_ids(trx, &fts_table, deleted);
- dict_table_close(user_table, FALSE, FALSE);
-
- rw_lock_s_unlock(&dict_sys.latch);
+ dict_table_close(user_table, false, thd, mdl_ticket);
trx->free();
@@ -2443,7 +2420,7 @@ i_s_fts_deleted_init(
DBUG_RETURN(0);
}
-UNIV_INTERN struct st_maria_plugin i_s_innodb_ft_deleted =
+struct st_maria_plugin i_s_innodb_ft_deleted =
{
/* the plugin type (a MYSQL_XXX_PLUGIN value) */
/* int */
@@ -2526,7 +2503,7 @@ i_s_fts_being_deleted_init(
DBUG_RETURN(0);
}
-UNIV_INTERN struct st_maria_plugin i_s_innodb_ft_being_deleted =
+struct st_maria_plugin i_s_innodb_ft_being_deleted =
{
/* the plugin type (a MYSQL_XXX_PLUGIN value) */
/* int */
@@ -2731,22 +2708,18 @@ i_s_fts_index_cache_fill(
RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name.str);
- /* Prevent DROP of the internal tables for fulltext indexes.
- FIXME: acquire DDL-blocking MDL on the user table name! */
- rw_lock_s_lock(&dict_sys.latch);
-
+ MDL_ticket* mdl_ticket = nullptr;
user_table = dict_table_open_on_id(
- innodb_ft_aux_table_id, FALSE, DICT_TABLE_OP_NORMAL);
+ innodb_ft_aux_table_id, false, DICT_TABLE_OP_NORMAL,
+ thd, &mdl_ticket);
if (!user_table) {
-no_fts:
- rw_lock_s_unlock(&dict_sys.latch);
DBUG_RETURN(0);
}
if (!user_table->fts || !user_table->fts->cache) {
- dict_table_close(user_table, FALSE, FALSE);
- goto no_fts;
+ dict_table_close(user_table, false, thd, mdl_ticket);
+ DBUG_RETURN(0);
}
cache = user_table->fts->cache;
@@ -2757,7 +2730,7 @@ no_fts:
conv_str.f_len = sizeof word;
conv_str.f_str = word;
- rw_lock_s_lock(&cache->lock);
+ mysql_mutex_lock(&cache->lock);
for (ulint i = 0; i < ib_vector_size(cache->indexes); i++) {
fts_index_cache_t* index_cache;
@@ -2769,9 +2742,8 @@ no_fts:
index_cache, thd, &conv_str, tables));
}
- rw_lock_s_unlock(&cache->lock);
- dict_table_close(user_table, FALSE, FALSE);
- rw_lock_s_unlock(&dict_sys.latch);
+ mysql_mutex_unlock(&cache->lock);
+ dict_table_close(user_table, false, thd, mdl_ticket);
DBUG_RETURN(ret);
}
@@ -2794,7 +2766,7 @@ i_s_fts_index_cache_init(
DBUG_RETURN(0);
}
-UNIV_INTERN struct st_maria_plugin i_s_innodb_ft_index_cache =
+struct st_maria_plugin i_s_innodb_ft_index_cache =
{
/* the plugin type (a MYSQL_XXX_PLUGIN value) */
/* int */
@@ -2928,9 +2900,7 @@ i_s_fts_index_table_fill_selected(
}
}
- mutex_enter(&dict_sys.mutex);
que_graph_free(graph);
- mutex_exit(&dict_sys.mutex);
trx->free();
@@ -3178,15 +3148,12 @@ i_s_fts_index_table_fill(
RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name.str);
- /* Prevent DROP of the internal tables for fulltext indexes.
- FIXME: acquire DDL-blocking MDL on the user table name! */
- rw_lock_s_lock(&dict_sys.latch);
-
+ MDL_ticket* mdl_ticket = nullptr;
user_table = dict_table_open_on_id(
- innodb_ft_aux_table_id, FALSE, DICT_TABLE_OP_NORMAL);
+ innodb_ft_aux_table_id, false, DICT_TABLE_OP_NORMAL,
+ thd, &mdl_ticket);
if (!user_table) {
- rw_lock_s_unlock(&dict_sys.latch);
DBUG_RETURN(0);
}
@@ -3204,9 +3171,7 @@ i_s_fts_index_table_fill(
}
}
- dict_table_close(user_table, FALSE, FALSE);
-
- rw_lock_s_unlock(&dict_sys.latch);
+ dict_table_close(user_table, false, thd, mdl_ticket);
ut_free(conv_str.f_str);
@@ -3231,7 +3196,7 @@ i_s_fts_index_table_init(
DBUG_RETURN(0);
}
-UNIV_INTERN struct st_maria_plugin i_s_innodb_ft_index_table =
+struct st_maria_plugin i_s_innodb_ft_index_table =
{
/* the plugin type (a MYSQL_XXX_PLUGIN value) */
/* int */
@@ -3332,22 +3297,18 @@ i_s_fts_config_fill(
RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name.str);
- /* Prevent DROP of the internal tables for fulltext indexes.
- FIXME: acquire DDL-blocking MDL on the user table name! */
- rw_lock_s_lock(&dict_sys.latch);
-
+ MDL_ticket* mdl_ticket = nullptr;
user_table = dict_table_open_on_id(
- innodb_ft_aux_table_id, FALSE, DICT_TABLE_OP_NORMAL);
+ innodb_ft_aux_table_id, false, DICT_TABLE_OP_NORMAL,
+ thd, &mdl_ticket);
if (!user_table) {
-no_fts:
- rw_lock_s_unlock(&dict_sys.latch);
DBUG_RETURN(0);
}
if (!dict_table_has_fts_index(user_table)) {
- dict_table_close(user_table, FALSE, FALSE);
- goto no_fts;
+ dict_table_close(user_table, false, thd, mdl_ticket);
+ DBUG_RETURN(0);
}
fields = table->field;
@@ -3403,9 +3364,7 @@ no_fts:
fts_sql_commit(trx);
- dict_table_close(user_table, FALSE, FALSE);
-
- rw_lock_s_unlock(&dict_sys.latch);
+ dict_table_close(user_table, false, thd, mdl_ticket);
trx->free();
@@ -3430,7 +3389,7 @@ i_s_fts_config_init(
DBUG_RETURN(0);
}
-UNIV_INTERN struct st_maria_plugin i_s_innodb_ft_config =
+struct st_maria_plugin i_s_innodb_ft_config =
{
/* the plugin type (a MYSQL_XXX_PLUGIN value) */
/* int */
@@ -3730,7 +3689,7 @@ i_s_innodb_buffer_pool_stats_init(
DBUG_RETURN(0);
}
-UNIV_INTERN struct st_maria_plugin i_s_innodb_buffer_stats =
+struct st_maria_plugin i_s_innodb_buffer_stats =
{
/* the plugin type (a MYSQL_XXX_PLUGIN value) */
/* int */
@@ -3794,12 +3753,11 @@ static const LEX_CSTRING io_values[] =
{
{ STRING_WITH_LEN("IO_NONE") },
{ STRING_WITH_LEN("IO_READ") },
- { STRING_WITH_LEN("IO_WRITE") },
- { STRING_WITH_LEN("IO_PIN") }
+ { STRING_WITH_LEN("IO_WRITE") }
};
-static TypelibBuffer<4> io_values_typelib(io_values);
+static TypelibBuffer<3> io_values_typelib(io_values);
namespace Show {
/* Fields of the dynamic table INNODB_BUFFER_POOL_PAGE. */
@@ -3922,7 +3880,7 @@ i_s_innodb_buffer_page_fill(
OK(fields[IDX_BUFFER_PAGE_FLUSH_TYPE]->store(0, true));
OK(fields[IDX_BUFFER_PAGE_FIX_COUNT]->store(
- page_info->fix_count, true));
+ ~buf_page_t::LRU_MASK & page_info->state, true));
#ifdef BTR_CUR_HASH_ADAPT
OK(fields[IDX_BUFFER_PAGE_HASHED]->store(
@@ -3947,7 +3905,7 @@ i_s_innodb_buffer_page_fill(
if (page_info->page_type == I_S_PAGE_TYPE_INDEX) {
bool ret = false;
- mutex_enter(&dict_sys.mutex);
+ dict_sys.freeze(SRW_LOCK_CALL);
const dict_index_t* index =
dict_index_get_if_in_cache_low(
@@ -3972,7 +3930,7 @@ i_s_innodb_buffer_page_fill(
system_charset_info);
}
- mutex_exit(&dict_sys.mutex);
+ dict_sys.unfreeze();
OK(ret);
@@ -3995,12 +3953,27 @@ i_s_innodb_buffer_page_fill(
? (UNIV_ZIP_SIZE_MIN >> 1) << page_info->zip_ssize
: 0, true));
+ static_assert(buf_page_t::NOT_USED == 0, "compatibility");
+ static_assert(buf_page_t::MEMORY == 1, "compatibility");
+ static_assert(buf_page_t::REMOVE_HASH == 2, "compatibility");
+
OK(fields[IDX_BUFFER_PAGE_STATE]->store(
- 1 + std::min<unsigned>(page_info->page_state,
- BUF_BLOCK_FILE_PAGE), true));
+ std::min<uint32_t>(3, page_info->state) + 1, true));
+
+ static_assert(buf_page_t::UNFIXED == 1U << 29, "comp.");
+ static_assert(buf_page_t::READ_FIX == 4U << 29, "comp.");
+ static_assert(buf_page_t::WRITE_FIX == 5U << 29, "comp.");
+
+ unsigned io_fix = page_info->state >> 29;
+ if (io_fix < 4) {
+ io_fix = 1;
+ } else if (io_fix > 5) {
+ io_fix = 3;
+ } else {
+ io_fix -= 2;
+ }
- OK(fields[IDX_BUFFER_PAGE_IO_FIX]->store(
- 1 + page_info->io_fix, true));
+ OK(fields[IDX_BUFFER_PAGE_IO_FIX]->store(io_fix, true));
OK(fields[IDX_BUFFER_PAGE_IS_OLD]->store(
page_info->is_old, true));
@@ -4084,61 +4057,51 @@ i_s_innodb_buffer_page_get_info(
{
page_info->block_id = pos;
- compile_time_assert(BUF_BLOCK_NOT_USED == 0);
- compile_time_assert(BUF_BLOCK_MEMORY == 1);
- compile_time_assert(BUF_BLOCK_REMOVE_HASH == 2);
- compile_time_assert(BUF_BLOCK_FILE_PAGE == 3);
- compile_time_assert(BUF_BLOCK_ZIP_PAGE == 4);
+ static_assert(buf_page_t::NOT_USED == 0, "compatibility");
+ static_assert(buf_page_t::MEMORY == 1, "compatibility");
+ static_assert(buf_page_t::REMOVE_HASH == 2, "compatibility");
+ static_assert(buf_page_t::UNFIXED == 1U << 29, "compatibility");
+ static_assert(buf_page_t::READ_FIX == 4U << 29, "compatibility");
+ static_assert(buf_page_t::WRITE_FIX == 5U << 29, "compatibility");
- auto state = bpage->state();
- page_info->page_state= int{state} & 7;
+ page_info->state = bpage->state();
- switch (state) {
- default:
+ if (page_info->state < buf_page_t::FREED) {
page_info->page_type = I_S_PAGE_TYPE_UNKNOWN;
- break;
- case BUF_BLOCK_FILE_PAGE:
- case BUF_BLOCK_ZIP_PAGE:
+ page_info->compressed_only = false;
+ } else {
const byte* frame;
page_info->id = bpage->id();
- page_info->fix_count = bpage->buf_fix_count();
-
page_info->oldest_mod = bpage->oldest_modification();
page_info->access_time = bpage->access_time;
page_info->zip_ssize = bpage->zip.ssize;
- page_info->io_fix = bpage->io_fix() & 3;
-
page_info->is_old = bpage->old;
page_info->freed_page_clock = bpage->freed_page_clock;
- switch (bpage->io_fix()) {
- case BUF_IO_NONE:
- case BUF_IO_WRITE:
- case BUF_IO_PIN:
- break;
- case BUF_IO_READ:
+ if (page_info->state >= buf_page_t::READ_FIX
+ && page_info->state < buf_page_t::WRITE_FIX) {
page_info->page_type = I_S_PAGE_TYPE_UNKNOWN;
page_info->newest_mod = 0;
return;
}
- if (state == BUF_BLOCK_FILE_PAGE) {
- const buf_block_t*block;
-
- block = reinterpret_cast<const buf_block_t*>(bpage);
- frame = block->frame;
+ page_info->compressed_only = !bpage->frame,
+ frame = bpage->frame;
+ if (UNIV_LIKELY(frame != nullptr)) {
#ifdef BTR_CUR_HASH_ADAPT
/* Note: this may be a false positive, that
is, block->index will not always be set to
NULL when the last adaptive hash index
reference is dropped. */
- page_info->hashed = (block->index != NULL);
+ page_info->hashed =
+ reinterpret_cast<const buf_block_t*>(bpage)
+ ->index != nullptr;
#endif /* BTR_CUR_HASH_ADAPT */
} else {
ut_ad(page_info->zip_ssize);
@@ -4262,7 +4225,7 @@ i_s_innodb_buffer_page_init(
DBUG_RETURN(0);
}
-UNIV_INTERN struct st_maria_plugin i_s_innodb_buffer_page =
+struct st_maria_plugin i_s_innodb_buffer_page =
{
/* the plugin type (a MYSQL_XXX_PLUGIN value) */
/* int */
@@ -4425,7 +4388,7 @@ i_s_innodb_buf_page_lru_fill(
OK(fields[IDX_BUF_LRU_PAGE_FLUSH_TYPE]->store(0, true));
OK(fields[IDX_BUF_LRU_PAGE_FIX_COUNT]->store(
- page_info->fix_count, true));
+ ~buf_page_t::LRU_MASK & page_info->state, true));
#ifdef BTR_CUR_HASH_ADAPT
OK(fields[IDX_BUF_LRU_PAGE_HASHED]->store(
@@ -4450,7 +4413,7 @@ i_s_innodb_buf_page_lru_fill(
if (page_info->page_type == I_S_PAGE_TYPE_INDEX) {
bool ret = false;
- mutex_enter(&dict_sys.mutex);
+ dict_sys.freeze(SRW_LOCK_CALL);
const dict_index_t* index =
dict_index_get_if_in_cache_low(
@@ -4475,7 +4438,7 @@ i_s_innodb_buf_page_lru_fill(
system_charset_info);
}
- mutex_exit(&dict_sys.mutex);
+ dict_sys.unfreeze();
OK(ret);
@@ -4498,11 +4461,22 @@ i_s_innodb_buf_page_lru_fill(
? 512 << page_info->zip_ssize : 0, true));
OK(fields[IDX_BUF_LRU_PAGE_STATE]->store(
- page_info->page_state == BUF_BLOCK_ZIP_PAGE,
- true));
+ page_info->compressed_only, true));
+
+ static_assert(buf_page_t::UNFIXED == 1U << 29, "comp.");
+ static_assert(buf_page_t::READ_FIX == 4U << 29, "comp.");
+ static_assert(buf_page_t::WRITE_FIX == 5U << 29, "comp.");
+
+ unsigned io_fix = page_info->state >> 29;
+ if (io_fix < 4) {
+ io_fix = 1;
+ } else if (io_fix > 5) {
+ io_fix = 3;
+ } else {
+ io_fix -= 2;
+ }
- OK(fields[IDX_BUF_LRU_PAGE_IO_FIX]->store(
- 1 + page_info->io_fix, true));
+ OK(fields[IDX_BUF_LRU_PAGE_IO_FIX]->store(io_fix, true));
OK(fields[IDX_BUF_LRU_PAGE_IS_OLD]->store(
page_info->is_old, true));
@@ -4604,7 +4578,7 @@ i_s_innodb_buffer_page_lru_init(
DBUG_RETURN(0);
}
-UNIV_INTERN struct st_maria_plugin i_s_innodb_buffer_page_lru =
+struct st_maria_plugin i_s_innodb_buffer_page_lru =
{
/* the plugin type (a MYSQL_XXX_PLUGIN value) */
/* int */
@@ -4770,6 +4744,41 @@ i_s_dict_fill_sys_tables(
DBUG_RETURN(0);
}
+
+/** Convert one SYS_TABLES record to dict_table_t.
+@param pcur persistent cursor position on SYS_TABLES record
+@param mtr mini-transaction (nullptr=use the dict_sys cache)
+@param rec record to read from (nullptr=use the dict_sys cache)
+@param table the converted dict_table_t
+@return error message
+@retval nullptr on success */
+static const char *i_s_sys_tables_rec(const btr_pcur_t &pcur, mtr_t *mtr,
+ const rec_t *rec, dict_table_t **table)
+{
+ static_assert(DICT_FLD__SYS_TABLES__NAME == 0, "compatibility");
+ size_t len;
+ if (rec_get_1byte_offs_flag(pcur.old_rec))
+ {
+ len= rec_1_get_field_end_info(pcur.old_rec, 0);
+ if (len & REC_1BYTE_SQL_NULL_MASK)
+ return "corrupted SYS_TABLES.NAME";
+ }
+ else
+ {
+ len= rec_2_get_field_end_info(pcur.old_rec, 0);
+ static_assert(REC_2BYTE_EXTERN_MASK == 16384, "compatibility");
+ if (len >= REC_2BYTE_EXTERN_MASK)
+ return "corrupted SYS_TABLES.NAME";
+ }
+
+ if (rec)
+ return dict_load_table_low(mtr, false, rec, table);
+
+ *table= dict_sys.load_table
+ (span<const char>{reinterpret_cast<const char*>(pcur.old_rec), len});
+ return *table ? nullptr : "Table not found in cache";
+}
+
/*******************************************************************//**
Function to go through each record in SYS_TABLES table, and fill the
information_schema.innodb_sys_tables table with related table information
@@ -4783,8 +4792,6 @@ i_s_sys_tables_fill_table(
Item* ) /*!< in: condition (not used) */
{
btr_pcur_t pcur;
- const rec_t* rec;
- mem_heap_t* heap;
mtr_t mtr;
DBUG_ENTER("i_s_sys_tables_fill_table");
@@ -4795,22 +4802,24 @@ i_s_sys_tables_fill_table(
DBUG_RETURN(0);
}
- heap = mem_heap_create(1000);
- mutex_enter(&dict_sys.mutex);
- mtr_start(&mtr);
+ mtr.start();
+ dict_sys.lock(SRW_LOCK_CALL);
- rec = dict_startscan_system(&pcur, &mtr, SYS_TABLES);
+ for (const rec_t *rec = dict_startscan_system(&pcur, &mtr,
+ dict_sys.sys_tables);
+ rec; rec = dict_getnext_system(&pcur, &mtr)) {
+ if (rec_get_deleted_flag(rec, 0)) {
+ continue;
+ }
- while (rec) {
const char* err_msg;
dict_table_t* table_rec;
/* Create and populate a dict_table_t structure with
information from SYS_TABLES row */
- err_msg = dict_process_sys_tables_rec_and_mtr_commit(
- heap, rec, &table_rec, false, &mtr);
-
- mutex_exit(&dict_sys.mutex);
+ err_msg = i_s_sys_tables_rec(pcur, &mtr, rec, &table_rec);
+ mtr.commit();
+ dict_sys.unlock();
if (!err_msg) {
i_s_dict_fill_sys_tables(thd, table_rec,
@@ -4825,17 +4834,13 @@ i_s_sys_tables_fill_table(
dict_mem_table_free(table_rec);
}
- mem_heap_empty(heap);
-
/* Get the next record */
- mutex_enter(&dict_sys.mutex);
- mtr_start(&mtr);
- rec = dict_getnext_system(&pcur, &mtr);
+ mtr.start();
+ dict_sys.lock(SRW_LOCK_CALL);
}
- mtr_commit(&mtr);
- mutex_exit(&dict_sys.mutex);
- mem_heap_free(heap);
+ mtr.commit();
+ dict_sys.unlock();
DBUG_RETURN(0);
}
@@ -4861,7 +4866,7 @@ innodb_sys_tables_init(
DBUG_RETURN(0);
}
-UNIV_INTERN struct st_maria_plugin i_s_innodb_sys_tables =
+struct st_maria_plugin i_s_innodb_sys_tables =
{
/* the plugin type (a MYSQL_XXX_PLUGIN value) */
/* int */
@@ -4946,73 +4951,61 @@ static ST_FIELD_INFO innodb_sys_tablestats_fields_info[]=
};
} // namespace Show
-/** Populate information_schema.innodb_sys_tablestats table with information
-from SYS_TABLES.
-@param[in] thd thread ID
-@param[in,out] table table
-@param[in] ref_count table reference count
-@param[in,out] table_to_fill fill this table
+/** Populate information_schema.innodb_sys_tablestats table with a table,
+and release exclusive dict_sys.latch.
+@param[in] thd connection
+@param[in,out] table InnoDB table metadata
+@param[in,out] table_to_fill INFORMATION_SCHEMA.INNODB_SYS_TABLESTATS
@return 0 on success */
static
int
-i_s_dict_fill_sys_tablestats(
- THD* thd,
- dict_table_t* table,
- ulint ref_count,
- TABLE* table_to_fill)
+i_s_dict_fill_sys_tablestats(THD* thd, dict_table_t *table,
+ TABLE* table_to_fill)
{
- Field** fields;
-
- DBUG_ENTER("i_s_dict_fill_sys_tablestats");
-
- fields = table_to_fill->field;
+ DBUG_ENTER("i_s_dict_fill_sys_tablestats");
- OK(fields[SYS_TABLESTATS_ID]->store(longlong(table->id), TRUE));
+ Field **fields= table_to_fill->field;
- OK(field_store_string(fields[SYS_TABLESTATS_NAME],
- table->name.m_name));
+ {
+ table->stats_mutex_lock();
+ auto _ = make_scope_exit([table]() {
+ table->stats_mutex_unlock(); dict_sys.unlock(); });
- {
- struct Locking
- {
- Locking() { mutex_enter(&dict_sys.mutex); }
- ~Locking() { mutex_exit(&dict_sys.mutex); }
- } locking;
+ OK(fields[SYS_TABLESTATS_ID]->store(longlong(table->id), TRUE));
- OK(fields[SYS_TABLESTATS_INIT]->store(table->stat_initialized,
- true));
+ OK(field_store_string(fields[SYS_TABLESTATS_NAME],
+ table->name.m_name));
+ OK(fields[SYS_TABLESTATS_INIT]->store(table->stat_initialized, true));
- if (table->stat_initialized) {
- OK(fields[SYS_TABLESTATS_NROW]->store(
- table->stat_n_rows, true));
+ if (table->stat_initialized)
+ {
+ OK(fields[SYS_TABLESTATS_NROW]->store(table->stat_n_rows, true));
- OK(fields[SYS_TABLESTATS_CLUST_SIZE]->store(
- table->stat_clustered_index_size, true));
+ OK(fields[SYS_TABLESTATS_CLUST_SIZE]->
+ store(table->stat_clustered_index_size, true));
- OK(fields[SYS_TABLESTATS_INDEX_SIZE]->store(
- table->stat_sum_of_other_index_sizes,
- true));
+ OK(fields[SYS_TABLESTATS_INDEX_SIZE]->
+ store(table->stat_sum_of_other_index_sizes, true));
- OK(fields[SYS_TABLESTATS_MODIFIED]->store(
- table->stat_modified_counter, true));
- } else {
- OK(fields[SYS_TABLESTATS_NROW]->store(0, true));
+ OK(fields[SYS_TABLESTATS_MODIFIED]->
+ store(table->stat_modified_counter, true));
+ }
+ else
+ {
+ OK(fields[SYS_TABLESTATS_NROW]->store(0, true));
+ OK(fields[SYS_TABLESTATS_CLUST_SIZE]->store(0, true));
+ OK(fields[SYS_TABLESTATS_INDEX_SIZE]->store(0, true));
+ OK(fields[SYS_TABLESTATS_MODIFIED]->store(0, true));
+ }
- OK(fields[SYS_TABLESTATS_CLUST_SIZE]->store(0, true));
+ OK(fields[SYS_TABLESTATS_AUTONINC]->store(table->autoinc, true));
- OK(fields[SYS_TABLESTATS_INDEX_SIZE]->store(0, true));
+ OK(fields[SYS_TABLESTATS_TABLE_REF_COUNT]->
+ store(table->get_ref_count(), true));
+ }
- OK(fields[SYS_TABLESTATS_MODIFIED]->store(0, true));
- }
- }
-
- OK(fields[SYS_TABLESTATS_AUTONINC]->store(table->autoinc, true));
-
- OK(fields[SYS_TABLESTATS_TABLE_REF_COUNT]->store(ref_count, true));
-
- OK(schema_table_store_record(thd, table_to_fill));
-
- DBUG_RETURN(0);
+ OK(schema_table_store_record(thd, table_to_fill));
+ DBUG_RETURN(0);
}
/*******************************************************************//**
@@ -5030,7 +5023,6 @@ i_s_sys_tables_fill_table_stats(
{
btr_pcur_t pcur;
const rec_t* rec;
- mem_heap_t* heap;
mtr_t mtr;
DBUG_ENTER("i_s_sys_tables_fill_table_stats");
@@ -5041,56 +5033,41 @@ i_s_sys_tables_fill_table_stats(
DBUG_RETURN(0);
}
- heap = mem_heap_create(1000);
- rw_lock_s_lock(&dict_sys.latch);
- mutex_enter(&dict_sys.mutex);
- mtr_start(&mtr);
+ mtr.start();
+ dict_sys.lock(SRW_LOCK_CALL);
- rec = dict_startscan_system(&pcur, &mtr, SYS_TABLES);
+ rec = dict_startscan_system(&pcur, &mtr, dict_sys.sys_tables);
while (rec) {
const char* err_msg;
- dict_table_t* table_rec;
+ dict_table_t* table_rec = nullptr;
+ mtr.commit();
/* Fetch the dict_table_t structure corresponding to
this SYS_TABLES record */
- err_msg = dict_process_sys_tables_rec_and_mtr_commit(
- heap, rec, &table_rec, true, &mtr);
-
- ulint ref_count = table_rec ? table_rec->get_ref_count() : 0;
- mutex_exit(&dict_sys.mutex);
+ err_msg = i_s_sys_tables_rec(pcur, nullptr, nullptr,
+ &table_rec);
- DBUG_EXECUTE_IF("test_sys_tablestats", {
- if (strcmp("test/t1", table_rec->name.m_name) == 0 ) {
- DEBUG_SYNC_C("dict_table_not_protected");
- }});
-
- if (table_rec != NULL) {
- ut_ad(err_msg == NULL);
- i_s_dict_fill_sys_tablestats(thd, table_rec, ref_count,
+ if (UNIV_LIKELY(!err_msg)) {
+ i_s_dict_fill_sys_tablestats(thd, table_rec,
tables->table);
} else {
- ut_ad(err_msg != NULL);
+ ut_ad(!table_rec);
+ dict_sys.unlock();
push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
ER_CANT_FIND_SYSTEM_REC, "%s",
err_msg);
}
- rw_lock_s_unlock(&dict_sys.latch);
- mem_heap_empty(heap);
-
/* Get the next record */
- rw_lock_s_lock(&dict_sys.latch);
- mutex_enter(&dict_sys.mutex);
+ mtr.start();
+ dict_sys.lock(SRW_LOCK_CALL);
- mtr_start(&mtr);
rec = dict_getnext_system(&pcur, &mtr);
}
- mtr_commit(&mtr);
- mutex_exit(&dict_sys.mutex);
- rw_lock_s_unlock(&dict_sys.latch);
- mem_heap_free(heap);
+ mtr.commit();
+ dict_sys.unlock();
DBUG_RETURN(0);
}
@@ -5116,7 +5093,7 @@ innodb_sys_tablestats_init(
DBUG_RETURN(0);
}
-UNIV_INTERN struct st_maria_plugin i_s_innodb_sys_tablestats =
+struct st_maria_plugin i_s_innodb_sys_tablestats =
{
/* the plugin type (a MYSQL_XXX_PLUGIN value) */
/* int */
@@ -5285,11 +5262,11 @@ i_s_sys_indexes_fill_table(
}
heap = mem_heap_create(1000);
- mutex_enter(&dict_sys.mutex);
+ dict_sys.lock(SRW_LOCK_CALL);
mtr_start(&mtr);
/* Start scan the SYS_INDEXES table */
- rec = dict_startscan_system(&pcur, &mtr, SYS_INDEXES);
+ rec = dict_startscan_system(&pcur, &mtr, dict_sys.sys_indexes);
/* Process each record in the table */
while (rec) {
@@ -5306,8 +5283,8 @@ i_s_sys_indexes_fill_table(
rec, DICT_FLD__SYS_INDEXES__SPACE, &space_id);
space_id = space_id == 4 ? mach_read_from_4(field)
: ULINT_UNDEFINED;
- mtr_commit(&mtr);
- mutex_exit(&dict_sys.mutex);
+ mtr.commit();
+ dict_sys.unlock();
if (!err_msg) {
if (int err = i_s_dict_fill_sys_indexes(
@@ -5325,13 +5302,13 @@ i_s_sys_indexes_fill_table(
mem_heap_empty(heap);
/* Get the next record */
- mutex_enter(&dict_sys.mutex);
- mtr_start(&mtr);
+ mtr.start();
+ dict_sys.lock(SRW_LOCK_CALL);
rec = dict_getnext_system(&pcur, &mtr);
}
- mtr_commit(&mtr);
- mutex_exit(&dict_sys.mutex);
+ mtr.commit();
+ dict_sys.unlock();
mem_heap_free(heap);
DBUG_RETURN(0);
@@ -5357,7 +5334,7 @@ innodb_sys_indexes_init(
DBUG_RETURN(0);
}
-UNIV_INTERN struct st_maria_plugin i_s_innodb_sys_indexes =
+struct st_maria_plugin i_s_innodb_sys_indexes =
{
/* the plugin type (a MYSQL_XXX_PLUGIN value) */
/* int */
@@ -5504,10 +5481,10 @@ i_s_sys_columns_fill_table(
}
heap = mem_heap_create(1000);
- mutex_enter(&dict_sys.mutex);
- mtr_start(&mtr);
+ mtr.start();
+ dict_sys.lock(SRW_LOCK_CALL);
- rec = dict_startscan_system(&pcur, &mtr, SYS_COLUMNS);
+ rec = dict_startscan_system(&pcur, &mtr, dict_sys.sys_columns);
while (rec) {
const char* err_msg;
@@ -5521,8 +5498,8 @@ i_s_sys_columns_fill_table(
&table_id, &col_name,
&nth_v_col);
- mtr_commit(&mtr);
- mutex_exit(&dict_sys.mutex);
+ mtr.commit();
+ dict_sys.unlock();
if (!err_msg) {
i_s_dict_fill_sys_columns(thd, table_id, col_name,
@@ -5537,17 +5514,18 @@ i_s_sys_columns_fill_table(
mem_heap_empty(heap);
/* Get the next record */
- mutex_enter(&dict_sys.mutex);
- mtr_start(&mtr);
+ mtr.start();
+ dict_sys.lock(SRW_LOCK_CALL);
rec = dict_getnext_system(&pcur, &mtr);
}
- mtr_commit(&mtr);
- mutex_exit(&dict_sys.mutex);
+ mtr.commit();
+ dict_sys.unlock();
mem_heap_free(heap);
DBUG_RETURN(0);
}
+
/*******************************************************************//**
Bind the dynamic table INFORMATION_SCHEMA.innodb_sys_columns
@return 0 on success */
@@ -5569,7 +5547,7 @@ innodb_sys_columns_init(
DBUG_RETURN(0);
}
-UNIV_INTERN struct st_maria_plugin i_s_innodb_sys_columns =
+struct st_maria_plugin i_s_innodb_sys_columns =
{
/* the plugin type (a MYSQL_XXX_PLUGIN value) */
/* int */
@@ -5693,14 +5671,14 @@ i_s_sys_virtual_fill_table(
RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name.str);
/* deny access to user without PROCESS_ACL privilege */
- if (check_global_access(thd, PROCESS_ACL)) {
+ if (check_global_access(thd, PROCESS_ACL) || !dict_sys.sys_virtual) {
DBUG_RETURN(0);
}
- mutex_enter(&dict_sys.mutex);
- mtr_start(&mtr);
+ mtr.start();
+ dict_sys.lock(SRW_LOCK_CALL);
- rec = dict_startscan_system(&pcur, &mtr, SYS_VIRTUAL);
+ rec = dict_startscan_system(&pcur, &mtr, dict_sys.sys_virtual);
while (rec) {
const char* err_msg;
@@ -5712,8 +5690,8 @@ i_s_sys_virtual_fill_table(
&table_id, &pos,
&base_pos);
- mtr_commit(&mtr);
- mutex_exit(&dict_sys.mutex);
+ mtr.commit();
+ dict_sys.unlock();
if (!err_msg) {
i_s_dict_fill_sys_virtual(thd, table_id, pos, base_pos,
@@ -5725,13 +5703,13 @@ i_s_sys_virtual_fill_table(
}
/* Get the next record */
- mutex_enter(&dict_sys.mutex);
- mtr_start(&mtr);
+ mtr.start();
+ dict_sys.lock(SRW_LOCK_CALL);
rec = dict_getnext_system(&pcur, &mtr);
}
- mtr_commit(&mtr);
- mutex_exit(&dict_sys.mutex);
+ mtr.commit();
+ dict_sys.unlock();
DBUG_RETURN(0);
}
@@ -5883,14 +5861,14 @@ i_s_sys_fields_fill_table(
}
heap = mem_heap_create(1000);
- mutex_enter(&dict_sys.mutex);
- mtr_start(&mtr);
+ mtr.start();
/* will save last index id so that we know whether we move to
the next index. This is used to calculate prefix length */
last_id = 0;
- rec = dict_startscan_system(&pcur, &mtr, SYS_FIELDS);
+ dict_sys.lock(SRW_LOCK_CALL);
+ rec = dict_startscan_system(&pcur, &mtr, dict_sys.sys_fields);
while (rec) {
ulint pos;
@@ -5903,8 +5881,8 @@ i_s_sys_fields_fill_table(
err_msg = dict_process_sys_fields_rec(heap, rec, &field_rec,
&pos, &index_id, last_id);
- mtr_commit(&mtr);
- mutex_exit(&dict_sys.mutex);
+ mtr.commit();
+ dict_sys.unlock();
if (!err_msg) {
i_s_dict_fill_sys_fields(thd, index_id, &field_rec,
@@ -5919,13 +5897,13 @@ i_s_sys_fields_fill_table(
mem_heap_empty(heap);
/* Get the next record */
- mutex_enter(&dict_sys.mutex);
- mtr_start(&mtr);
+ mtr.start();
+ dict_sys.lock(SRW_LOCK_CALL);
rec = dict_getnext_system(&pcur, &mtr);
}
- mtr_commit(&mtr);
- mutex_exit(&dict_sys.mutex);
+ mtr.commit();
+ dict_sys.unlock();
mem_heap_free(heap);
DBUG_RETURN(0);
@@ -5951,7 +5929,7 @@ innodb_sys_fields_init(
DBUG_RETURN(0);
}
-UNIV_INTERN struct st_maria_plugin i_s_innodb_sys_fields =
+struct st_maria_plugin i_s_innodb_sys_fields =
{
/* the plugin type (a MYSQL_XXX_PLUGIN value) */
/* int */
@@ -6081,16 +6059,15 @@ i_s_sys_foreign_fill_table(
RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name.str);
/* deny access to user without PROCESS_ACL privilege */
- if (check_global_access(thd, PROCESS_ACL)) {
-
+ if (check_global_access(thd, PROCESS_ACL) || !dict_sys.sys_foreign) {
DBUG_RETURN(0);
}
heap = mem_heap_create(1000);
- mutex_enter(&dict_sys.mutex);
- mtr_start(&mtr);
+ mtr.start();
+ dict_sys.lock(SRW_LOCK_CALL);
- rec = dict_startscan_system(&pcur, &mtr, SYS_FOREIGN);
+ rec = dict_startscan_system(&pcur, &mtr, dict_sys.sys_foreign);
while (rec) {
const char* err_msg;
@@ -6100,8 +6077,8 @@ i_s_sys_foreign_fill_table(
a SYS_FOREIGN row */
err_msg = dict_process_sys_foreign_rec(heap, rec, &foreign_rec);
- mtr_commit(&mtr);
- mutex_exit(&dict_sys.mutex);
+ mtr.commit();
+ dict_sys.unlock();
if (!err_msg) {
i_s_dict_fill_sys_foreign(thd, &foreign_rec,
@@ -6115,13 +6092,13 @@ i_s_sys_foreign_fill_table(
mem_heap_empty(heap);
/* Get the next record */
- mtr_start(&mtr);
- mutex_enter(&dict_sys.mutex);
+ mtr.start();
+ dict_sys.lock(SRW_LOCK_CALL);
rec = dict_getnext_system(&pcur, &mtr);
}
- mtr_commit(&mtr);
- mutex_exit(&dict_sys.mutex);
+ mtr.commit();
+ dict_sys.unlock();
mem_heap_free(heap);
DBUG_RETURN(0);
@@ -6148,7 +6125,7 @@ innodb_sys_foreign_init(
DBUG_RETURN(0);
}
-UNIV_INTERN struct st_maria_plugin i_s_innodb_sys_foreign =
+struct st_maria_plugin i_s_innodb_sys_foreign =
{
/* the plugin type (a MYSQL_XXX_PLUGIN value) */
/* int */
@@ -6274,15 +6251,16 @@ i_s_sys_foreign_cols_fill_table(
RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name.str);
/* deny access to user without PROCESS_ACL privilege */
- if (check_global_access(thd, PROCESS_ACL)) {
+ if (check_global_access(thd, PROCESS_ACL)
+ || !dict_sys.sys_foreign_cols) {
DBUG_RETURN(0);
}
heap = mem_heap_create(1000);
- mutex_enter(&dict_sys.mutex);
- mtr_start(&mtr);
+ mtr.start();
+ dict_sys.lock(SRW_LOCK_CALL);
- rec = dict_startscan_system(&pcur, &mtr, SYS_FOREIGN_COLS);
+ rec = dict_startscan_system(&pcur, &mtr, dict_sys.sys_foreign_cols);
while (rec) {
const char* err_msg;
@@ -6295,8 +6273,8 @@ i_s_sys_foreign_cols_fill_table(
err_msg = dict_process_sys_foreign_col_rec(
heap, rec, &name, &for_col_name, &ref_col_name, &pos);
- mtr_commit(&mtr);
- mutex_exit(&dict_sys.mutex);
+ mtr.commit();
+ dict_sys.unlock();
if (!err_msg) {
i_s_dict_fill_sys_foreign_cols(
@@ -6311,13 +6289,13 @@ i_s_sys_foreign_cols_fill_table(
mem_heap_empty(heap);
/* Get the next record */
- mutex_enter(&dict_sys.mutex);
- mtr_start(&mtr);
+ mtr.start();
+ dict_sys.lock(SRW_LOCK_CALL);
rec = dict_getnext_system(&pcur, &mtr);
}
- mtr_commit(&mtr);
- mutex_exit(&dict_sys.mutex);
+ mtr.commit();
+ dict_sys.unlock();
mem_heap_free(heap);
DBUG_RETURN(0);
@@ -6343,7 +6321,7 @@ innodb_sys_foreign_cols_init(
DBUG_RETURN(0);
}
-UNIV_INTERN struct st_maria_plugin i_s_innodb_sys_foreign_cols =
+struct st_maria_plugin i_s_innodb_sys_foreign_cols =
{
/* the plugin type (a MYSQL_XXX_PLUGIN value) */
/* int */
@@ -6412,8 +6390,8 @@ static ST_FIELD_INFO innodb_sys_tablespaces_fields_info[]=
#define SYS_TABLESPACES_PAGE_SIZE 4
Column("PAGE_SIZE", ULong(), NOT_NULL),
-#define SYS_TABLESPACES_ZIP_PAGE_SIZE 5
- Column("ZIP_PAGE_SIZE", ULong(), NOT_NULL),
+#define SYS_TABLESPACES_FILENAME 5
+ Column("FILENAME", Varchar(FN_REFLEN), NOT_NULL),
#define SYS_TABLESPACES_FS_BLOCK_SIZE 6
Column("FS_BLOCK_SIZE", ULong(),NOT_NULL),
@@ -6428,179 +6406,115 @@ static ST_FIELD_INFO innodb_sys_tablespaces_fields_info[]=
};
} // namespace Show
-
extern size_t os_file_get_fs_block_size(const char *path);
-/**********************************************************************//**
-Function to fill INFORMATION_SCHEMA.INNODB_SYS_TABLESPACES with information
-collected by scanning SYS_TABLESPACESS table.
+/** Produce one row of INFORMATION_SCHEMA.INNODB_SYS_TABLESPACES.
+@param thd connection
+@param s tablespace
+@param t output table
@return 0 on success */
-static
-int
-i_s_dict_fill_sys_tablespaces(
-/*==========================*/
- THD* thd, /*!< in: thread */
- uint32_t space, /*!< in: space ID */
- const char* name, /*!< in: tablespace name */
- ulint flags, /*!< in: tablespace flags */
- TABLE* table_to_fill) /*!< in/out: fill this table */
-{
- Field** fields;
- ulint atomic_blobs = FSP_FLAGS_HAS_ATOMIC_BLOBS(flags);
- const char* row_format;
-
- DBUG_ENTER("i_s_dict_fill_sys_tablespaces");
-
- if (fil_space_t::full_crc32(flags)) {
- row_format = NULL;
- } else if (is_system_tablespace(space)) {
- row_format = "Compact, Redundant or Dynamic";
- } else if (FSP_FLAGS_GET_ZIP_SSIZE(flags)) {
- row_format = "Compressed";
- } else if (atomic_blobs) {
- row_format = "Dynamic";
- } else {
- row_format = "Compact or Redundant";
- }
-
- fields = table_to_fill->field;
-
- OK(fields[SYS_TABLESPACES_SPACE]->store(space, true));
-
- OK(field_store_string(fields[SYS_TABLESPACES_NAME], name));
-
- OK(fields[SYS_TABLESPACES_FLAGS]->store(flags, true));
-
- OK(field_store_string(fields[SYS_TABLESPACES_ROW_FORMAT], row_format));
-
- ulint cflags = fil_space_t::is_valid_flags(flags, space)
- ? flags : fsp_flags_convert_from_101(flags);
- if (cflags == ULINT_UNDEFINED) {
- fields[SYS_TABLESPACES_PAGE_SIZE]->set_null();
- fields[SYS_TABLESPACES_ZIP_PAGE_SIZE]->set_null();
- fields[SYS_TABLESPACES_FS_BLOCK_SIZE]->set_null();
- fields[SYS_TABLESPACES_FILE_SIZE]->set_null();
- fields[SYS_TABLESPACES_ALLOC_SIZE]->set_null();
- OK(schema_table_store_record(thd, table_to_fill));
- DBUG_RETURN(0);
- }
-
- OK(fields[SYS_TABLESPACES_PAGE_SIZE]->store(
- fil_space_t::logical_size(cflags), true));
-
- OK(fields[SYS_TABLESPACES_ZIP_PAGE_SIZE]->store(
- fil_space_t::physical_size(cflags), true));
-
- size_t fs_block_size = 0;
- os_file_size_t file;
-
- memset(&file, 0xff, sizeof(file));
-
- if (fil_space_t* s = fil_space_t::get(space)) {
- const char *filepath = s->chain.start
- ? s->chain.start->name : NULL;
- if (!filepath) {
- goto file_done;
- }
-
- file = os_file_get_size(filepath);
- fs_block_size= os_file_get_fs_block_size(filepath);
-
-file_done:
- s->release();
- }
-
- if (file.m_total_size == os_offset_t(~0)) {
- fs_block_size = 0;
- file.m_total_size = 0;
- file.m_alloc_size = 0;
- }
-
- OK(fields[SYS_TABLESPACES_FS_BLOCK_SIZE]->store(fs_block_size, true));
-
- OK(fields[SYS_TABLESPACES_FILE_SIZE]->store(file.m_total_size, true));
-
- OK(fields[SYS_TABLESPACES_ALLOC_SIZE]->store(file.m_alloc_size, true));
-
- OK(schema_table_store_record(thd, table_to_fill));
-
- DBUG_RETURN(0);
+static int i_s_sys_tablespaces_fill(THD *thd, const fil_space_t &s, TABLE *t)
+{
+ DBUG_ENTER("i_s_sys_tablespaces_fill");
+ const char *row_format;
+
+ if (s.full_crc32() || is_system_tablespace(s.id))
+ row_format= nullptr;
+ else if (FSP_FLAGS_GET_ZIP_SSIZE(s.flags))
+ row_format= "Compressed";
+ else if (FSP_FLAGS_HAS_ATOMIC_BLOBS(s.flags))
+ row_format= "Dynamic";
+ else
+ row_format= "Compact or Redundant";
+
+ Field **fields= t->field;
+
+ OK(fields[SYS_TABLESPACES_SPACE]->store(s.id, true));
+ {
+ Field *f= fields[SYS_TABLESPACES_NAME];
+ const auto name= s.name();
+ if (name.data())
+ {
+ OK(f->store(name.data(), name.size(), system_charset_info));
+ f->set_notnull();
+ }
+ else if (srv_is_undo_tablespace(s.id))
+ {
+ char name[15];
+ snprintf(name, sizeof name, "innodb_undo%03zu",
+ (s.id - srv_undo_space_id_start + 1));
+ OK(f->store(name, strlen(name), system_charset_info));
+ } else f->set_notnull();
+ }
+
+ fields[SYS_TABLESPACES_NAME]->set_null();
+ OK(fields[SYS_TABLESPACES_FLAGS]->store(s.flags, true));
+ OK(field_store_string(fields[SYS_TABLESPACES_ROW_FORMAT], row_format));
+ const char *filepath= s.chain.start->name;
+ OK(field_store_string(fields[SYS_TABLESPACES_FILENAME], filepath));
+
+ OK(fields[SYS_TABLESPACES_PAGE_SIZE]->store(s.physical_size(), true));
+ size_t fs_block_size;
+ os_file_size_t file= os_file_get_size(filepath);
+ if (file.m_total_size == os_offset_t(~0))
+ {
+ file.m_total_size= 0;
+ file.m_alloc_size= 0;
+ fs_block_size= 0;
+ }
+ else
+ fs_block_size= os_file_get_fs_block_size(filepath);
+
+ OK(fields[SYS_TABLESPACES_FS_BLOCK_SIZE]->store(fs_block_size, true));
+ OK(fields[SYS_TABLESPACES_FILE_SIZE]->store(file.m_total_size, true));
+ OK(fields[SYS_TABLESPACES_ALLOC_SIZE]->store(file.m_alloc_size, true));
+
+ OK(schema_table_store_record(thd, t));
+
+ DBUG_RETURN(0);
}
-/*******************************************************************//**
-Function to populate INFORMATION_SCHEMA.INNODB_SYS_TABLESPACES table.
-Loop through each record in SYS_TABLESPACES, and extract the column
-information and fill the INFORMATION_SCHEMA.INNODB_SYS_TABLESPACES table.
+/** Populate INFORMATION_SCHEMA.INNODB_SYS_TABLESPACES.
+@param thd connection
+@param tables table to fill
@return 0 on success */
-static
-int
-i_s_sys_tablespaces_fill_table(
-/*===========================*/
- THD* thd, /*!< in: thread */
- TABLE_LIST* tables, /*!< in/out: tables to fill */
- Item* ) /*!< in: condition (not used) */
+static int i_s_sys_tablespaces_fill_table(THD *thd, TABLE_LIST *tables, Item*)
{
- btr_pcur_t pcur;
- const rec_t* rec;
- mem_heap_t* heap;
- mtr_t mtr;
-
- DBUG_ENTER("i_s_sys_tablespaces_fill_table");
- RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name.str);
-
- /* deny access to user without PROCESS_ACL privilege */
- if (check_global_access(thd, PROCESS_ACL)) {
- DBUG_RETURN(0);
- }
-
- heap = mem_heap_create(1000);
- mutex_enter(&dict_sys.mutex);
- mtr_start(&mtr);
-
- for (rec = dict_startscan_system(&pcur, &mtr, SYS_TABLESPACES);
- rec != NULL;
- rec = dict_getnext_system(&pcur, &mtr)) {
-
- const char* err_msg;
- uint32_t space;
- const char* name;
- ulint flags;
-
- /* Extract necessary information from a SYS_TABLESPACES row */
- err_msg = dict_process_sys_tablespaces(
- heap, rec, &space, &name, &flags);
-
- mtr_commit(&mtr);
- mutex_exit(&dict_sys.mutex);
-
- if (!err_msg) {
- i_s_dict_fill_sys_tablespaces(
- thd, space, name, flags,
- tables->table);
- } else {
- push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
- ER_CANT_FIND_SYSTEM_REC, "%s",
- err_msg);
- }
-
- mem_heap_empty(heap);
-
- /* Get the next record */
- mutex_enter(&dict_sys.mutex);
- mtr_start(&mtr);
- }
-
- mtr_commit(&mtr);
- mutex_exit(&dict_sys.mutex);
- mem_heap_free(heap);
-
- i_s_dict_fill_sys_tablespaces(
- thd, uint32_t(fil_system.temp_space->id),
- fil_system.temp_space->name,
- fil_system.temp_space->flags, tables->table);
+ DBUG_ENTER("i_s_sys_tablespaces_fill_table");
+ RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name.str);
- DBUG_RETURN(0);
+ if (check_global_access(thd, PROCESS_ACL))
+ DBUG_RETURN(0);
+
+ int err= 0;
+
+ mysql_mutex_lock(&fil_system.mutex);
+ fil_system.freeze_space_list++;
+
+ for (fil_space_t &space : fil_system.space_list)
+ {
+ if (space.purpose == FIL_TYPE_TABLESPACE && !space.is_stopping() &&
+ space.chain.start)
+ {
+ space.reacquire();
+ mysql_mutex_unlock(&fil_system.mutex);
+ space.s_lock();
+ err= i_s_sys_tablespaces_fill(thd, space, tables->table);
+ space.s_unlock();
+ mysql_mutex_lock(&fil_system.mutex);
+ space.release();
+ if (err)
+ break;
+ }
+ }
+
+ fil_system.freeze_space_list--;
+ mysql_mutex_unlock(&fil_system.mutex);
+ if (err == DB_SUCCESS)
+ err= i_s_sys_tablespaces_fill(thd, *fil_system.temp_space, tables->table);
+ DBUG_RETURN(err);
}
+
/*******************************************************************//**
Bind the dynamic table INFORMATION_SCHEMA.INNODB_SYS_TABLESPACES
@return 0 on success */
@@ -6622,7 +6536,7 @@ innodb_sys_tablespaces_init(
DBUG_RETURN(0);
}
-UNIV_INTERN struct st_maria_plugin i_s_innodb_sys_tablespaces =
+struct st_maria_plugin i_s_innodb_sys_tablespaces =
{
/* the plugin type (a MYSQL_XXX_PLUGIN value) */
/* int */
@@ -6642,7 +6556,7 @@ UNIV_INTERN struct st_maria_plugin i_s_innodb_sys_tablespaces =
/* general descriptive text (for SHOW PLUGINS) */
/* const char* */
- "InnoDB SYS_TABLESPACES",
+ "InnoDB tablespaces",
/* the plugin license (PLUGIN_LICENSE_XXX) */
/* int */
@@ -6672,185 +6586,6 @@ UNIV_INTERN struct st_maria_plugin i_s_innodb_sys_tablespaces =
};
namespace Show {
-/** SYS_DATAFILES ************************************************/
-/* Fields of the dynamic table INFORMATION_SCHEMA.INNODB_SYS_DATAFILES */
-static ST_FIELD_INFO innodb_sys_datafiles_fields_info[]=
-{
-#define SYS_DATAFILES_SPACE 0
- Column("SPACE", ULong(), NOT_NULL),
-
-#define SYS_DATAFILES_PATH 1
- Column("PATH", Varchar(OS_FILE_MAX_PATH), NOT_NULL),
-
- CEnd()
-};
-} // namespace Show
-
-/**********************************************************************//**
-Function to fill INFORMATION_SCHEMA.INNODB_SYS_DATAFILES with information
-collected by scanning SYS_DATAFILESS table.
-@return 0 on success */
-static
-int
-i_s_dict_fill_sys_datafiles(
-/*========================*/
- THD* thd, /*!< in: thread */
- uint32_t space, /*!< in: space ID */
- const char* path, /*!< in: absolute path */
- TABLE* table_to_fill) /*!< in/out: fill this table */
-{
- Field** fields;
-
- DBUG_ENTER("i_s_dict_fill_sys_datafiles");
-
- fields = table_to_fill->field;
-
- OK(fields[SYS_DATAFILES_SPACE]->store(space, true));
-
- OK(field_store_string(fields[SYS_DATAFILES_PATH], path));
-
- OK(schema_table_store_record(thd, table_to_fill));
-
- DBUG_RETURN(0);
-}
-/*******************************************************************//**
-Function to populate INFORMATION_SCHEMA.INNODB_SYS_DATAFILES table.
-Loop through each record in SYS_DATAFILES, and extract the column
-information and fill the INFORMATION_SCHEMA.INNODB_SYS_DATAFILES table.
-@return 0 on success */
-static
-int
-i_s_sys_datafiles_fill_table(
-/*=========================*/
- THD* thd, /*!< in: thread */
- TABLE_LIST* tables, /*!< in/out: tables to fill */
- Item* ) /*!< in: condition (not used) */
-{
- btr_pcur_t pcur;
- const rec_t* rec;
- mem_heap_t* heap;
- mtr_t mtr;
-
- DBUG_ENTER("i_s_sys_datafiles_fill_table");
- RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name.str);
-
- /* deny access to user without PROCESS_ACL privilege */
- if (check_global_access(thd, PROCESS_ACL)) {
- DBUG_RETURN(0);
- }
-
- heap = mem_heap_create(1000);
- mutex_enter(&dict_sys.mutex);
- mtr_start(&mtr);
-
- rec = dict_startscan_system(&pcur, &mtr, SYS_DATAFILES);
-
- while (rec) {
- const char* err_msg;
- uint32_t space;
- const char* path;
-
- /* Extract necessary information from a SYS_DATAFILES row */
- err_msg = dict_process_sys_datafiles(
- heap, rec, &space, &path);
-
- mtr_commit(&mtr);
- mutex_exit(&dict_sys.mutex);
-
- if (!err_msg) {
- i_s_dict_fill_sys_datafiles(
- thd, space, path, tables->table);
- } else {
- push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
- ER_CANT_FIND_SYSTEM_REC, "%s",
- err_msg);
- }
-
- mem_heap_empty(heap);
-
- /* Get the next record */
- mutex_enter(&dict_sys.mutex);
- mtr_start(&mtr);
- rec = dict_getnext_system(&pcur, &mtr);
- }
-
- mtr_commit(&mtr);
- mutex_exit(&dict_sys.mutex);
- mem_heap_free(heap);
-
- DBUG_RETURN(0);
-}
-/*******************************************************************//**
-Bind the dynamic table INFORMATION_SCHEMA.INNODB_SYS_DATAFILES
-@return 0 on success */
-static
-int
-innodb_sys_datafiles_init(
-/*======================*/
- void* p) /*!< in/out: table schema object */
-{
- ST_SCHEMA_TABLE* schema;
-
- DBUG_ENTER("innodb_sys_datafiles_init");
-
- schema = (ST_SCHEMA_TABLE*) p;
-
- schema->fields_info = Show::innodb_sys_datafiles_fields_info;
- schema->fill_table = i_s_sys_datafiles_fill_table;
-
- DBUG_RETURN(0);
-}
-
-UNIV_INTERN struct st_maria_plugin i_s_innodb_sys_datafiles =
-{
- /* the plugin type (a MYSQL_XXX_PLUGIN value) */
- /* int */
- MYSQL_INFORMATION_SCHEMA_PLUGIN,
-
- /* pointer to type-specific plugin descriptor */
- /* void* */
- &i_s_info,
-
- /* plugin name */
- /* const char* */
- "INNODB_SYS_DATAFILES",
-
- /* plugin author (for SHOW PLUGINS) */
- /* const char* */
- plugin_author,
-
- /* general descriptive text (for SHOW PLUGINS) */
- /* const char* */
- "InnoDB SYS_DATAFILES",
-
- /* the plugin license (PLUGIN_LICENSE_XXX) */
- /* int */
- PLUGIN_LICENSE_GPL,
-
- /* the function to invoke when plugin is loaded */
- /* int (*)(void*); */
- innodb_sys_datafiles_init,
-
- /* the function to invoke when plugin is unloaded */
- /* int (*)(void*); */
- i_s_common_deinit,
-
- /* plugin version (for SHOW PLUGINS) */
- /* unsigned int */
- INNODB_VERSION_SHORT,
-
- /* struct st_mysql_show_var* */
- NULL,
-
- /* struct st_mysql_sys_var** */
- NULL,
-
- /* Maria extension */
- INNODB_VERSION_STR,
- MariaDB_PLUGIN_MATURITY_STABLE,
-};
-
-namespace Show {
/** TABLESPACES_ENCRYPTION ********************************************/
/* Fields of the table INFORMATION_SCHEMA.INNODB_TABLESPACES_ENCRYPTION */
static ST_FIELD_INFO innodb_tablespaces_encryption_fields_info[]=
@@ -6890,8 +6625,7 @@ static ST_FIELD_INFO innodb_tablespaces_encryption_fields_info[]=
} // namespace Show
/**********************************************************************//**
-Function to fill INFORMATION_SCHEMA.INNODB_TABLESPACES_ENCRYPTION
-with information collected by scanning SYS_TABLESPACES table.
+Function to fill INFORMATION_SCHEMA.INNODB_TABLESPACES_ENCRYPTION.
@param[in] thd thread handle
@param[in] space Tablespace
@param[in] table_to_fill I_S table to fill
@@ -6905,7 +6639,6 @@ i_s_dict_fill_tablespaces_encryption(
{
Field** fields;
struct fil_space_crypt_status_t status;
-
DBUG_ENTER("i_s_dict_fill_tablespaces_encryption");
fields = table_to_fill->field;
@@ -6920,8 +6653,25 @@ i_s_dict_fill_tablespaces_encryption(
OK(fields[TABLESPACES_ENCRYPTION_SPACE]->store(space->id, true));
- OK(field_store_string(fields[TABLESPACES_ENCRYPTION_NAME],
- space->name));
+ {
+ const auto name = space->name();
+ if (name.data()) {
+ OK(fields[TABLESPACES_ENCRYPTION_NAME]->store(
+ name.data(), name.size(),
+ system_charset_info));
+ fields[TABLESPACES_ENCRYPTION_NAME]->set_notnull();
+ } else if (srv_is_undo_tablespace(space->id)) {
+ char undo_name[sizeof "innodb_undo000"];
+ snprintf(undo_name, sizeof(undo_name),
+ "innodb_undo%03u", space->id);
+ OK(fields[TABLESPACES_ENCRYPTION_NAME]->store(
+ undo_name, strlen(undo_name),
+ system_charset_info));
+ fields[TABLESPACES_ENCRYPTION_NAME]->set_notnull();
+ } else {
+ fields[TABLESPACES_ENCRYPTION_NAME]->set_null();
+ }
+ }
OK(fields[TABLESPACES_ENCRYPTION_ENCRYPTION_SCHEME]->store(
status.scheme, true));
@@ -6977,19 +6727,20 @@ i_s_tablespaces_encryption_fill_table(
}
int err = 0;
- mutex_enter(&fil_system.mutex);
+ mysql_mutex_lock(&fil_system.mutex);
fil_system.freeze_space_list++;
- for (fil_space_t* space = UT_LIST_GET_FIRST(fil_system.space_list);
- space; space = UT_LIST_GET_NEXT(space_list, space)) {
- if (space->purpose == FIL_TYPE_TABLESPACE
- && !space->is_stopping()) {
- space->reacquire();
- mutex_exit(&fil_system.mutex);
+ for (fil_space_t& space : fil_system.space_list) {
+ if (space.purpose == FIL_TYPE_TABLESPACE
+ && !space.is_stopping()) {
+ space.reacquire();
+ mysql_mutex_unlock(&fil_system.mutex);
+ space.s_lock();
err = i_s_dict_fill_tablespaces_encryption(
- thd, space, tables->table);
- mutex_enter(&fil_system.mutex);
- space->release();
+ thd, &space, tables->table);
+ space.s_unlock();
+ mysql_mutex_lock(&fil_system.mutex);
+ space.release();
if (err) {
break;
}
@@ -6997,7 +6748,7 @@ i_s_tablespaces_encryption_fill_table(
}
fil_system.freeze_space_list--;
- mutex_exit(&fil_system.mutex);
+ mysql_mutex_unlock(&fil_system.mutex);
DBUG_RETURN(err);
}
/*******************************************************************//**
@@ -7021,7 +6772,7 @@ innodb_tablespaces_encryption_init(
DBUG_RETURN(0);
}
-UNIV_INTERN struct st_maria_plugin i_s_innodb_tablespaces_encryption =
+struct st_maria_plugin i_s_innodb_tablespaces_encryption =
{
/* the plugin type (a MYSQL_XXX_PLUGIN value) */
/* int */
@@ -7069,318 +6820,3 @@ UNIV_INTERN struct st_maria_plugin i_s_innodb_tablespaces_encryption =
INNODB_VERSION_STR,
MariaDB_PLUGIN_MATURITY_STABLE
};
-
-namespace Show {
-/** INNODB_MUTEXES *********************************************/
-/* Fields of the dynamic table INFORMATION_SCHEMA.INNODB_MUTEXES */
-static ST_FIELD_INFO innodb_mutexes_fields_info[]=
-{
-#define MUTEXES_NAME 0
- Column("NAME", Varchar(OS_FILE_MAX_PATH), NOT_NULL),
-
-#define MUTEXES_CREATE_FILE 1
- Column("CREATE_FILE", Varchar(OS_FILE_MAX_PATH), NOT_NULL),
-
-#define MUTEXES_CREATE_LINE 2
- Column("CREATE_LINE", ULong(), NOT_NULL),
-
-#define MUTEXES_OS_WAITS 3
- Column("OS_WAITS", ULonglong(), NOT_NULL),
-
- CEnd()
-};
-} // namespace Show
-
-/*******************************************************************//**
-Function to populate INFORMATION_SCHEMA.INNODB_MUTEXES table.
-Loop through each record in mutex and rw_lock lists, and extract the column
-information and fill the INFORMATION_SCHEMA.INNODB_MUTEXES table.
-@return 0 on success */
-static
-int
-i_s_innodb_mutexes_fill_table(
-/*==========================*/
- THD* thd, /*!< in: thread */
- TABLE_LIST* tables, /*!< in/out: tables to fill */
- Item* ) /*!< in: condition (not used) */
-{
- ulint block_lock_oswait_count = 0;
- const rw_lock_t* block_lock= nullptr;
- Field** fields = tables->table->field;
-
- DBUG_ENTER("i_s_innodb_mutexes_fill_table");
- RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name.str);
-
- /* deny access to user without PROCESS_ACL privilege */
- if (check_global_access(thd, PROCESS_ACL)) {
- DBUG_RETURN(0);
- } else {
- struct Locking
- {
- Locking() { mutex_enter(&rw_lock_list_mutex); }
- ~Locking() { mutex_exit(&rw_lock_list_mutex); }
- } locking;
-
- char lock_name[sizeof "buf0dump.cc:12345"];
-
- for (const rw_lock_t& lock : rw_lock_list) {
- if (lock.count_os_wait == 0) {
- continue;
- }
-
- if (buf_pool.is_block_lock(&lock)) {
- block_lock = &lock;
- block_lock_oswait_count += lock.count_os_wait;
- continue;
- }
-
- const char* basename = innobase_basename(
- lock.cfile_name);
-
- snprintf(lock_name, sizeof lock_name, "%s:%u",
- basename, lock.cline);
-
- OK(field_store_string(fields[MUTEXES_NAME],
- lock_name));
- OK(field_store_string(fields[MUTEXES_CREATE_FILE],
- basename));
- OK(fields[MUTEXES_CREATE_LINE]->store(lock.cline,
- true));
- fields[MUTEXES_CREATE_LINE]->set_notnull();
- OK(fields[MUTEXES_OS_WAITS]->store(lock.count_os_wait,
- true));
- fields[MUTEXES_OS_WAITS]->set_notnull();
- OK(schema_table_store_record(thd, tables->table));
- }
-
- if (block_lock) {
- char buf1[IO_SIZE];
-
- snprintf(buf1, sizeof buf1, "combined %s",
- innobase_basename(block_lock->cfile_name));
-
- OK(field_store_string(fields[MUTEXES_NAME],
- "buf_block_t::lock"));
- OK(field_store_string(fields[MUTEXES_CREATE_FILE],
- buf1));
- OK(fields[MUTEXES_CREATE_LINE]->store(block_lock->cline,
- true));
- fields[MUTEXES_CREATE_LINE]->set_notnull();
- OK(fields[MUTEXES_OS_WAITS]->store(
- block_lock_oswait_count, true));
- fields[MUTEXES_OS_WAITS]->set_notnull();
- OK(schema_table_store_record(thd, tables->table));
- }
- }
-
- DBUG_RETURN(0);
-}
-
-/*******************************************************************//**
-Bind the dynamic table INFORMATION_SCHEMA.INNODB_MUTEXES
-@return 0 on success */
-static
-int
-innodb_mutexes_init(
-/*================*/
- void* p) /*!< in/out: table schema object */
-{
- ST_SCHEMA_TABLE* schema;
-
- DBUG_ENTER("innodb_mutexes_init");
-
- schema = (ST_SCHEMA_TABLE*) p;
-
- schema->fields_info = Show::innodb_mutexes_fields_info;
- schema->fill_table = i_s_innodb_mutexes_fill_table;
-
- DBUG_RETURN(0);
-}
-
-UNIV_INTERN struct st_maria_plugin i_s_innodb_mutexes =
-{
- /* the plugin type (a MYSQL_XXX_PLUGIN value) */
- /* int */
- MYSQL_INFORMATION_SCHEMA_PLUGIN,
-
- /* pointer to type-specific plugin descriptor */
- /* void* */
- &i_s_info,
-
- /* plugin name */
- /* const char* */
- "INNODB_MUTEXES",
-
- /* plugin author (for SHOW PLUGINS) */
- /* const char* */
- plugin_author,
-
- /* general descriptive text (for SHOW PLUGINS) */
- /* const char* */
- "InnoDB SYS_DATAFILES",
-
- /* the plugin license (PLUGIN_LICENSE_XXX) */
- /* int */
- PLUGIN_LICENSE_GPL,
-
- /* the function to invoke when plugin is loaded */
- /* int (*)(void*); */
- innodb_mutexes_init,
-
- /* the function to invoke when plugin is unloaded */
- /* int (*)(void*); */
- i_s_common_deinit,
-
- /* plugin version (for SHOW PLUGINS) */
- /* unsigned int */
- INNODB_VERSION_SHORT,
-
- /* struct st_mysql_show_var* */
- NULL,
-
- /* struct st_mysql_sys_var** */
- NULL,
-
- /* Maria extension */
- INNODB_VERSION_STR,
- MariaDB_PLUGIN_MATURITY_STABLE,
-};
-
-namespace Show {
-/* Fields of the dynamic table INFORMATION_SCHEMA.INNODB_SYS_SEMAPHORE_WAITS */
-static ST_FIELD_INFO innodb_sys_semaphore_waits_fields_info[]=
-{
- // SYS_SEMAPHORE_WAITS_THREAD_ID 0
- Column("THREAD_ID", ULonglong(), NOT_NULL),
-
- // SYS_SEMAPHORE_WAITS_OBJECT_NAME 1
- Column("OBJECT_NAME", Varchar(OS_FILE_MAX_PATH), NULLABLE),
-
- // SYS_SEMAPHORE_WAITS_FILE 2
- Column("FILE", Varchar(OS_FILE_MAX_PATH), NULLABLE),
-
- // SYS_SEMAPHORE_WAITS_LINE 3
- Column("LINE", ULong(), NOT_NULL),
-
- // SYS_SEMAPHORE_WAITS_WAIT_TIME 4
- Column("WAIT_TIME", ULonglong(), NOT_NULL),
-
- // SYS_SEMAPHORE_WAITS_WAIT_OBJECT 5
- Column("WAIT_OBJECT", ULonglong(), NOT_NULL),
-
- // SYS_SEMAPHORE_WAITS_WAIT_TYPE 6
- Column("WAIT_TYPE", Varchar(16), NULLABLE),
-
- // SYS_SEMAPHORE_WAITS_HOLDER_THREAD_ID 7
- Column("HOLDER_THREAD_ID", ULonglong(), NOT_NULL),
-
- // SYS_SEMAPHORE_WAITS_HOLDER_FILE 8
- Column("HOLDER_FILE", Varchar(OS_FILE_MAX_PATH), NULLABLE),
-
- // SYS_SEMAPHORE_WAITS_HOLDER_LINE 9
- Column("HOLDER_LINE", ULong(), NOT_NULL),
-
- // SYS_SEMAPHORE_WAITS_CREATED_FILE 10
- Column("CREATED_FILE", Varchar(OS_FILE_MAX_PATH), NULLABLE),
-
- // SYS_SEMAPHORE_WAITS_CREATED_LINE 11
- Column("CREATED_LINE", ULong(), NOT_NULL),
-
- // SYS_SEMAPHORE_WAITS_WRITER_THREAD 12
- Column("WRITER_THREAD", ULonglong(), NOT_NULL),
-
- // SYS_SEMAPHORE_WAITS_RESERVATION_MODE 13
- Column("RESERVATION_MODE", Varchar(16), NULLABLE),
-
- // SYS_SEMAPHORE_WAITS_READERS 14
- Column("READERS", ULong(), NOT_NULL),
-
- // SYS_SEMAPHORE_WAITS_WAITERS_FLAG 15
- Column("WAITERS_FLAG", ULonglong(), NOT_NULL),
-
- // SYS_SEMAPHORE_WAITS_LOCK_WORD 16
- Column("LOCK_WORD", ULonglong(), NOT_NULL),
-
- // SYS_SEMAPHORE_WAITS_LAST_WRITER_FILE 17
- Column("LAST_WRITER_FILE", Varchar(OS_FILE_MAX_PATH), NULLABLE),
-
- // SYS_SEMAPHORE_WAITS_LAST_WRITER_LINE 18
- Column("LAST_WRITER_LINE", ULong(), NOT_NULL),
-
- // SYS_SEMAPHORE_WAITS_OS_WAIT_COUNT 19
- Column("OS_WAIT_COUNT", ULong(), NOT_NULL),
-
- CEnd()
-};
-} // namespace Show
-
-
-/*******************************************************************//**
-Bind the dynamic table INFORMATION_SCHEMA.INNODB_SYS_SEMAPHORE_WAITS
-@return 0 on success */
-static
-int
-innodb_sys_semaphore_waits_init(
-/*============================*/
- void* p) /*!< in/out: table schema object */
-{
- ST_SCHEMA_TABLE* schema;
-
- DBUG_ENTER("innodb_sys_semaphore_waits_init");
-
- schema = (ST_SCHEMA_TABLE*) p;
-
- schema->fields_info = Show::innodb_sys_semaphore_waits_fields_info;
- schema->fill_table = sync_arr_fill_sys_semphore_waits_table;
-
- DBUG_RETURN(0);
-}
-
-UNIV_INTERN struct st_maria_plugin i_s_innodb_sys_semaphore_waits =
-{
- /* the plugin type (a MYSQL_XXX_PLUGIN value) */
- /* int */
- MYSQL_INFORMATION_SCHEMA_PLUGIN,
-
- /* pointer to type-specific plugin descriptor */
- /* void* */
- &i_s_info,
-
- /* plugin name */
- /* const char* */
- "INNODB_SYS_SEMAPHORE_WAITS",
-
- /* plugin author (for SHOW PLUGINS) */
- /* const char* */
- maria_plugin_author,
-
- /* general descriptive text (for SHOW PLUGINS) */
- /* const char* */
- "InnoDB SYS_SEMAPHORE_WAITS",
-
- /* the plugin license (PLUGIN_LICENSE_XXX) */
- /* int */
- PLUGIN_LICENSE_GPL,
-
- /* the function to invoke when plugin is loaded */
- /* int (*)(void*); */
- innodb_sys_semaphore_waits_init,
-
- /* the function to invoke when plugin is unloaded */
- /* int (*)(void*); */
- i_s_common_deinit,
-
- /* plugin version (for SHOW PLUGINS) */
- /* unsigned int */
- INNODB_VERSION_SHORT,
-
- /* struct st_mysql_show_var* */
- NULL,
-
- /* struct st_mysql_sys_var** */
- NULL,
-
- /* Maria extension */
- INNODB_VERSION_STR,
- MariaDB_PLUGIN_MATURITY_STABLE,
-};
diff --git a/storage/innobase/handler/i_s.h b/storage/innobase/handler/i_s.h
index 87799e7669c..c8190a4112e 100644
--- a/storage/innobase/handler/i_s.h
+++ b/storage/innobase/handler/i_s.h
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 2007, 2015, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2014, 2019, MariaDB Corporation.
+Copyright (c) 2014, 2021, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -59,11 +59,8 @@ extern struct st_maria_plugin i_s_innodb_sys_fields;
extern struct st_maria_plugin i_s_innodb_sys_foreign;
extern struct st_maria_plugin i_s_innodb_sys_foreign_cols;
extern struct st_maria_plugin i_s_innodb_sys_tablespaces;
-extern struct st_maria_plugin i_s_innodb_sys_datafiles;
-extern struct st_maria_plugin i_s_innodb_mutexes;
extern struct st_maria_plugin i_s_innodb_sys_virtual;
extern struct st_maria_plugin i_s_innodb_tablespaces_encryption;
-extern struct st_maria_plugin i_s_innodb_sys_semaphore_waits;
/** The latest successfully looked up innodb_fts_aux_table */
extern table_id_t innodb_ft_aux_table_id;
@@ -91,40 +88,4 @@ do { \
} \
} while (0)
-/* Don't use a static const variable here, as some C++ compilers (notably
-HPUX aCC: HP ANSI C++ B3910B A.03.65) can't handle it. */
-#define END_OF_ST_FIELD_INFO {NULL,0,MYSQL_TYPE_NULL,0,0,"",SKIP_OPEN_TABLE}
-
-/** Fields on INFORMATION_SCHEMA.SYS_SEMAMPHORE_WAITS table */
-#define SYS_SEMAPHORE_WAITS_THREAD_ID 0
-#define SYS_SEMAPHORE_WAITS_OBJECT_NAME 1
-#define SYS_SEMAPHORE_WAITS_FILE 2
-#define SYS_SEMAPHORE_WAITS_LINE 3
-#define SYS_SEMAPHORE_WAITS_WAIT_TIME 4
-#define SYS_SEMAPHORE_WAITS_WAIT_OBJECT 5
-#define SYS_SEMAPHORE_WAITS_WAIT_TYPE 6
-#define SYS_SEMAPHORE_WAITS_HOLDER_THREAD_ID 7
-#define SYS_SEMAPHORE_WAITS_HOLDER_FILE 8
-#define SYS_SEMAPHORE_WAITS_HOLDER_LINE 9
-#define SYS_SEMAPHORE_WAITS_CREATED_FILE 10
-#define SYS_SEMAPHORE_WAITS_CREATED_LINE 11
-#define SYS_SEMAPHORE_WAITS_WRITER_THREAD 12
-#define SYS_SEMAPHORE_WAITS_RESERVATION_MODE 13
-#define SYS_SEMAPHORE_WAITS_READERS 14
-#define SYS_SEMAPHORE_WAITS_WAITERS_FLAG 15
-#define SYS_SEMAPHORE_WAITS_LOCK_WORD 16
-#define SYS_SEMAPHORE_WAITS_LAST_WRITER_FILE 17
-#define SYS_SEMAPHORE_WAITS_LAST_WRITER_LINE 18
-#define SYS_SEMAPHORE_WAITS_OS_WAIT_COUNT 19
-
-/*******************************************************************//**
-Auxiliary function to store char* value in MYSQL_TYPE_STRING field.
-@return 0 on success */
-int
-field_store_string(
-/*===============*/
- Field* field, /*!< in/out: target field for storage */
- const char* str); /*!< in: NUL-terminated utf-8 string,
- or NULL */
-
#endif /* i_s_h */
diff --git a/storage/innobase/ibuf/ibuf0ibuf.cc b/storage/innobase/ibuf/ibuf0ibuf.cc
index fc97aabfa13..2cdbbd88330 100644
--- a/storage/innobase/ibuf/ibuf0ibuf.cc
+++ b/storage/innobase/ibuf/ibuf0ibuf.cc
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 1997, 2016, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2016, 2022, MariaDB Corporation.
+Copyright (c) 2016, 2023, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -25,7 +25,6 @@ Created 7/19/1997 Heikki Tuuri
*******************************************************/
#include "ibuf0ibuf.h"
-#include "sync0sync.h"
#include "btr0sea.h"
/** Number of bits describing a single page */
@@ -235,11 +234,13 @@ type, counter, and some flags. */
format or later */
-/** The mutex used to block pessimistic inserts to ibuf trees */
-static ib_mutex_t ibuf_pessimistic_insert_mutex;
-
-/** The mutex protecting the insert buffer structs */
-static ib_mutex_t ibuf_mutex;
+#ifndef SAFE_MUTEX
+static
+#endif /* SAFE_MUTEX */
+/** The mutex protecting the insert buffer */
+mysql_mutex_t ibuf_mutex,
+ /** The mutex covering pessimistic inserts into the change buffer */
+ ibuf_pessimistic_insert_mutex;
/** The area in pages from which contract looks for page numbers for merge */
const ulint IBUF_MERGE_AREA = 8;
@@ -313,47 +314,30 @@ ibuf_header_page_get(
/*=================*/
mtr_t* mtr) /*!< in/out: mini-transaction */
{
- buf_block_t* block;
-
ut_ad(!ibuf_inside(mtr));
- page_t* page = NULL;
- block = buf_page_get(
+ buf_block_t* block = buf_page_get(
page_id_t(IBUF_SPACE_ID, FSP_IBUF_HEADER_PAGE_NO),
0, RW_X_LATCH, mtr);
- if (block) {
- buf_block_dbg_add_level(block, SYNC_IBUF_HEADER);
- page = buf_block_get_frame(block);
- }
-
- return page;
+ return block ? block->page.frame : nullptr;
}
/** Acquire the change buffer root page.
@param[in,out] mtr mini-transaction
@return change buffer root page, SX-latched */
-static buf_block_t *ibuf_tree_root_get(mtr_t *mtr)
+static buf_block_t *ibuf_tree_root_get(mtr_t *mtr, dberr_t *err= nullptr)
{
- buf_block_t* block;
-
- ut_ad(ibuf_inside(mtr));
- ut_ad(mutex_own(&ibuf_mutex));
-
- mtr_sx_lock_index(ibuf.index, mtr);
-
- /* only segment list access is exclusive each other */
- block = buf_page_get(
- page_id_t(IBUF_SPACE_ID, FSP_IBUF_TREE_ROOT_PAGE_NO),
- 0, RW_SX_LATCH, mtr);
+ ut_ad(ibuf_inside(mtr));
+ mysql_mutex_assert_owner(&ibuf_mutex);
- buf_block_dbg_add_level(block, SYNC_IBUF_TREE_NODE_NEW);
+ mtr_sx_lock_index(ibuf.index, mtr);
- ut_ad(page_get_space_id(block->frame) == IBUF_SPACE_ID);
- ut_ad(page_get_page_no(block->frame) == FSP_IBUF_TREE_ROOT_PAGE_NO);
- ut_ad(ibuf.empty == page_is_empty(block->frame));
-
- return block;
+ buf_block_t *block=
+ buf_page_get_gen(page_id_t{IBUF_SPACE_ID, FSP_IBUF_TREE_ROOT_PAGE_NO},
+ 0, RW_SX_LATCH, nullptr, BUF_GET, mtr, err);
+ ut_ad(!block || ibuf.empty == page_is_empty(block->page.frame));
+ return block;
}
/******************************************************************//**
@@ -366,12 +350,11 @@ ibuf_close(void)
return;
}
- mutex_free(&ibuf_pessimistic_insert_mutex);
-
- mutex_free(&ibuf_mutex);
+ mysql_mutex_destroy(&ibuf_pessimistic_insert_mutex);
+ mysql_mutex_destroy(&ibuf_mutex);
dict_table_t* ibuf_table = ibuf.index->table;
- rw_lock_free(&ibuf.index->lock);
+ ibuf.index->lock.free();
dict_mem_index_free(ibuf.index);
dict_mem_table_free(ibuf_table);
ibuf.index = NULL;
@@ -386,7 +369,7 @@ ibuf_size_update(
/*=============*/
const page_t* root) /*!< in: ibuf tree root */
{
- ut_ad(mutex_own(&ibuf_mutex));
+ mysql_mutex_assert_owner(&ibuf_mutex);
ibuf.free_list_len = flst_get_len(root + PAGE_HEADER
+ PAGE_BTR_IBUF_FREE_LIST);
@@ -406,19 +389,17 @@ ibuf_init_at_db_start(void)
/*=======================*/
{
page_t* root;
- ulint n_used;
ut_ad(!ibuf.index);
- dberr_t err;
mtr_t mtr;
mtr.start();
compile_time_assert(IBUF_SPACE_ID == TRX_SYS_SPACE);
compile_time_assert(IBUF_SPACE_ID == 0);
- mtr_x_lock_space(fil_system.sys_space, &mtr);
+ mtr.x_lock_space(fil_system.sys_space);
+ dberr_t err;
buf_block_t* header_page = buf_page_get_gen(
page_id_t(IBUF_SPACE_ID, FSP_IBUF_HEADER_PAGE_NO),
- 0, RW_X_LATCH, nullptr, BUF_GET,
- __FILE__, __LINE__, &mtr, &err);
+ 0, RW_X_LATCH, nullptr, BUF_GET, &mtr, &err);
if (!header_page) {
err_exit:
@@ -434,25 +415,20 @@ err_exit:
fseg_n_reserved_pages(*header_page,
IBUF_HEADER + IBUF_TREE_SEG_HEADER
- + header_page->frame, &n_used, &mtr);
-
- ut_ad(n_used >= 2);
-
- ibuf.seg_size = n_used;
-
- {
- buf_block_t* block;
-
- block = buf_page_get_gen(
- page_id_t(IBUF_SPACE_ID, FSP_IBUF_TREE_ROOT_PAGE_NO),
- 0, RW_X_LATCH, nullptr, BUF_GET,
- __FILE__, __LINE__, &mtr, &err);
-
- if (!block) goto err_exit;
-
- buf_block_dbg_add_level(block, SYNC_IBUF_TREE_NODE);
+ + header_page->page.frame, &ibuf.seg_size, &mtr);
+ do {
+ DBUG_EXECUTE_IF("intermittent_read_failure", continue;);
+ ut_ad(ibuf.seg_size >= 2);
+ } while (0);
+
+ if (buf_block_t* block =
+ buf_page_get_gen(page_id_t(IBUF_SPACE_ID,
+ FSP_IBUF_TREE_ROOT_PAGE_NO),
+ 0, RW_X_LATCH, nullptr, BUF_GET, &mtr, &err)) {
root = buf_block_get_frame(block);
+ } else {
+ goto err_exit;
}
DBUG_EXECUTE_IF("ibuf_init_corrupt",
@@ -465,7 +441,7 @@ err_exit:
goto err_exit;
}
- /* At startup we initialize ibuf to have a maximum of
+ /* At startup we initialize ibuf to have a maximum of
CHANGE_BUFFER_DEFAULT_SIZE in terms of percentage of the
buffer pool size. Once ibuf struct is initialized this
value is updated with the user supplied size by calling
@@ -473,28 +449,26 @@ err_exit:
ibuf.max_size = ((buf_pool_get_curr_size() >> srv_page_size_shift)
* CHANGE_BUFFER_DEFAULT_SIZE) / 100;
- mutex_create(LATCH_ID_IBUF, &ibuf_mutex);
-
- mutex_create(LATCH_ID_IBUF_PESSIMISTIC_INSERT,
- &ibuf_pessimistic_insert_mutex);
-
- mutex_enter(&ibuf_mutex);
+ mysql_mutex_init(ibuf_mutex_key, &ibuf_mutex, nullptr);
+ mysql_mutex_init(ibuf_pessimistic_insert_mutex_key,
+ &ibuf_pessimistic_insert_mutex, nullptr);
+ mysql_mutex_lock(&ibuf_mutex);
ibuf_size_update(root);
- mutex_exit(&ibuf_mutex);
+ mysql_mutex_unlock(&ibuf_mutex);
ibuf.empty = page_is_empty(root);
mtr.commit();
ibuf.index = dict_mem_index_create(
- dict_mem_table_create("innodb_change_buffer",
- fil_system.sys_space, 1, 0, 0, 0),
+ dict_table_t::create(
+ {C_STRING_WITH_LEN("innodb_change_buffer")},
+ fil_system.sys_space, 1, 0, 0, 0),
"CLUST_IND",
DICT_CLUSTERED | DICT_IBUF, 1);
ibuf.index->id = DICT_IBUF_ID_MIN + IBUF_SPACE_ID;
ibuf.index->n_uniq = REC_MAX_N_FIELDS;
- rw_lock_create(index_tree_rw_lock_key, &ibuf.index->lock,
- SYNC_IBUF_INDEX_TREE);
+ ibuf.index->lock.SRW_LOCK_INIT(index_tree_rw_lock_key);
#ifdef BTR_CUR_ADAPT
ibuf.index->search_info = btr_search_info_create(ibuf.index->heap);
#endif /* BTR_CUR_ADAPT */
@@ -508,9 +482,8 @@ err_exit:
ib::info() << "Dumping the change buffer";
ibuf_mtr_start(&mtr);
btr_pcur_t pcur;
- if (DB_SUCCESS == btr_pcur_open_at_index_side(
- true, ibuf.index, BTR_SEARCH_LEAF, &pcur,
- true, 0, &mtr)) {
+ if (DB_SUCCESS
+ == pcur.open_leaf(true, ibuf.index, BTR_SEARCH_LEAF, &mtr)) {
while (btr_pcur_move_to_next_user_rec(&pcur, &mtr)) {
rec_print_old(stderr, btr_pcur_get_rec(&pcur));
}
@@ -533,9 +506,9 @@ ibuf_max_size_update(
if (UNIV_UNLIKELY(!ibuf.index)) return;
ulint new_size = ((buf_pool_get_curr_size() >> srv_page_size_shift)
* new_val) / 100;
- mutex_enter(&ibuf_mutex);
+ mysql_mutex_lock(&ibuf_mutex);
ibuf.max_size = new_size;
- mutex_exit(&ibuf_mutex);
+ mysql_mutex_unlock(&ibuf_mutex);
}
# ifdef UNIV_DEBUG
@@ -648,7 +621,7 @@ ibuf_bitmap_page_set_bits(
ut_ad(byte_offset + IBUF_BITMAP < srv_page_size);
- byte* map_byte = &block->frame[IBUF_BITMAP + byte_offset];
+ byte* map_byte = &block->page.frame[IBUF_BITMAP + byte_offset];
byte b = *map_byte;
if (bit == IBUF_BITMAP_FREE) {
@@ -690,45 +663,22 @@ inline page_id_t ibuf_bitmap_page_no_calc(const page_id_t page_id, ulint size)
stored.
@param[in] page_id page id of the file page
@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
-@param[in] file file name
-@param[in] line line where called
@param[in,out] mtr mini-transaction
@return bitmap page where the file page is mapped, that is, the bitmap
page containing the descriptor bits for the file page; the bitmap page
is x-latched */
static
buf_block_t*
-ibuf_bitmap_get_map_page_func(
+ibuf_bitmap_get_map_page(
const page_id_t page_id,
ulint zip_size,
- const char* file,
- unsigned line,
mtr_t* mtr)
{
- buf_block_t* block = buf_page_get_gen(
- ibuf_bitmap_page_no_calc(page_id, zip_size),
- zip_size, RW_X_LATCH, NULL, BUF_GET_POSSIBLY_FREED,
- file, line, mtr);
-
- if (block) {
- buf_block_dbg_add_level(block, SYNC_IBUF_BITMAP);
- }
-
- return block;
+ return buf_page_get_gen(ibuf_bitmap_page_no_calc(page_id, zip_size),
+ zip_size, RW_X_LATCH, nullptr,
+ BUF_GET_POSSIBLY_FREED, mtr);
}
-/** Gets the ibuf bitmap page where the bits describing a given file page are
-stored.
-@param[in] page_id page id of the file page
-@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
-@param[in,out] mtr mini-transaction
-@return bitmap page where the file page is mapped, that is, the bitmap
-page containing the descriptor bits for the file page; the bitmap page
-is x-latched */
-#define ibuf_bitmap_get_map_page(page_id, zip_size, mtr) \
- ibuf_bitmap_get_map_page_func(page_id, zip_size, \
- __FILE__, __LINE__, mtr)
-
/************************************************************************//**
Sets the free bits of the page in the ibuf bitmap. This is done in a separate
mini-transaction, hence this operation does not restrict further work to only
@@ -745,7 +695,7 @@ ibuf_set_free_bits_low(
mtr_t* mtr) /*!< in/out: mtr */
{
ut_ad(mtr->is_named_space(block->page.id().space()));
- if (!page_is_leaf(block->frame)) {
+ if (!page_is_leaf(block->page.frame)) {
return;
}
@@ -779,42 +729,36 @@ ibuf_set_free_bits_func(
#endif /* UNIV_IBUF_DEBUG */
ulint val) /*!< in: value to set: < 4 */
{
- if (!page_is_leaf(block->frame)) {
- return;
- }
-
- mtr_t mtr;
- mtr.start();
- const page_id_t id(block->page.id());
-
- const fil_space_t* space = mtr.set_named_space_id(id.space());
+ if (!page_is_leaf(block->page.frame))
+ return;
- buf_block_t* bitmap_page = ibuf_bitmap_get_map_page(id,
- block->zip_size(),
- &mtr);
+ mtr_t mtr;
+ mtr.start();
+ const page_id_t id(block->page.id());
+ const fil_space_t *space= mtr.set_named_space_id(id.space());
- if (space->purpose != FIL_TYPE_TABLESPACE) {
- mtr.set_log_mode(MTR_LOG_NO_REDO);
- }
+ if (buf_block_t *bitmap_page=
+ ibuf_bitmap_get_map_page(id, block->zip_size(), &mtr))
+ {
+ if (space->purpose != FIL_TYPE_TABLESPACE)
+ mtr.set_log_mode(MTR_LOG_NO_REDO);
#ifdef UNIV_IBUF_DEBUG
- if (max_val != ULINT_UNDEFINED) {
- ulint old_val;
-
- old_val = ibuf_bitmap_page_get_bits(
- bitmap_page, id,
- IBUF_BITMAP_FREE, &mtr);
- ut_a(old_val <= max_val);
- }
+ if (max_val != ULINT_UNDEFINED)
+ {
+ ulint old_val= ibuf_bitmap_page_get_bits(bitmap_page, id,
+ IBUF_BITMAP_FREE, &mtr);
+ ut_a(old_val <= max_val);
+ }
- ut_a(val <= ibuf_index_page_calc_free(block));
+ ut_a(val <= ibuf_index_page_calc_free(block));
#endif /* UNIV_IBUF_DEBUG */
- ibuf_bitmap_page_set_bits<IBUF_BITMAP_FREE>(
- bitmap_page, id, block->physical_size(),
- val, &mtr);
+ ibuf_bitmap_page_set_bits<IBUF_BITMAP_FREE>
+ (bitmap_page, id, block->physical_size(), val, &mtr);
+ }
- mtr.commit();
+ mtr.commit();
}
/************************************************************************//**
@@ -889,7 +833,7 @@ ibuf_update_free_bits_zip(
buf_block_t* block, /*!< in/out: index page */
mtr_t* mtr) /*!< in/out: mtr */
{
- ut_ad(page_is_leaf(block->frame));
+ ut_ad(page_is_leaf(block->page.frame));
ut_ad(block->zip_size());
ulint after = ibuf_index_page_calc_free_zip(block);
@@ -954,8 +898,6 @@ Must not be called when recv_no_ibuf_operations==true.
@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
@param[in] x_latch FALSE if relaxed check (avoid latching the
bitmap page)
-@param[in] file file name
-@param[in] line line where called
@param[in,out] mtr mtr which will contain an x-latch to the
bitmap page if the page is not one of the fixed address ibuf pages, or NULL,
in which case a new transaction is created.
@@ -967,8 +909,6 @@ ibuf_page_low(
#ifdef UNIV_DEBUG
bool x_latch,
#endif /* UNIV_DEBUG */
- const char* file,
- unsigned line,
mtr_t* mtr)
{
ibool ret;
@@ -1000,15 +940,13 @@ ibuf_page_low(
not be modified by any other thread. Nobody should be
calling ibuf_add_free_page() or ibuf_remove_free_page()
while the page is linked to the insert buffer b-tree. */
- dberr_t err = DB_SUCCESS;
-
buf_block_t* block = buf_page_get_gen(
ibuf_bitmap_page_no_calc(page_id, zip_size),
- zip_size, RW_NO_LATCH, NULL, BUF_GET_NO_LATCH,
- file, line, &local_mtr, &err);
+ zip_size, RW_NO_LATCH, nullptr, BUF_GET, &local_mtr);
- ret = ibuf_bitmap_page_get_bits_low(
- block->frame, page_id, zip_size,
+ ret = block
+ && ibuf_bitmap_page_get_bits_low(
+ block->page.frame, page_id, zip_size,
MTR_MEMO_BUF_FIX, &local_mtr, IBUF_BITMAP_IBUF);
mtr_commit(&local_mtr);
@@ -1021,11 +959,12 @@ ibuf_page_low(
mtr_start(mtr);
}
- ret = ibuf_bitmap_page_get_bits(ibuf_bitmap_get_map_page_func(
- page_id, zip_size, file, line,
- mtr)->frame,
- page_id, zip_size,
- IBUF_BITMAP_IBUF, mtr);
+ buf_block_t *block = ibuf_bitmap_get_map_page(page_id, zip_size,
+ mtr);
+ ret = block
+ && ibuf_bitmap_page_get_bits(block->page.frame,
+ page_id, zip_size,
+ IBUF_BITMAP_IBUF, mtr);
if (mtr == &local_mtr) {
mtr_commit(mtr);
@@ -1317,8 +1256,9 @@ ibuf_dummy_index_create(
dict_table_t* table;
dict_index_t* index;
- table = dict_mem_table_create("IBUF_DUMMY", NULL, n, 0,
- comp ? DICT_TF_COMPACT : 0, 0);
+ table = dict_table_t::create({C_STRING_WITH_LEN("IBUF_DUMMY")},
+ nullptr, n, 0,
+ comp ? DICT_TF_COMPACT : 0, 0);
index = dict_mem_index_create(table, "IBUF_DUMMY", 0, n);
@@ -1806,7 +1746,7 @@ dare to start a pessimistic insert to the insert buffer.
@return whether enough free pages in list */
static inline bool ibuf_data_enough_free_for_insert()
{
- ut_ad(mutex_own(&ibuf_mutex));
+ mysql_mutex_assert_owner(&ibuf_mutex);
/* We want a big margin of free pages, because a B-tree can sometimes
grow in size also if records are deleted from it, as the node pointers
@@ -1826,7 +1766,7 @@ ibool
ibuf_data_too_much_free(void)
/*=========================*/
{
- ut_ad(mutex_own(&ibuf_mutex));
+ mysql_mutex_assert_owner(&ibuf_mutex);
return(ibuf.free_list_len >= 3 + (ibuf.size / 2) + 3 * ibuf.height);
}
@@ -1843,8 +1783,12 @@ static bool ibuf_add_free_page()
mtr.start();
/* Acquire the fsp latch before the ibuf header, obeying the latching
order */
- mtr_x_lock_space(fil_system.sys_space, &mtr);
+ mtr.x_lock_space(fil_system.sys_space);
header_page = ibuf_header_page_get(&mtr);
+ if (!header_page) {
+ mtr.commit();
+ return false;
+ }
/* Allocate a new page: NOTE that if the page has been a part of a
non-clustered index which has subsequently been dropped, then the
@@ -1856,32 +1800,40 @@ static bool ibuf_add_free_page()
of a deadlock. This is the reason why we created a special ibuf
header page apart from the ibuf tree. */
- block = fseg_alloc_free_page(
+ dberr_t err;
+ block = fseg_alloc_free_page_general(
header_page + IBUF_HEADER + IBUF_TREE_SEG_HEADER, 0, FSP_UP,
- &mtr);
+ false, &mtr, &mtr, &err);
- if (block == NULL) {
+ if (!block) {
mtr.commit();
return false;
}
- ut_ad(rw_lock_get_x_lock_count(&block->lock) == 1);
+ ut_ad(block->page.lock.not_recursive());
ibuf_enter(&mtr);
- mutex_enter(&ibuf_mutex);
+ mysql_mutex_lock(&ibuf_mutex);
- buf_block_dbg_add_level(block, SYNC_IBUF_TREE_NODE_NEW);
-
- mtr.write<2>(*block, block->frame + FIL_PAGE_TYPE,
+ mtr.write<2>(*block, block->page.frame + FIL_PAGE_TYPE,
FIL_PAGE_IBUF_FREE_LIST);
+ buf_block_t* ibuf_root = ibuf_tree_root_get(&mtr);
+ if (UNIV_UNLIKELY(!ibuf_root)) {
+corrupted:
+ /* Do not bother to try to free the allocated block, because
+ the change buffer is seriously corrupted already. */
+ mysql_mutex_unlock(&ibuf_mutex);
+ ibuf_mtr_commit(&mtr);
+ return false;
+ }
/* Add the page to the free list and update the ibuf size data */
- flst_add_last(ibuf_tree_root_get(&mtr),
- PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST,
- block, PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST_NODE, &mtr);
-
- ibuf.seg_size++;
- ibuf.free_list_len++;
+ err = flst_add_last(ibuf_root, PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST,
+ block, PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST_NODE,
+ &mtr);
+ if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
+ goto corrupted;
+ }
/* Set the bit indicating that this page is now an ibuf tree page
(level 2 page) */
@@ -1889,22 +1841,24 @@ static bool ibuf_add_free_page()
const page_id_t page_id(block->page.id());
buf_block_t* bitmap_page = ibuf_bitmap_get_map_page(page_id, 0, &mtr);
- mutex_exit(&ibuf_mutex);
+ if (UNIV_UNLIKELY(!bitmap_page)) {
+ goto corrupted;
+ }
- ibuf_bitmap_page_set_bits<IBUF_BITMAP_IBUF>(bitmap_page, page_id,
- srv_page_size, true,
- &mtr);
+ ibuf.seg_size++;
+ ibuf.free_list_len++;
+
+ mysql_mutex_unlock(&ibuf_mutex);
+ ibuf_bitmap_page_set_bits<IBUF_BITMAP_IBUF>(bitmap_page, page_id,
+ srv_page_size, true, &mtr);
ibuf_mtr_commit(&mtr);
return true;
}
/*********************************************************************//**
Removes a page from the free list and frees it to the fsp system. */
-static
-void
-ibuf_remove_free_page(void)
-/*=======================*/
+static void ibuf_remove_free_page()
{
mtr_t mtr;
mtr_t mtr2;
@@ -1916,18 +1870,18 @@ ibuf_remove_free_page(void)
/* Acquire the fsp latch before the ibuf header, obeying the latching
order */
- mtr_x_lock_space(fil_system.sys_space, &mtr);
+ mtr.x_lock_space(fil_system.sys_space);
header_page = ibuf_header_page_get(&mtr);
/* Prevent pessimistic inserts to insert buffer trees for a while */
ibuf_enter(&mtr);
- mutex_enter(&ibuf_pessimistic_insert_mutex);
- mutex_enter(&ibuf_mutex);
-
- if (!ibuf_data_too_much_free()) {
+ mysql_mutex_lock(&ibuf_pessimistic_insert_mutex);
+ mysql_mutex_lock(&ibuf_mutex);
- mutex_exit(&ibuf_mutex);
- mutex_exit(&ibuf_pessimistic_insert_mutex);
+ if (!header_page || !ibuf_data_too_much_free()) {
+early_exit:
+ mysql_mutex_unlock(&ibuf_mutex);
+ mysql_mutex_unlock(&ibuf_pessimistic_insert_mutex);
ibuf_mtr_commit(&mtr);
@@ -1938,10 +1892,16 @@ ibuf_remove_free_page(void)
buf_block_t* root = ibuf_tree_root_get(&mtr2);
- mutex_exit(&ibuf_mutex);
+ if (UNIV_UNLIKELY(!root)) {
+ ibuf_mtr_commit(&mtr2);
+ goto early_exit;
+ }
+
+ mysql_mutex_unlock(&ibuf_mutex);
- uint32_t page_no = flst_get_last(PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST
- + root->frame).page;
+ const uint32_t page_no = flst_get_last(PAGE_HEADER
+ + PAGE_BTR_IBUF_FREE_LIST
+ + root->page.frame).page;
/* NOTE that we must release the latch on the ibuf tree root
because in fseg_free_page we access level 1 pages, and the root
@@ -1957,44 +1917,60 @@ ibuf_remove_free_page(void)
page from it. */
compile_time_assert(IBUF_SPACE_ID == 0);
- fseg_free_page(header_page + IBUF_HEADER + IBUF_TREE_SEG_HEADER,
- fil_system.sys_space, page_no, &mtr);
+ const page_id_t page_id{IBUF_SPACE_ID, page_no};
+ buf_block_t* bitmap_page = nullptr;
+ dberr_t err = fseg_free_page(
+ header_page + IBUF_HEADER + IBUF_TREE_SEG_HEADER,
+ fil_system.sys_space, page_no, &mtr);
- const page_id_t page_id(IBUF_SPACE_ID, page_no);
+ if (err != DB_SUCCESS) {
+ goto func_exit;
+ }
ibuf_enter(&mtr);
- mutex_enter(&ibuf_mutex);
+ mysql_mutex_lock(&ibuf_mutex);
- root = ibuf_tree_root_get(&mtr);
+ root = ibuf_tree_root_get(&mtr, &err);
+ if (UNIV_UNLIKELY(!root)) {
+ mysql_mutex_unlock(&ibuf_pessimistic_insert_mutex);
+ goto func_exit;
+ }
ut_ad(page_no == flst_get_last(PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST
- + root->frame).page);
-
- buf_block_t* block = buf_page_get(page_id, 0, RW_X_LATCH, &mtr);
- buf_block_dbg_add_level(block, SYNC_IBUF_TREE_NODE);
+ + root->page.frame).page);
/* Remove the page from the free list and update the ibuf size data */
+ if (buf_block_t* block =
+ buf_page_get_gen(page_id, 0, RW_X_LATCH, nullptr, BUF_GET,
+ &mtr, &err)) {
+ err = flst_remove(root, PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST,
+ block,
+ PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST_NODE,
+ &mtr);
+ }
- flst_remove(root, PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST,
- block, PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST_NODE, &mtr);
-
- mutex_exit(&ibuf_pessimistic_insert_mutex);
-
- ibuf.seg_size--;
- ibuf.free_list_len--;
+ mysql_mutex_unlock(&ibuf_pessimistic_insert_mutex);
- /* Set the bit indicating that this page is no more an ibuf tree page
- (level 2 page) */
-
- buf_block_t* bitmap_page = ibuf_bitmap_get_map_page(page_id, 0, &mtr);
+ if (err == DB_SUCCESS) {
+ ibuf.seg_size--;
+ ibuf.free_list_len--;
+ bitmap_page = ibuf_bitmap_get_map_page(page_id, 0, &mtr);
+ }
- mutex_exit(&ibuf_mutex);
+func_exit:
+ mysql_mutex_unlock(&ibuf_mutex);
- ibuf_bitmap_page_set_bits<IBUF_BITMAP_IBUF>(
- bitmap_page, page_id, srv_page_size, false, &mtr);
+ if (bitmap_page) {
+ /* Set the bit indicating that this page is no more an
+ ibuf tree page (level 2 page) */
+ ibuf_bitmap_page_set_bits<IBUF_BITMAP_IBUF>(
+ bitmap_page, page_id, srv_page_size, false, &mtr);
+ }
- buf_page_free(fil_system.sys_space, page_no, &mtr, __FILE__, __LINE__);
+ if (err == DB_SUCCESS) {
+ buf_page_free(fil_system.sys_space, page_no, &mtr);
+ }
ibuf_mtr_commit(&mtr);
}
@@ -2015,9 +1991,9 @@ ibuf_free_excess_pages(void)
ibool too_much_free;
- mutex_enter(&ibuf_mutex);
+ mysql_mutex_lock(&ibuf_mutex);
too_much_free = ibuf_data_too_much_free();
- mutex_exit(&ibuf_mutex);
+ mysql_mutex_unlock(&ibuf_mutex);
if (!too_much_free) {
return;
@@ -2076,23 +2052,25 @@ ibuf_get_merge_page_nos_func(
*n_stored = 0;
- limit = ut_min(IBUF_MAX_N_PAGES_MERGED,
- buf_pool_get_curr_size() / 4);
-
if (page_rec_is_supremum(rec)) {
rec = page_rec_get_prev_const(rec);
+ if (UNIV_UNLIKELY(!rec)) {
+corruption:
+ ut_ad("corrupted page" == 0);
+ return 0;
+ }
}
if (page_rec_is_infimum(rec)) {
-
rec = page_rec_get_next_const(rec);
+ if (!rec || page_rec_is_supremum(rec)) {
+ return 0;
+ }
}
- if (page_rec_is_supremum(rec)) {
-
- return(0);
- }
+ limit = ut_min(IBUF_MAX_N_PAGES_MERGED,
+ buf_pool_get_curr_size() / 4);
first_page_no = ibuf_rec_get_page_no(mtr, rec);
first_space_id = ibuf_rec_get_space(mtr, rec);
@@ -2124,7 +2102,9 @@ ibuf_get_merge_page_nos_func(
prev_page_no = rec_page_no;
prev_space_id = rec_space_id;
- rec = page_rec_get_prev_const(rec);
+ if (UNIV_UNLIKELY(!(rec = page_rec_get_prev_const(rec)))) {
+ goto corruption;
+ }
}
rec = page_rec_get_next_const(rec);
@@ -2138,7 +2118,7 @@ ibuf_get_merge_page_nos_func(
sum_volumes = 0;
volume_for_page = 0;
- while (*n_stored < limit) {
+ while (*n_stored < limit && rec) {
if (page_rec_is_supremum(rec)) {
/* When no more records available, mark this with
another 'impossible' pair of space id, page no */
@@ -2325,9 +2305,9 @@ static void ibuf_delete_recs(const page_id_t page_id)
mtr_t mtr;
loop:
btr_pcur_t pcur;
+ pcur.btr_cur.page_cur.index= ibuf.index;
ibuf_mtr_start(&mtr);
- if (btr_pcur_open(ibuf.index, &tuple, PAGE_CUR_GE, BTR_MODIFY_LEAF,
- &pcur, &mtr) != DB_SUCCESS)
+ if (btr_pcur_open(&tuple, PAGE_CUR_GE, BTR_MODIFY_LEAF, &pcur, &mtr))
goto func_exit;
if (!btr_pcur_is_on_user_rec(&pcur))
{
@@ -2393,11 +2373,11 @@ tablespace_deleted:
buf_page_get_gen(page_id_t(space_id, page_nos[i]),
zip_size, RW_X_LATCH, nullptr,
BUF_GET_POSSIBLY_FREED,
- __FILE__, __LINE__, &mtr, &err, true);
+ &mtr, &err, true);
bool remove = !block
- || fil_page_get_type(block->frame)
+ || fil_page_get_type(block->page.frame)
!= FIL_PAGE_INDEX
- || !page_is_leaf(block->frame);
+ || !page_is_leaf(block->page.frame);
mtr.commit();
if (err == DB_TABLESPACE_DELETED) {
goto tablespace_deleted;
@@ -2447,45 +2427,39 @@ ulint ibuf_contract()
{
if (UNIV_UNLIKELY(!ibuf.index)) return 0;
mtr_t mtr;
- btr_pcur_t pcur;
+ btr_cur_t cur;
ulint sum_sizes;
uint32_t page_nos[IBUF_MAX_N_PAGES_MERGED];
uint32_t space_ids[IBUF_MAX_N_PAGES_MERGED];
ibuf_mtr_start(&mtr);
- /* Open a cursor to a randomly chosen leaf of the tree, at a random
- position within the leaf */
- bool available;
-
- available = btr_pcur_open_at_rnd_pos(ibuf.index, BTR_SEARCH_LEAF,
- &pcur, &mtr);
- /* No one should make this index unavailable when server is running */
- ut_a(available);
+ if (cur.open_leaf(true, ibuf.index, BTR_SEARCH_LEAF, &mtr) !=
+ DB_SUCCESS) {
+ return 0;
+ }
- ut_ad(page_validate(btr_pcur_get_page(&pcur), ibuf.index));
+ ut_ad(page_validate(btr_cur_get_page(&cur), ibuf.index));
- if (page_is_empty(btr_pcur_get_page(&pcur))) {
+ if (page_is_empty(btr_cur_get_page(&cur))) {
/* If a B-tree page is empty, it must be the root page
and the whole B-tree must be empty. InnoDB does not
allow empty B-tree pages other than the root. */
ut_ad(ibuf.empty);
- ut_ad(btr_pcur_get_block(&pcur)->page.id()
+ ut_ad(btr_cur_get_block(&cur)->page.id()
== page_id_t(IBUF_SPACE_ID, FSP_IBUF_TREE_ROOT_PAGE_NO));
ibuf_mtr_commit(&mtr);
- btr_pcur_close(&pcur);
return(0);
}
ulint n_pages = 0;
sum_sizes = ibuf_get_merge_page_nos(TRUE,
- btr_pcur_get_rec(&pcur), &mtr,
+ btr_cur_get_rec(&cur), &mtr,
space_ids,
page_nos, &n_pages);
ibuf_mtr_commit(&mtr);
- btr_pcur_close(&pcur);
ibuf_read_merge_pages(space_ids, page_nos, n_pages);
@@ -2530,17 +2504,18 @@ ibuf_merge_space(
/* Position the cursor on the first matching record. */
- btr_pcur_open(
- ibuf.index, &tuple, PAGE_CUR_GE, BTR_SEARCH_LEAF, &pcur,
- &mtr);
-
- ut_ad(page_validate(btr_pcur_get_page(&pcur), ibuf.index));
+ pcur.btr_cur.page_cur.index = ibuf.index;
+ dberr_t err = btr_pcur_open(&tuple, PAGE_CUR_GE, BTR_SEARCH_LEAF,
+ &pcur, &mtr);
+ ut_ad(err != DB_SUCCESS || page_validate(btr_pcur_get_page(&pcur),
+ ibuf.index));
ulint sum_sizes = 0;
uint32_t pages[IBUF_MAX_N_PAGES_MERGED];
uint32_t spaces[IBUF_MAX_N_PAGES_MERGED];
- if (page_is_empty(btr_pcur_get_page(&pcur))) {
+ if (err != DB_SUCCESS) {
+ } else if (page_is_empty(btr_pcur_get_page(&pcur))) {
/* If a B-tree page is empty, it must be the root page
and the whole B-tree must be empty. InnoDB does not
allow empty B-tree pages other than the root. */
@@ -2558,8 +2533,6 @@ ibuf_merge_space(
ibuf_mtr_commit(&mtr);
- btr_pcur_close(&pcur);
-
if (n_pages > 0) {
ut_ad(n_pages <= UT_ARR_SIZE(pages));
@@ -2803,14 +2776,16 @@ ibuf_get_volume_buffered(
page = page_align(rec);
ut_ad(page_validate(page, ibuf.index));
- if (page_rec_is_supremum(rec)) {
- rec = page_rec_get_prev_const(rec);
+ if (page_rec_is_supremum(rec)
+ && UNIV_UNLIKELY(!(rec = page_rec_get_prev_const(rec)))) {
+corruption:
+ ut_ad("corrupted page" == 0);
+ return srv_page_size;
}
uint32_t prev_page_no;
- for (; !page_rec_is_infimum(rec);
- rec = page_rec_get_prev_const(rec)) {
+ for (; !page_rec_is_infimum(rec); ) {
ut_ad(page_align(rec) == page);
if (page_no != ibuf_rec_get_page_no(mtr, rec)
@@ -2822,6 +2797,10 @@ ibuf_get_volume_buffered(
volume += ibuf_get_volume_buffered_count(
mtr, rec,
hash_bitmap, UT_ARR_SIZE(hash_bitmap), n_recs);
+
+ if (UNIV_UNLIKELY(!(rec = page_rec_get_prev_const(rec)))) {
+ goto corruption;
+ }
}
/* Look at the previous page */
@@ -2833,30 +2812,30 @@ ibuf_get_volume_buffered(
goto count_later;
}
- {
- buf_block_t* block;
-
- block = buf_page_get(
- page_id_t(IBUF_SPACE_ID, prev_page_no),
- 0, RW_X_LATCH, mtr);
-
- buf_block_dbg_add_level(block, SYNC_IBUF_TREE_NODE);
-
+ if (buf_block_t* block =
+ buf_page_get(page_id_t(IBUF_SPACE_ID, prev_page_no),
+ 0, RW_X_LATCH, mtr)) {
prev_page = buf_block_get_frame(block);
ut_ad(page_validate(prev_page, ibuf.index));
+ } else {
+ return srv_page_size;
}
-#ifdef UNIV_BTR_DEBUG
static_assert(FIL_PAGE_NEXT % 4 == 0, "alignment");
static_assert(FIL_PAGE_OFFSET % 4 == 0, "alignment");
- ut_a(!memcmp_aligned<4>(prev_page + FIL_PAGE_NEXT,
- page + FIL_PAGE_OFFSET, 4));
-#endif /* UNIV_BTR_DEBUG */
- rec = page_get_supremum_rec(prev_page);
- rec = page_rec_get_prev_const(rec);
+ if (UNIV_UNLIKELY(memcmp_aligned<4>(prev_page + FIL_PAGE_NEXT,
+ page + FIL_PAGE_OFFSET, 4))) {
+ return srv_page_size;
+ }
+
+ rec = page_rec_get_prev_const(page_get_supremum_rec(prev_page));
- for (;; rec = page_rec_get_prev_const(rec)) {
+ if (UNIV_UNLIKELY(!rec)) {
+ goto corruption;
+ }
+
+ for (;;) {
ut_ad(page_align(rec) == prev_page);
if (page_rec_is_infimum(rec)) {
@@ -2877,6 +2856,10 @@ ibuf_get_volume_buffered(
volume += ibuf_get_volume_buffered_count(
mtr, rec,
hash_bitmap, UT_ARR_SIZE(hash_bitmap), n_recs);
+
+ if (UNIV_UNLIKELY(!(rec = page_rec_get_prev_const(rec)))) {
+ goto corruption;
+ }
}
count_later:
@@ -2888,6 +2871,9 @@ count_later:
for (; !page_rec_is_supremum(rec);
rec = page_rec_get_next_const(rec)) {
+ if (UNIV_UNLIKELY(!rec)) {
+ return srv_page_size;
+ }
if (page_no != ibuf_rec_get_page_no(mtr, rec)
|| space != ibuf_rec_get_space(mtr, rec)) {
@@ -2908,39 +2894,34 @@ count_later:
return(volume);
}
- {
- buf_block_t* block;
-
- block = buf_page_get(
- page_id_t(IBUF_SPACE_ID, next_page_no),
- 0, RW_X_LATCH, mtr);
-
- buf_block_dbg_add_level(block, SYNC_IBUF_TREE_NODE);
-
+ if (buf_block_t* block =
+ buf_page_get(page_id_t(IBUF_SPACE_ID, next_page_no),
+ 0, RW_X_LATCH, mtr)) {
next_page = buf_block_get_frame(block);
ut_ad(page_validate(next_page, ibuf.index));
+ } else {
+ return srv_page_size;
}
-#ifdef UNIV_BTR_DEBUG
static_assert(FIL_PAGE_PREV % 4 == 0, "alignment");
static_assert(FIL_PAGE_OFFSET % 4 == 0, "alignment");
- ut_a(!memcmp_aligned<4>(next_page + FIL_PAGE_PREV,
- page + FIL_PAGE_OFFSET, 4));
-#endif /* UNIV_BTR_DEBUG */
+
+ if (UNIV_UNLIKELY(memcmp_aligned<4>(next_page + FIL_PAGE_PREV,
+ page + FIL_PAGE_OFFSET, 4))) {
+ return 0;
+ }
rec = page_get_infimum_rec(next_page);
rec = page_rec_get_next_const(rec);
- for (;; rec = page_rec_get_next_const(rec)) {
- ut_ad(page_align(rec) == next_page);
-
- if (page_rec_is_supremum(rec)) {
-
+ for (; ; rec = page_rec_get_next_const(rec)) {
+ if (!rec || page_rec_is_supremum(rec)) {
/* We give up */
-
return(srv_page_size);
}
+ ut_ad(page_align(rec) == next_page);
+
if (page_no != ibuf_rec_get_page_no(mtr, rec)
|| space != ibuf_rec_get_space(mtr, rec)) {
@@ -2961,7 +2942,6 @@ ibuf_update_max_tablespace_id(void)
/*===============================*/
{
if (UNIV_UNLIKELY(!ibuf.index)) return;
- ulint max_space_id;
const rec_t* rec;
const byte* field;
ulint len;
@@ -2972,26 +2952,27 @@ ibuf_update_max_tablespace_id(void)
ibuf_mtr_start(&mtr);
- btr_pcur_open_at_index_side(
- false, ibuf.index, BTR_SEARCH_LEAF, &pcur, true, 0, &mtr);
+ if (pcur.open_leaf(false, ibuf.index, BTR_SEARCH_LEAF, &mtr)
+ != DB_SUCCESS) {
+func_exit:
+ ibuf_mtr_commit(&mtr);
+ return;
+ }
ut_ad(page_validate(btr_pcur_get_page(&pcur), ibuf.index));
- btr_pcur_move_to_prev(&pcur, &mtr);
-
- if (btr_pcur_is_before_first_on_page(&pcur)) {
- /* The tree is empty */
+ if (!btr_pcur_move_to_prev(&pcur, &mtr)
+ || btr_pcur_is_before_first_on_page(&pcur)) {
+ goto func_exit;
+ }
- max_space_id = 0;
- } else {
- rec = btr_pcur_get_rec(&pcur);
+ rec = btr_pcur_get_rec(&pcur);
- field = rec_get_nth_field_old(rec, IBUF_REC_FIELD_SPACE, &len);
+ field = rec_get_nth_field_old(rec, IBUF_REC_FIELD_SPACE, &len);
- ut_a(len == 4);
+ ut_a(len == 4);
- max_space_id = mach_read_from_4(field);
- }
+ const uint32_t max_space_id = mach_read_from_4(field);
ibuf_mtr_commit(&mtr);
@@ -3150,7 +3131,7 @@ ibuf_index_page_calc_free_from_bits(ulint physical_size, ulint bits)
/** Buffer an operation in the insert/delete buffer, instead of doing it
directly to the disk page, if this is possible.
-@param[in] mode BTR_MODIFY_PREV or BTR_MODIFY_TREE
+@param[in] mode BTR_MODIFY_PREV or BTR_INSERT_TREE
@param[in] op operation type
@param[in] no_counter TRUE=use 5.0.3 format; FALSE=allow delete
buffering
@@ -3162,10 +3143,10 @@ or clustered
@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
@param[in,out] thr query thread
@return DB_SUCCESS, DB_STRONG_FAIL or other error */
-static MY_ATTRIBUTE((warn_unused_result))
+static TRANSACTIONAL_TARGET MY_ATTRIBUTE((warn_unused_result))
dberr_t
ibuf_insert_low(
- ulint mode,
+ btr_latch_mode mode,
ibuf_op_t op,
ibool no_counter,
const dtuple_t* entry,
@@ -3186,7 +3167,7 @@ ibuf_insert_low(
lint min_n_recs;
rec_t* ins_rec;
buf_block_t* bitmap_page;
- buf_block_t* block;
+ buf_block_t* block = NULL;
page_t* root;
dberr_t err;
ibool do_merge;
@@ -3246,18 +3227,18 @@ ibuf_insert_low(
the new entry to it without exceeding the free space limit for the
page. */
- if (BTR_LATCH_MODE_WITHOUT_INTENTION(mode) == BTR_MODIFY_TREE) {
+ if (mode == BTR_INSERT_TREE) {
for (;;) {
- mutex_enter(&ibuf_pessimistic_insert_mutex);
- mutex_enter(&ibuf_mutex);
+ mysql_mutex_lock(&ibuf_pessimistic_insert_mutex);
+ mysql_mutex_lock(&ibuf_mutex);
if (UNIV_LIKELY(ibuf_data_enough_free_for_insert())) {
break;
}
- mutex_exit(&ibuf_mutex);
- mutex_exit(&ibuf_pessimistic_insert_mutex);
+ mysql_mutex_unlock(&ibuf_mutex);
+ mysql_mutex_unlock(&ibuf_pessimistic_insert_mutex);
if (!ibuf_add_free_page()) {
@@ -3268,8 +3249,28 @@ ibuf_insert_low(
}
ibuf_mtr_start(&mtr);
+ pcur.btr_cur.page_cur.index = ibuf.index;
+
+ err = btr_pcur_open(ibuf_entry, PAGE_CUR_LE, mode, &pcur, &mtr);
+ if (err != DB_SUCCESS) {
+func_exit:
+ ibuf_mtr_commit(&mtr);
+ ut_free(pcur.old_rec_buf);
+ mem_heap_free(heap);
+
+ if (err == DB_SUCCESS && mode == BTR_INSERT_TREE) {
+ ibuf_contract_after_insert(entry_size);
+ }
+
+ if (do_merge) {
+#ifdef UNIV_IBUF_DEBUG
+ ut_a(n_stored <= IBUF_MAX_N_PAGES_MERGED);
+#endif
+ ibuf_read_merge_pages(space_ids, page_nos, n_stored);
+ }
+ return err;
+ }
- btr_pcur_open(ibuf.index, ibuf_entry, PAGE_CUR_LE, mode, &pcur, &mtr);
ut_ad(page_validate(btr_pcur_get_page(&pcur), ibuf.index));
/* Find out the volume of already buffered inserts for the same index
@@ -3304,9 +3305,9 @@ ibuf_insert_low(
until after the IBUF_OP_DELETE has been buffered. */
fail_exit:
- if (BTR_LATCH_MODE_WITHOUT_INTENTION(mode) == BTR_MODIFY_TREE) {
- mutex_exit(&ibuf_mutex);
- mutex_exit(&ibuf_pessimistic_insert_mutex);
+ if (mode == BTR_INSERT_TREE) {
+ mysql_mutex_unlock(&ibuf_mutex);
+ mysql_mutex_unlock(&ibuf_pessimistic_insert_mutex);
}
err = DB_STRONG_FAIL;
@@ -3317,7 +3318,7 @@ fail_exit:
buffer pool, but we do not have to care about it, since we are
holding a latch on the insert buffer leaf page that contains
buffered changes for (space, page_no). If the page enters the
- buffer pool, buf_page_read_complete() for (space, page_no) will
+ buffer pool, buf_page_t::read_complete() for (space, page_no) will
have to acquire a latch on the same insert buffer leaf page,
which it cannot do until we have buffered the IBUF_OP_DELETE
and done mtr_commit(&mtr) to release the latch. */
@@ -3328,22 +3329,27 @@ fail_exit:
/* We check if the index page is suitable for buffered entries */
- if (buf_pool.page_hash_contains(page_id)) {
+ if (!bitmap_page || buf_pool.page_hash_contains(
+ page_id, buf_pool.page_hash.cell_get(page_id.fold()))) {
commit_exit:
ibuf_mtr_commit(&bitmap_mtr);
goto fail_exit;
+ } else if (!lock_sys.rd_lock_try()) {
+ goto commit_exit;
} else {
- lock_mutex_enter();
- const auto lock_exists = lock_sys.get_first(page_id);
- lock_mutex_exit();
- if (lock_exists) {
+ hash_cell_t* cell = lock_sys.rec_hash.cell_get(page_id.fold());
+ lock_sys.rec_hash.latch(cell)->acquire();
+ const lock_t* lock = lock_sys_t::get_first(*cell, page_id);
+ lock_sys.rec_hash.latch(cell)->release();
+ lock_sys.rd_unlock();
+ if (lock) {
goto commit_exit;
}
}
if (op == IBUF_OP_INSERT) {
ulint bits = ibuf_bitmap_page_get_bits(
- bitmap_page->frame, page_id, physical_size,
+ bitmap_page->page.frame, page_id, physical_size,
IBUF_BITMAP_FREE, &bitmap_mtr);
if (buffered + entry_size + page_dir_calc_reserved_space(1)
@@ -3415,15 +3421,20 @@ commit_exit:
ibuf.empty = page_is_empty(root);
}
} else {
- ut_ad(BTR_LATCH_MODE_WITHOUT_INTENTION(mode)
- == BTR_MODIFY_TREE);
+ ut_ad(mode == BTR_INSERT_TREE);
/* We acquire an sx-latch to the root page before the insert,
because a pessimistic insert releases the tree x-latch,
which would cause the sx-latching of the root after that to
break the latching order. */
-
- root = ibuf_tree_root_get(&mtr)->frame;
+ if (buf_block_t* ibuf_root = ibuf_tree_root_get(&mtr)) {
+ root = ibuf_root->page.frame;
+ } else {
+ err = DB_CORRUPTION;
+ mysql_mutex_unlock(&ibuf_pessimistic_insert_mutex);
+ mysql_mutex_unlock(&ibuf_mutex);
+ goto ibuf_insert_done;
+ }
err = btr_cur_optimistic_insert(
BTR_NO_LOCKING_FLAG | BTR_NO_UNDO_LOG_FLAG,
@@ -3439,15 +3450,16 @@ commit_exit:
&dummy_big_rec, 0, thr, &mtr);
}
- mutex_exit(&ibuf_pessimistic_insert_mutex);
+ mysql_mutex_unlock(&ibuf_pessimistic_insert_mutex);
ibuf_size_update(root);
- mutex_exit(&ibuf_mutex);
+ mysql_mutex_unlock(&ibuf_mutex);
ibuf.empty = page_is_empty(root);
block = btr_cur_get_block(cursor);
ut_ad(block->page.id().space() == IBUF_SPACE_ID);
}
+ibuf_insert_done:
if (offsets_heap) {
mem_heap_free(offsets_heap);
}
@@ -3458,25 +3470,7 @@ commit_exit:
thr_get_trx(thr)->id, &mtr);
}
-func_exit:
- ibuf_mtr_commit(&mtr);
- btr_pcur_close(&pcur);
-
- mem_heap_free(heap);
-
- if (err == DB_SUCCESS
- && BTR_LATCH_MODE_WITHOUT_INTENTION(mode) == BTR_MODIFY_TREE) {
- ibuf_contract_after_insert(entry_size);
- }
-
- if (do_merge) {
-#ifdef UNIV_IBUF_DEBUG
- ut_a(n_stored <= IBUF_MAX_N_PAGES_MERGED);
-#endif
- ibuf_read_merge_pages(space_ids, page_nos, n_stored);
- }
-
- return(err);
+ goto func_exit;
}
/** Buffer an operation in the change buffer, instead of applying it
@@ -3489,6 +3483,7 @@ is clustered or unique.
@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
@param[in,out] thr query thread
@return true if success */
+TRANSACTIONAL_TARGET
bool
ibuf_insert(
ibuf_op_t op,
@@ -3576,7 +3571,8 @@ check_watch:
that the issuer of IBUF_OP_DELETE has called
buf_pool_t::watch_set(). */
- if (buf_pool.page_hash_contains<true>(page_id)) {
+ if (buf_pool.page_hash_contains<true>(
+ page_id, buf_pool.page_hash.cell_get(page_id.fold()))) {
/* A buffer pool watch has been set or the
page has been read into the buffer pool.
Do not buffer the request. If a purge operation
@@ -3600,7 +3596,7 @@ skip_watch:
entry, entry_size,
index, page_id, zip_size, thr);
if (err == DB_FAIL) {
- err = ibuf_insert_low(BTR_MODIFY_TREE | BTR_LATCH_FOR_INSERT,
+ err = ibuf_insert_low(BTR_INSERT_TREE,
op, no_counter, entry, entry_size,
index, page_id, zip_size, thr);
}
@@ -3611,83 +3607,47 @@ skip_watch:
DBUG_RETURN(err == DB_SUCCESS);
}
+MY_ATTRIBUTE((nonnull, warn_unused_result))
/********************************************************************//**
During merge, inserts to an index page a secondary index entry extracted
from the insert buffer.
-@return newly inserted record */
-static MY_ATTRIBUTE((nonnull))
-rec_t*
+@return error code */
+static
+dberr_t
ibuf_insert_to_index_page_low(
/*==========================*/
const dtuple_t* entry, /*!< in: buffered entry to insert */
- buf_block_t* block, /*!< in/out: index page where the buffered
- entry should be placed */
- dict_index_t* index, /*!< in: record descriptor */
rec_offs** offsets,/*!< out: offsets on *rec */
mem_heap_t* heap, /*!< in/out: memory heap */
mtr_t* mtr, /*!< in/out: mtr */
page_cur_t* page_cur)/*!< in/out: cursor positioned on the record
after which to insert the buffered entry */
{
- rec_t* rec;
- DBUG_ENTER("ibuf_insert_to_index_page_low");
-
- rec = page_cur_tuple_insert(page_cur, entry, index,
- offsets, &heap, 0, mtr);
- if (rec != NULL) {
- DBUG_RETURN(rec);
- }
-
- /* Page reorganization or recompression should already have
- been attempted by page_cur_tuple_insert(). Besides, per
- ibuf_index_page_calc_free_zip() the page should not have been
- recompressed or reorganized. */
- ut_ad(!is_buf_block_get_page_zip(block));
+ if (page_cur_tuple_insert(page_cur, entry, offsets, &heap, 0, mtr))
+ return DB_SUCCESS;
- /* If the record did not fit, reorganize */
+ /* Page reorganization or recompression should already have been
+ attempted by page_cur_tuple_insert(). Besides, per
+ ibuf_index_page_calc_free_zip() the page should not have been
+ recompressed or reorganized. */
+ ut_ad(!is_buf_block_get_page_zip(page_cur->block));
- btr_page_reorganize(page_cur, index, mtr);
+ /* If the record did not fit, reorganize */
+ if (dberr_t err= btr_page_reorganize(page_cur, mtr))
+ return err;
- /* This time the record must fit */
-
- rec = page_cur_tuple_insert(page_cur, entry, index,
- offsets, &heap, 0, mtr);
- if (rec != NULL) {
- DBUG_RETURN(rec);
- }
-
- ib::error() << "Insert buffer insert fails; page free "
- << page_get_max_insert_size(block->frame, 1)
- << ", dtuple size "
- << rec_get_converted_size(index, entry, 0);
-
- fputs("InnoDB: Cannot insert index record ", stderr);
- dtuple_print(stderr, entry);
- fputs("\nInnoDB: The table where this index record belongs\n"
- "InnoDB: is now probably corrupt. Please run CHECK TABLE on\n"
- "InnoDB: that table.\n", stderr);
-
- if (buf_block_t *bitmap_page = ibuf_bitmap_get_map_page(
- block->page.id(), block->zip_size(), mtr)) {
-
- ib::error() << "page " << block->page.id() << ", size "
- << block->physical_size() << ", bitmap bits "
- << ibuf_bitmap_page_get_bits(bitmap_page->frame,
- block->page.id(), block->zip_size(),
- IBUF_BITMAP_FREE, mtr);
- }
+ /* This time the record must fit */
+ if (page_cur_tuple_insert(page_cur, entry, offsets, &heap, 0, mtr))
+ return DB_SUCCESS;
- ib::error() << BUG_REPORT_MSG;
-
- ut_ad(0);
- DBUG_RETURN(NULL);
+ return DB_CORRUPTION;
}
/************************************************************************
During merge, inserts to an index page a secondary index entry extracted
from the insert buffer. */
static
-void
+dberr_t
ibuf_insert_to_index_page(
/*======================*/
const dtuple_t* entry, /*!< in: buffered entry to insert */
@@ -3697,14 +3657,11 @@ ibuf_insert_to_index_page(
mtr_t* mtr) /*!< in: mtr */
{
page_cur_t page_cur;
- ulint low_match;
page_t* page = buf_block_get_frame(block);
rec_t* rec;
rec_offs* offsets;
mem_heap_t* heap;
- DBUG_ENTER("ibuf_insert_to_index_page");
-
DBUG_PRINT("ibuf", ("page " UINT32PF ":" UINT32PF,
block->page.id().space(),
block->page.id().page_no()));
@@ -3723,38 +3680,30 @@ ibuf_insert_to_index_page(
if (UNIV_UNLIKELY(dict_table_is_comp(index->table)
!= (ibool)!!page_is_comp(page))) {
- ib::warn() << "Trying to insert a record from the insert"
- " buffer to an index page but the 'compact' flag does"
- " not match!";
- goto dump;
+ return DB_CORRUPTION;
}
rec = page_rec_get_next(page_get_infimum_rec(page));
- if (page_rec_is_supremum(rec)) {
- ib::warn() << "Trying to insert a record from the insert"
- " buffer to an index page but the index page"
- " is empty!";
- goto dump;
+ if (!rec || page_rec_is_supremum(rec)) {
+ return DB_CORRUPTION;
}
if (!rec_n_fields_is_sane(index, rec, entry)) {
- ib::warn() << "Trying to insert a record from the insert"
- " buffer to an index page but the number of fields"
- " does not match!";
- rec_print(stderr, rec, index);
-dump:
- dtuple_print(stderr, entry);
- ut_ad(0);
+ return DB_CORRUPTION;
+ }
- ib::warn() << "The table where this index record belongs"
- " is now probably corrupt. Please run CHECK TABLE on"
- " your tables. " << BUG_REPORT_MSG;
+ ulint up_match = 0, low_match = 0;
+ page_cur.index = index;
+ page_cur.block = block;
- DBUG_VOID_RETURN;
+ if (page_cur_search_with_match(entry, PAGE_CUR_LE,
+ &up_match, &low_match, &page_cur,
+ nullptr)) {
+ return DB_CORRUPTION;
}
- low_match = page_cur_search(block, index, entry, &page_cur);
+ dberr_t err = DB_SUCCESS;
heap = mem_heap_create(
sizeof(upd_t)
@@ -3795,7 +3744,7 @@ dump:
if (!row_upd_changes_field_size_or_external(index, offsets,
update)
&& (!page_zip || btr_cur_update_alloc_zip(
- page_zip, &page_cur, index, offsets,
+ page_zip, &page_cur, offsets,
rec_offs_size(offsets), false, mtr))) {
/* This is the easy case. Do something similar
to btr_cur_update_in_place(). */
@@ -3836,25 +3785,21 @@ dump:
/* Delete the different-length record, and insert the
buffered one. */
- lock_rec_store_on_page_infimum(block, rec);
- page_cur_delete_rec(&page_cur, index, offsets, mtr);
- page_cur_move_to_prev(&page_cur);
- rec = ibuf_insert_to_index_page_low(entry, block, index,
- &offsets, heap, mtr,
- &page_cur);
-
- ut_ad(!cmp_dtuple_rec(entry, rec, offsets));
- lock_rec_restore_from_page_infimum(block, rec, block);
+ page_cur_delete_rec(&page_cur, offsets, mtr);
+ if (!(page_cur_move_to_prev(&page_cur))) {
+ err = DB_CORRUPTION;
+ goto updated_in_place;
+ }
} else {
offsets = NULL;
- ibuf_insert_to_index_page_low(entry, block, index,
- &offsets, heap, mtr,
- &page_cur);
}
+
+ err = ibuf_insert_to_index_page_low(entry, &offsets, heap, mtr,
+ &page_cur);
updated_in_place:
mem_heap_free(heap);
- DBUG_VOID_RETURN;
+ return err;
}
/****************************************************************//**
@@ -3866,18 +3811,21 @@ ibuf_set_del_mark(
/*==============*/
const dtuple_t* entry, /*!< in: entry */
buf_block_t* block, /*!< in/out: block */
- const dict_index_t* index, /*!< in: record descriptor */
+ dict_index_t* index, /*!< in: record descriptor */
mtr_t* mtr) /*!< in: mtr */
{
page_cur_t page_cur;
- ulint low_match;
+ page_cur.block = block;
+ page_cur.index = index;
+ ulint up_match = 0, low_match = 0;
ut_ad(ibuf_inside(mtr));
ut_ad(dtuple_check_typed(entry));
- low_match = page_cur_search(block, index, entry, &page_cur);
-
- if (low_match == dtuple_get_n_fields(entry)) {
+ if (!page_cur_search_with_match(entry, PAGE_CUR_LE,
+ &up_match, &low_match, &page_cur,
+ nullptr)
+ && low_match == dtuple_get_n_fields(entry)) {
rec_t* rec = page_cur_get_rec(&page_cur);
/* Delete mark the old index record. According to a
@@ -3926,16 +3874,19 @@ ibuf_delete(
before latching any further pages */
{
page_cur_t page_cur;
- ulint low_match;
+ page_cur.block = block;
+ page_cur.index = index;
+ ulint up_match = 0, low_match = 0;
ut_ad(ibuf_inside(mtr));
ut_ad(dtuple_check_typed(entry));
ut_ad(!index->is_spatial());
ut_ad(!index->is_clust());
- low_match = page_cur_search(block, index, entry, &page_cur);
-
- if (low_match == dtuple_get_n_fields(entry)) {
+ if (!page_cur_search_with_match(entry, PAGE_CUR_LE,
+ &up_match, &low_match, &page_cur,
+ nullptr)
+ && low_match == dtuple_get_n_fields(entry)) {
page_zip_des_t* page_zip= buf_block_get_page_zip(block);
page_t* page = buf_block_get_frame(block);
rec_t* rec = page_cur_get_rec(&page_cur);
@@ -3977,8 +3928,6 @@ ibuf_delete(
return;
}
- lock_update_delete(block, rec);
-
if (!page_zip) {
max_ins_size
= page_get_max_insert_size_after_reorganize(
@@ -3987,7 +3936,7 @@ ibuf_delete(
#ifdef UNIV_ZIP_DEBUG
ut_a(!page_zip || page_zip_validate(page_zip, page, index));
#endif /* UNIV_ZIP_DEBUG */
- page_cur_delete_rec(&page_cur, index, offsets, mtr);
+ page_cur_delete_rec(&page_cur, offsets, mtr);
#ifdef UNIV_ZIP_DEBUG
ut_a(!page_zip || page_zip_validate(page_zip, page, index));
#endif /* UNIV_ZIP_DEBUG */
@@ -4001,8 +3950,6 @@ ibuf_delete(
if (UNIV_LIKELY_NULL(heap)) {
mem_heap_free(heap);
}
- } else {
- /* The record must have been purged already. */
}
}
@@ -4016,15 +3963,12 @@ ibuf_restore_pos(
const page_id_t page_id,/*!< in: page identifier */
const dtuple_t* search_tuple,
/*!< in: search tuple for entries of page_no */
- ulint mode, /*!< in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE */
+ btr_latch_mode mode, /*!< in: BTR_MODIFY_LEAF or BTR_PURGE_TREE */
btr_pcur_t* pcur, /*!< in/out: persistent cursor whose
position is to be restored */
mtr_t* mtr) /*!< in/out: mini-transaction */
{
- ut_ad(mode == BTR_MODIFY_LEAF
- || BTR_LATCH_MODE_WITHOUT_INTENTION(mode) == BTR_MODIFY_TREE);
-
- if (UNIV_LIKELY(btr_pcur_restore_position(mode, pcur, mtr) ==
+ if (UNIV_LIKELY(pcur->restore_position(mode, mtr) ==
btr_pcur_t::SAME_ALL)) {
return true;
}
@@ -4041,9 +3985,6 @@ ibuf_restore_pos(
rec_print_old(stderr, btr_pcur_get_rec(pcur));
rec_print_old(stderr, pcur->old_rec);
dtuple_print(stderr, search_tuple);
-
- rec_print_old(stderr,
- page_rec_get_next(btr_pcur_get_rec(pcur)));
}
ibuf_btr_pcur_commit_specify_mtr(pcur, mtr);
@@ -4061,8 +4002,6 @@ static MY_ATTRIBUTE((warn_unused_result, nonnull))
bool ibuf_delete_rec(const page_id_t page_id, btr_pcur_t* pcur,
const dtuple_t* search_tuple, mtr_t* mtr)
{
- ibool success;
- page_t* root;
dberr_t err;
ut_ad(ibuf_inside(mtr));
@@ -4072,15 +4011,16 @@ bool ibuf_delete_rec(const page_id_t page_id, btr_pcur_t* pcur,
ut_ad(ibuf_rec_get_space(mtr, btr_pcur_get_rec(pcur))
== page_id.space());
- success = btr_cur_optimistic_delete(btr_pcur_get_btr_cur(pcur),
- 0, mtr);
-
- if (success) {
+ switch (btr_cur_optimistic_delete(btr_pcur_get_btr_cur(pcur),
+ BTR_CREATE_FLAG, mtr)) {
+ case DB_FAIL:
+ break;
+ case DB_SUCCESS:
if (page_is_empty(btr_pcur_get_page(pcur))) {
/* If a B-tree page is empty, it must be the root page
and the whole B-tree must be empty. InnoDB does not
allow empty B-tree pages other than the root. */
- root = btr_pcur_get_page(pcur);
+ ut_d(const page_t* root = btr_pcur_get_page(pcur));
ut_ad(page_get_space_id(root) == IBUF_SPACE_ID);
ut_ad(page_get_page_no(root)
@@ -4091,7 +4031,8 @@ bool ibuf_delete_rec(const page_id_t page_id, btr_pcur_t* pcur,
ut_ad(!ibuf.empty);
ibuf.empty = true;
}
-
+ /* fall through */
+ default:
return(FALSE);
}
@@ -4106,27 +4047,26 @@ bool ibuf_delete_rec(const page_id_t page_id, btr_pcur_t* pcur,
ibuf_btr_pcur_commit_specify_mtr(pcur, mtr);
ibuf_mtr_start(mtr);
- mutex_enter(&ibuf_mutex);
+ mysql_mutex_lock(&ibuf_mutex);
+ mtr_x_lock_index(ibuf.index, mtr);
if (!ibuf_restore_pos(page_id, search_tuple,
- BTR_MODIFY_TREE | BTR_LATCH_FOR_DELETE,
- pcur, mtr)) {
-
- mutex_exit(&ibuf_mutex);
- ut_ad(mtr->has_committed());
+ BTR_PURGE_TREE_ALREADY_LATCHED, pcur, mtr)) {
+ mysql_mutex_unlock(&ibuf_mutex);
goto func_exit;
}
- root = ibuf_tree_root_get(mtr)->frame;
+ if (buf_block_t* ibuf_root = ibuf_tree_root_get(mtr)) {
+ btr_cur_pessimistic_delete(&err, TRUE,
+ btr_pcur_get_btr_cur(pcur),
+ BTR_CREATE_FLAG, false, mtr);
+ ut_a(err == DB_SUCCESS);
- btr_cur_pessimistic_delete(&err, TRUE, btr_pcur_get_btr_cur(pcur), 0,
- false, mtr);
- ut_a(err == DB_SUCCESS);
-
- ibuf_size_update(root);
- mutex_exit(&ibuf_mutex);
+ ibuf_size_update(ibuf_root->page.frame);
+ ibuf.empty = page_is_empty(ibuf_root->page.frame);
+ }
- ibuf.empty = page_is_empty(root);
+ mysql_mutex_unlock(&ibuf_mutex);
ibuf_btr_pcur_commit_specify_mtr(pcur, mtr);
func_exit:
@@ -4158,7 +4098,7 @@ bool ibuf_page_exists(const page_id_t id, ulint zip_size)
if (const buf_block_t* bitmap_page = ibuf_bitmap_get_map_page(
id, zip_size, &mtr)) {
bitmap_bits = ibuf_bitmap_page_get_bits(
- bitmap_page->frame, id, zip_size,
+ bitmap_page->page.frame, id, zip_size,
IBUF_BITMAP_BUFFERED, &mtr) != 0;
}
ibuf_mtr_commit(&mtr);
@@ -4196,28 +4136,20 @@ exist entries for such a page if the page belonged to an index which
subsequently was dropped.
@param block X-latched page to try to apply changes to, or NULL to discard
@param page_id page identifier
-@param zip_size ROW_FORMAT=COMPRESSED page size, or 0 */
-void ibuf_merge_or_delete_for_page(buf_block_t *block, const page_id_t page_id,
- ulint zip_size)
+@param zip_size ROW_FORMAT=COMPRESSED page size, or 0
+@return error code */
+dberr_t ibuf_merge_or_delete_for_page(buf_block_t *block,
+ const page_id_t page_id,
+ ulint zip_size)
{
if (trx_sys_hdr_page(page_id)) {
- return;
+ return DB_SUCCESS;
}
- btr_pcur_t pcur;
-#ifdef UNIV_IBUF_DEBUG
- ulint volume = 0;
-#endif /* UNIV_IBUF_DEBUG */
- bool corruption_noticed = false;
- mtr_t mtr;
-
- /* Counts for merged & discarded operations. */
- ulint mops[IBUF_OP_COUNT];
- ulint dops[IBUF_OP_COUNT];
-
ut_ad(!block || page_id == block->page.id());
- ut_ad(!block || block->page.state() == BUF_BLOCK_FILE_PAGE);
- ut_ad(!block || block->page.status == buf_page_t::NORMAL);
+ ut_ad(!block || block->page.frame);
+ ut_ad(!block || !block->page.is_ibuf_exist());
+ ut_ad(!block || !block->page.is_reinit());
ut_ad(!trx_sys_hdr_page(page_id));
ut_ad(page_id < page_id_t(SRV_SPACE_ID_UPPER_BOUND, 0));
@@ -4225,13 +4157,20 @@ void ibuf_merge_or_delete_for_page(buf_block_t *block, const page_id_t page_id,
if (ibuf_fixed_addr_page(page_id, physical_size)
|| fsp_descr_page(page_id, physical_size)) {
- return;
+ return DB_SUCCESS;
}
+ btr_pcur_t pcur;
+#ifdef UNIV_IBUF_DEBUG
+ ulint volume = 0;
+#endif /* UNIV_IBUF_DEBUG */
+ dberr_t err = DB_SUCCESS;
+ mtr_t mtr;
+
fil_space_t* space = fil_space_t::get(page_id.space());
if (UNIV_UNLIKELY(!space)) {
- block = NULL;
+ block = nullptr;
} else {
ulint bitmap_bits = 0;
@@ -4241,24 +4180,25 @@ void ibuf_merge_or_delete_for_page(buf_block_t *block, const page_id_t page_id,
page_id, zip_size, &mtr);
if (bitmap_page
- && fil_page_get_type(bitmap_page->frame)
+ && fil_page_get_type(bitmap_page->page.frame)
!= FIL_PAGE_TYPE_ALLOCATED) {
bitmap_bits = ibuf_bitmap_page_get_bits(
- bitmap_page->frame, page_id, zip_size,
+ bitmap_page->page.frame, page_id, zip_size,
IBUF_BITMAP_BUFFERED, &mtr);
}
ibuf_mtr_commit(&mtr);
- if (bitmap_bits && fseg_page_is_free(
- space, page_id.page_no())) {
+ if (bitmap_bits
+ && DB_SUCCESS
+ == fseg_page_is_allocated(space, page_id.page_no())) {
ibuf_mtr_start(&mtr);
mtr.set_named_space(space);
ibuf_reset_bitmap(block, page_id, zip_size, &mtr);
ibuf_mtr_commit(&mtr);
bitmap_bits = 0;
if (!block
- || btr_page_get_index_id(block->frame)
+ || btr_page_get_index_id(block->page.frame)
!= DICT_IBUF_ID_MIN + IBUF_SPACE_ID) {
ibuf_delete_recs(page_id);
}
@@ -4267,66 +4207,54 @@ void ibuf_merge_or_delete_for_page(buf_block_t *block, const page_id_t page_id,
if (!bitmap_bits) {
/* No changes are buffered for this page. */
space->release();
- return;
+ return DB_SUCCESS;
}
}
- mem_heap_t* heap = mem_heap_create(512);
-
- const dtuple_t* search_tuple = ibuf_search_tuple_build(
- page_id.space(), page_id.page_no(), heap);
-
- if (block != NULL) {
+ if (!block) {
+ } else if (!fil_page_index_page_check(block->page.frame)
+ || !page_is_leaf(block->page.frame)) {
+ space->set_corrupted();
+ err = DB_CORRUPTION;
+ block = nullptr;
+ } else {
/* Move the ownership of the x-latch on the page to this OS
thread, so that we can acquire a second x-latch on it. This
is needed for the insert operations to the index page to pass
the debug checks. */
- rw_lock_x_lock_move_ownership(&(block->lock));
+ block->page.lock.claim_ownership();
+ }
- if (!fil_page_index_page_check(block->frame)
- || !page_is_leaf(block->frame)) {
+ mem_heap_t* heap = mem_heap_create(512);
- corruption_noticed = true;
+ const dtuple_t* search_tuple = ibuf_search_tuple_build(
+ page_id.space(), page_id.page_no(), heap);
- ib::error() << "Corruption in the tablespace. Bitmap"
- " shows insert buffer records to page "
- << page_id << " though the page type is "
- << fil_page_get_type(block->frame)
- << ", which is not an index leaf page. We try"
- " to resolve the problem by skipping the"
- " insert buffer merge for this page. Please"
- " run CHECK TABLE on your tables to determine"
- " if they are corrupt after this.";
- ut_ad(0);
- }
- }
+ /* Counts for merged & discarded operations. */
+ ulint mops[IBUF_OP_COUNT];
+ ulint dops[IBUF_OP_COUNT];
memset(mops, 0, sizeof(mops));
memset(dops, 0, sizeof(dops));
+ pcur.btr_cur.page_cur.index = ibuf.index;
loop:
ibuf_mtr_start(&mtr);
/* Position pcur in the insert buffer at the first entry for this
index page */
- btr_pcur_open_on_user_rec(
- ibuf.index, search_tuple, PAGE_CUR_GE, BTR_MODIFY_LEAF,
- &pcur, &mtr);
+ if (btr_pcur_open_on_user_rec(search_tuple,
+ BTR_MODIFY_LEAF, &pcur, &mtr)
+ != DB_SUCCESS) {
+ err = DB_CORRUPTION;
+ goto reset_bit;
+ }
if (block) {
- ut_ad(rw_lock_own(&block->lock, RW_LOCK_X));
- buf_block_buf_fix_inc(block, __FILE__, __LINE__);
- rw_lock_x_lock(&block->lock);
-
+ block->page.fix();
+ block->page.lock.x_lock_recursive();
mtr.memo_push(block, MTR_MEMO_PAGE_X_FIX);
- /* This is a user page (secondary index leaf page),
- but we pretend that it is a change buffer page in
- order to obey the latching order. This should be OK,
- because buffered changes are applied immediately while
- the block is io-fixed. Other threads must not try to
- latch an io-fixed block. */
- buf_block_dbg_add_level(block, SYNC_IBUF_TREE_NODE);
}
if (space) {
@@ -4356,7 +4284,7 @@ loop:
goto reset_bit;
}
- if (corruption_noticed) {
+ if (err) {
fputs("InnoDB: Discarding record\n ", stderr);
rec_print_old(stderr, rec);
fputs("\nInnoDB: from the insert buffer!\n\n", stderr);
@@ -4384,7 +4312,7 @@ loop:
dummy_index->table->space = space;
dummy_index->table->space_id = space->id;
- ut_ad(page_validate(block->frame, dummy_index));
+ ut_ad(page_validate(block->page.frame, dummy_index));
switch (op) {
case IBUF_OP_INSERT:
@@ -4434,19 +4362,10 @@ loop:
ibuf_mtr_start(&mtr);
mtr.set_named_space(space);
- ut_ad(rw_lock_own(&block->lock, RW_LOCK_X));
- buf_block_buf_fix_inc(block,
- __FILE__, __LINE__);
- rw_lock_x_lock(&block->lock);
+ block->page.lock.x_lock_recursive();
+ block->fix();
mtr.memo_push(block, MTR_MEMO_PAGE_X_FIX);
- /* This is a user page (secondary
- index leaf page), but it should be OK
- to use too low latching order for it,
- as the block is io-fixed. */
- buf_block_dbg_add_level(
- block, SYNC_IBUF_TREE_NODE);
-
if (!ibuf_restore_pos(page_id, search_tuple,
BTR_MODIFY_LEAF,
&pcur, &mtr)) {
@@ -4478,8 +4397,6 @@ loop:
goto loop;
} else if (btr_pcur_is_after_last_on_page(&pcur)) {
ibuf_mtr_commit(&mtr);
- btr_pcur_close(&pcur);
-
goto loop;
}
}
@@ -4490,17 +4407,19 @@ reset_bit:
}
ibuf_mtr_commit(&mtr);
+ ut_free(pcur.old_rec_buf);
if (space) {
space->release();
}
- btr_pcur_close(&pcur);
mem_heap_free(heap);
ibuf.n_merges++;
ibuf_add_ops(ibuf.n_merged_ops, mops);
ibuf_add_ops(ibuf.n_discarded_ops, dops);
+
+ return err;
}
/** Delete all change buffer entries for a tablespace,
@@ -4536,15 +4455,19 @@ void ibuf_delete_for_discarded_space(ulint space)
cursor positioned at the first entry for this space id */
memset(dops, 0, sizeof(dops));
+ pcur.btr_cur.page_cur.index = ibuf.index;
+
loop:
log_free_check();
ibuf_mtr_start(&mtr);
/* Position pcur in the insert buffer at the first entry for the
space */
- btr_pcur_open_on_user_rec(
- ibuf.index, &search_tuple, PAGE_CUR_GE, BTR_MODIFY_LEAF,
- &pcur, &mtr);
+ if (btr_pcur_open_on_user_rec(&search_tuple,
+ BTR_MODIFY_LEAF, &pcur, &mtr)
+ != DB_SUCCESS) {
+ goto leave_loop;
+ }
if (!btr_pcur_is_on_user_rec(&pcur)) {
ut_ad(btr_pcur_is_after_last_on_page(&pcur));
@@ -4573,20 +4496,20 @@ loop:
we start from the beginning again */
ut_ad(mtr.has_committed());
+clear:
+ ut_free(pcur.old_rec_buf);
goto loop;
}
if (btr_pcur_is_after_last_on_page(&pcur)) {
ibuf_mtr_commit(&mtr);
- btr_pcur_close(&pcur);
-
- goto loop;
+ goto clear;
}
}
leave_loop:
ibuf_mtr_commit(&mtr);
- btr_pcur_close(&pcur);
+ ut_free(pcur.old_rec_buf);
ibuf_add_ops(ibuf.n_discarded_ops, dops);
}
@@ -4602,11 +4525,11 @@ ibuf_is_empty(void)
ibuf_mtr_start(&mtr);
- ut_d(mutex_enter(&ibuf_mutex));
+ ut_d(mysql_mutex_lock(&ibuf_mutex));
const buf_block_t* root = ibuf_tree_root_get(&mtr);
- bool is_empty = page_is_empty(root->frame);
- ut_a(is_empty == ibuf.empty);
- ut_d(mutex_exit(&ibuf_mutex));
+ bool is_empty = root && page_is_empty(root->page.frame);
+ ut_ad(!root || is_empty == ibuf.empty);
+ ut_d(mysql_mutex_unlock(&ibuf_mutex));
ibuf_mtr_commit(&mtr);
return(is_empty);
@@ -4620,7 +4543,7 @@ ibuf_print(
FILE* file) /*!< in: file where to print */
{
if (UNIV_UNLIKELY(!ibuf.index)) return;
- mutex_enter(&ibuf_mutex);
+ mysql_mutex_lock(&ibuf_mutex);
fprintf(file,
"Ibuf: size " ULINTPF ", free list len " ULINTPF ","
@@ -4636,7 +4559,7 @@ ibuf_print(
fputs("discarded operations:\n ", file);
ibuf_print_ops(ibuf.n_discarded_ops, file);
- mutex_exit(&ibuf_mutex);
+ mysql_mutex_unlock(&ibuf_mutex);
}
/** Check the insert buffer bitmaps on IMPORT TABLESPACE.
@@ -4679,7 +4602,7 @@ dberr_t ibuf_check_bitmap_on_import(const trx_t* trx, fil_space_t* space)
return DB_CORRUPTION;
}
- if (buf_is_zeroes(span<const byte>(bitmap_page->frame,
+ if (buf_is_zeroes(span<const byte>(bitmap_page->page.frame,
physical_size))) {
/* This means we got all-zero page instead of
ibuf bitmap page. The subsequent page should be
@@ -4691,7 +4614,7 @@ dberr_t ibuf_check_bitmap_on_import(const trx_t* trx, fil_space_t* space)
buf_block_t* block = buf_page_get(
page_id_t(space->id, curr_page),
zip_size, RW_S_LATCH, &mtr);
- page_t* page = buf_block_get_frame(block);
+ page_t* page = buf_block_get_frame(block);
ut_ad(buf_is_zeroes(span<const byte>(
page,
physical_size)));
@@ -4707,7 +4630,8 @@ dberr_t ibuf_check_bitmap_on_import(const trx_t* trx, fil_space_t* space)
const page_id_t cur_page_id(space->id, offset);
if (ibuf_bitmap_page_get_bits(
- bitmap_page->frame, cur_page_id, zip_size,
+ bitmap_page->page.frame,
+ cur_page_id, zip_size,
IBUF_BITMAP_IBUF, &mtr)) {
mtr_commit(&mtr);
@@ -4723,7 +4647,8 @@ dberr_t ibuf_check_bitmap_on_import(const trx_t* trx, fil_space_t* space)
}
if (ibuf_bitmap_page_get_bits(
- bitmap_page->frame, cur_page_id, zip_size,
+ bitmap_page->page.frame,
+ cur_page_id, zip_size,
IBUF_BITMAP_BUFFERED, &mtr)) {
ib_errf(trx->mysql_thd,
@@ -4750,13 +4675,13 @@ dberr_t ibuf_check_bitmap_on_import(const trx_t* trx, fil_space_t* space)
void ibuf_set_bitmap_for_bulk_load(buf_block_t *block, mtr_t *mtr, bool reset)
{
- ut_a(page_is_leaf(block->frame));
+ ut_a(page_is_leaf(block->page.frame));
const page_id_t id{block->page.id()};
const auto zip_size= block->zip_size();
if (buf_block_t *bitmap_page= ibuf_bitmap_get_map_page(id, zip_size, mtr))
{
- if (ibuf_bitmap_page_get_bits(bitmap_page->frame, id, zip_size,
+ if (ibuf_bitmap_page_get_bits(bitmap_page->page.frame, id, zip_size,
IBUF_BITMAP_BUFFERED, mtr))
ibuf_delete_recs(id);
diff --git a/storage/innobase/include/btr0btr.h b/storage/innobase/include/btr0btr.h
index c0dcc6f39d3..a56598d3620 100644
--- a/storage/innobase/include/btr0btr.h
+++ b/storage/innobase/include/btr0btr.h
@@ -2,7 +2,7 @@
Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved.
Copyright (c) 2012, Facebook Inc.
-Copyright (c) 2014, 2021, MariaDB Corporation.
+Copyright (c) 2014, 2023, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -25,8 +25,7 @@ The B-tree
Created 6/2/1994 Heikki Tuuri
*******************************************************/
-#ifndef btr0btr_h
-#define btr0btr_h
+#pragma once
#include "dict0dict.h"
#include "data0data.h"
@@ -56,146 +55,20 @@ not acceptable for it to lead to mysterious memory corruption, but it
is acceptable for the program to die with a clear assert failure. */
#define BTR_MAX_LEVELS 100
-/** Latching modes for btr_cur_search_to_nth_level(). */
-enum btr_latch_mode {
- /** Search a record on a leaf page and S-latch it. */
- BTR_SEARCH_LEAF = RW_S_LATCH,
- /** (Prepare to) modify a record on a leaf page and X-latch it. */
- BTR_MODIFY_LEAF = RW_X_LATCH,
- /** Obtain no latches. */
- BTR_NO_LATCHES = RW_NO_LATCH,
- /** Start modifying the entire B-tree. */
- BTR_MODIFY_TREE = 33,
- /** Continue modifying the entire B-tree. */
- BTR_CONT_MODIFY_TREE = 34,
- /** Search the previous record. */
- BTR_SEARCH_PREV = 35,
- /** Modify the previous record. */
- BTR_MODIFY_PREV = 36,
- /** Start searching the entire B-tree. */
- BTR_SEARCH_TREE = 37,
- /** Continue searching the entire B-tree. */
- BTR_CONT_SEARCH_TREE = 38,
-
- /* BTR_INSERT, BTR_DELETE and BTR_DELETE_MARK are mutually
- exclusive. */
- /** The search tuple will be inserted to the secondary index
- at the searched position. When the leaf page is not in the
- buffer pool, try to use the change buffer. */
- BTR_INSERT = 512,
-
- /** Try to delete mark a secondary index leaf page record at
- the searched position using the change buffer when the page is
- not in the buffer pool. */
- BTR_DELETE_MARK = 4096,
-
- /** Try to purge the record using the change buffer when the
- secondary index leaf page is not in the buffer pool. */
- BTR_DELETE = 8192,
-
- /** The caller is already holding dict_index_t::lock S-latch. */
- BTR_ALREADY_S_LATCHED = 16384,
- /** Search and S-latch a leaf page, assuming that the
- dict_index_t::lock S-latch is being held. */
- BTR_SEARCH_LEAF_ALREADY_S_LATCHED = BTR_SEARCH_LEAF
- | BTR_ALREADY_S_LATCHED,
- /** Search the entire index tree, assuming that the
- dict_index_t::lock S-latch is being held. */
- BTR_SEARCH_TREE_ALREADY_S_LATCHED = BTR_SEARCH_TREE
- | BTR_ALREADY_S_LATCHED,
- /** Search and X-latch a leaf page, assuming that the
- dict_index_t::lock S-latch is being held. */
- BTR_MODIFY_LEAF_ALREADY_S_LATCHED = BTR_MODIFY_LEAF
- | BTR_ALREADY_S_LATCHED,
-
- /** Attempt to delete-mark a secondary index record. */
- BTR_DELETE_MARK_LEAF = BTR_MODIFY_LEAF | BTR_DELETE_MARK,
- /** Attempt to delete-mark a secondary index record
- while holding the dict_index_t::lock S-latch. */
- BTR_DELETE_MARK_LEAF_ALREADY_S_LATCHED = BTR_DELETE_MARK_LEAF
- | BTR_ALREADY_S_LATCHED,
- /** Attempt to purge a secondary index record. */
- BTR_PURGE_LEAF = BTR_MODIFY_LEAF | BTR_DELETE,
- /** Attempt to purge a secondary index record
- while holding the dict_index_t::lock S-latch. */
- BTR_PURGE_LEAF_ALREADY_S_LATCHED = BTR_PURGE_LEAF
- | BTR_ALREADY_S_LATCHED,
-
- /** In the case of BTR_MODIFY_TREE, the caller specifies
- the intention to delete record only. It is used to optimize
- block->lock range.*/
- BTR_LATCH_FOR_DELETE = 65536,
-
- /** Attempt to purge a secondary index record in the tree. */
- BTR_PURGE_TREE = BTR_MODIFY_TREE | BTR_LATCH_FOR_DELETE
-};
-
-/** This flag ORed to btr_latch_mode says that we do the search in query
-optimization */
-#define BTR_ESTIMATE 1024U
-
-/** This flag ORed to BTR_INSERT says that we can ignore possible
-UNIQUE definition on secondary indexes when we decide if we can use
-the insert buffer to speed up inserts */
-#define BTR_IGNORE_SEC_UNIQUE 2048U
-
-/** In the case of BTR_MODIFY_TREE, the caller specifies the intention
-to insert record only. It is used to optimize block->lock range.*/
-#define BTR_LATCH_FOR_INSERT 32768U
-
-/** This flag is for undo insert of rtree. For rtree, we need this flag
-to find proper rec to undo insert.*/
-#define BTR_RTREE_UNDO_INS 131072U
-
-/** In the case of BTR_MODIFY_LEAF, the caller intends to allocate or
-free the pages of externally stored fields. */
-#define BTR_MODIFY_EXTERNAL 262144U
-
-/** Try to delete mark the record at the searched position when the
-record is in spatial index */
-#define BTR_RTREE_DELETE_MARK 524288U
-
#define BTR_LATCH_MODE_WITHOUT_FLAGS(latch_mode) \
- ((latch_mode) & ulint(~(BTR_INSERT \
+ btr_latch_mode((latch_mode) & ~(BTR_INSERT \
| BTR_DELETE_MARK \
| BTR_RTREE_UNDO_INS \
| BTR_RTREE_DELETE_MARK \
| BTR_DELETE \
- | BTR_ESTIMATE \
| BTR_IGNORE_SEC_UNIQUE \
| BTR_ALREADY_S_LATCHED \
| BTR_LATCH_FOR_INSERT \
- | BTR_LATCH_FOR_DELETE \
- | BTR_MODIFY_EXTERNAL)))
-
-#define BTR_LATCH_MODE_WITHOUT_INTENTION(latch_mode) \
- ((latch_mode) & ulint(~(BTR_LATCH_FOR_INSERT \
- | BTR_LATCH_FOR_DELETE \
- | BTR_MODIFY_EXTERNAL)))
-
-/** Report that an index page is corrupted.
-@param[in] buffer block
-@param[in] index tree */
-ATTRIBUTE_COLD ATTRIBUTE_NORETURN __attribute__((nonnull))
-void btr_corruption_report(const buf_block_t* block,const dict_index_t* index);
-
-/** Assert that a B-tree page is not corrupted.
-@param block buffer block containing a B-tree page
-@param index the B-tree index */
-#define btr_assert_not_corrupted(block, index) \
- if (!!page_is_comp(buf_block_get_frame(block)) \
- != index->table->not_redundant()) \
- btr_corruption_report(block, index)
+ | BTR_LATCH_FOR_DELETE))
-/**************************************************************//**
-Gets the root node of a tree and sx-latches it for segment access.
-@return root page, sx-latched */
-page_t*
-btr_root_get(
-/*=========*/
- const dict_index_t* index, /*!< in: index tree */
- mtr_t* mtr) /*!< in: mtr */
- MY_ATTRIBUTE((nonnull));
+#define BTR_LATCH_MODE_WITHOUT_INTENTION(latch_mode) \
+ btr_latch_mode((latch_mode) \
+ & ~(BTR_LATCH_FOR_INSERT | BTR_LATCH_FOR_DELETE))
/**************************************************************//**
Checks and adjusts the root node of a tree during IMPORT TABLESPACE.
@@ -206,67 +79,21 @@ btr_root_adjust_on_import(
const dict_index_t* index) /*!< in: index tree */
MY_ATTRIBUTE((warn_unused_result));
-/**************************************************************//**
-Gets the height of the B-tree (the level of the root, when the leaf
-level is assumed to be 0). The caller must hold an S or X latch on
-the index.
-@return tree height (level of the root) */
-ulint
-btr_height_get(
-/*===========*/
- const dict_index_t* index, /*!< in: index tree */
- mtr_t* mtr) /*!< in/out: mini-transaction */
- MY_ATTRIBUTE((warn_unused_result));
+/** Report a decryption failure. */
+ATTRIBUTE_COLD void btr_decryption_failed(const dict_index_t &index);
/** Get an index page and declare its latching order level.
@param[in] index index tree
@param[in] page page number
@param[in] mode latch mode
@param[in] merge whether change buffer merge should be attempted
-@param[in] file file name
-@param[in] line line where called
@param[in,out] mtr mini-transaction
+@param[out] err error code
@return block */
-inline buf_block_t* btr_block_get_func(const dict_index_t& index,
- uint32_t page, ulint mode, bool merge,
- const char* file, unsigned line,
- mtr_t* mtr)
-{
- dberr_t err;
-
- if (buf_block_t* block = buf_page_get_gen(
- page_id_t(index.table->space->id, page),
- index.table->space->zip_size(), mode, NULL, BUF_GET,
- file, line, mtr, &err, merge && !index.is_clust())) {
- ut_ad(err == DB_SUCCESS);
- if (mode != RW_NO_LATCH) {
- buf_block_dbg_add_level(block, index.is_ibuf()
- ? SYNC_IBUF_TREE_NODE
- : SYNC_TREE_NODE);
- }
- return block;
- } else {
- ut_ad(err != DB_SUCCESS);
-
- if (err == DB_DECRYPTION_FAILED) {
- if (index.table) {
- index.table->file_unreadable = true;
- }
- }
-
- return NULL;
- }
-}
+buf_block_t *btr_block_get(const dict_index_t &index,
+ uint32_t page, ulint mode, bool merge,
+ mtr_t *mtr, dberr_t *err= nullptr);
-/** Gets a buffer page and declares its latching order level.
-@param index index tree
-@param page page number
-@param mode latch mode
-@param merge whether change buffer merge should be attempted
-@param mtr mini-transaction handle
-@return the block descriptor */
-# define btr_block_get(index, page, mode, merge, mtr) \
- btr_block_get_func(index, page, mode, merge, __FILE__, __LINE__, mtr)
/**************************************************************//**
Gets the index id field of a page.
@return index id */
@@ -305,17 +132,6 @@ inline uint32_t btr_page_get_prev(const page_t* page)
}
/**************************************************************//**
-Releases the latch on a leaf page and bufferunfixes it. */
-UNIV_INLINE
-void
-btr_leaf_page_release(
-/*==================*/
- buf_block_t* block, /*!< in: buffer block */
- ulint latch_mode, /*!< in: BTR_SEARCH_LEAF or
- BTR_MODIFY_LEAF */
- mtr_t* mtr) /*!< in: mtr */
- MY_ATTRIBUTE((nonnull));
-/**************************************************************//**
Gets the child node file address in a node pointer.
NOTE: the offsets array must contain all offsets for the record since
we read the last field according to offsets and assume that it contains
@@ -336,6 +152,7 @@ btr_node_ptr_get_child_page_no(
@param[in] index_id index id
@param[in] index index, or NULL to create a system table
@param[in,out] mtr mini-transaction
+@param[out] err error code
@return page number of the created root
@retval FIL_NULL if did not succeed */
uint32_t
@@ -344,23 +161,21 @@ btr_create(
fil_space_t* space,
index_id_t index_id,
dict_index_t* index,
- mtr_t* mtr);
+ mtr_t* mtr,
+ dberr_t* err)
+ MY_ATTRIBUTE((nonnull(2,5,6), warn_unused_result));
/** Free a persistent index tree if it exists.
-@param[in] page_id root page id
-@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
+@param[in,out] space tablespce
+@param[in] page root page number
@param[in] index_id PAGE_INDEX_ID contents
@param[in,out] mtr mini-transaction */
-void
-btr_free_if_exists(
- const page_id_t page_id,
- ulint zip_size,
- index_id_t index_id,
- mtr_t* mtr);
+void btr_free_if_exists(fil_space_t *space, uint32_t page,
+ index_id_t index_id, mtr_t *mtr);
-/** Free an index tree in a temporary tablespace.
-@param[in] page_id root page id */
-void btr_free(const page_id_t page_id);
+/** Drop a temporary table
+@param table temporary table */
+void btr_drop_temporary_table(const dict_table_t &table);
/** Read the last used AUTO_INCREMENT value from PAGE_ROOT_AUTO_INC.
@param[in,out] index clustered index
@@ -396,11 +211,11 @@ btr_write_autoinc(dict_index_t* index, ib_uint64_t autoinc, bool reset = false)
@param[in,out] mtr mini-transaction */
void btr_set_instant(buf_block_t* root, const dict_index_t& index, mtr_t* mtr);
+ATTRIBUTE_COLD __attribute__((nonnull))
/** Reset the table to the canonical format on ROLLBACK of instant ALTER TABLE.
@param[in] index clustered index with instant ALTER TABLE
@param[in] all whether to reset FIL_PAGE_TYPE as well
@param[in,out] mtr mini-transaction */
-ATTRIBUTE_COLD __attribute__((nonnull))
void btr_reset_instant(const dict_index_t &index, bool all, mtr_t *mtr);
/*************************************************************//**
@@ -423,8 +238,9 @@ btr_root_raise_and_insert(
that can be emptied, or NULL */
const dtuple_t* tuple, /*!< in: tuple to insert */
ulint n_ext, /*!< in: number of externally stored columns */
- mtr_t* mtr) /*!< in: mtr */
- MY_ATTRIBUTE((warn_unused_result));
+ mtr_t* mtr, /*!< in: mtr */
+ dberr_t* err) /*!< out: error code */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
/*************************************************************//**
Reorganizes an index page.
@@ -434,15 +250,12 @@ be done either within the same mini-transaction, or by invoking
ibuf_reset_free_bits() before mtr_commit(). On uncompressed pages,
IBUF_BITMAP_FREE is unaffected by reorganization.
-@retval true if the operation was successful
-@retval false if it is a compressed page, and recompression failed */
-bool
-btr_page_reorganize(
-/*================*/
- page_cur_t* cursor, /*!< in/out: page cursor */
- dict_index_t* index, /*!< in: the index tree of the page */
- mtr_t* mtr) /*!< in/out: mini-transaction */
- MY_ATTRIBUTE((nonnull));
+@param cursor page cursor
+@param mtr mini-transaction
+@return error code
+@retval DB_FAIL if reorganizing a ROW_FORMAT=COMPRESSED page failed */
+dberr_t btr_page_reorganize(page_cur_t *cursor, mtr_t *mtr)
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
/** Decide if the page should be split at the convergence point of inserts
converging to the left.
@param[in] cursor insert position
@@ -481,23 +294,20 @@ btr_page_split_and_insert(
that can be emptied, or NULL */
const dtuple_t* tuple, /*!< in: tuple to insert */
ulint n_ext, /*!< in: number of externally stored columns */
- mtr_t* mtr) /*!< in: mtr */
- MY_ATTRIBUTE((warn_unused_result));
+ mtr_t* mtr, /*!< in: mtr */
+ dberr_t* err) /*!< out: error code */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
/*******************************************************//**
Inserts a data tuple to a tree on a non-leaf level. It is assumed
that mtr holds an x-latch on the tree. */
-void
-btr_insert_on_non_leaf_level_func(
-/*==============================*/
+dberr_t
+btr_insert_on_non_leaf_level(
ulint flags, /*!< in: undo logging and locking flags */
dict_index_t* index, /*!< in: index */
ulint level, /*!< in: level, must be > 0 */
dtuple_t* tuple, /*!< in: the record to be inserted */
- const char* file, /*!< in: file name */
- unsigned line, /*!< in: line where called */
- mtr_t* mtr); /*!< in: mtr */
-#define btr_insert_on_non_leaf_level(f,i,l,t,m) \
- btr_insert_on_non_leaf_level_func(f,i,l,t,__FILE__,__LINE__,m)
+ mtr_t* mtr) /*!< in: mtr */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
/** Set a child page pointer record as the predefined minimum record.
@tparam has_prev whether the page is supposed to have a left sibling
@@ -508,9 +318,9 @@ template<bool has_prev= false>
inline void btr_set_min_rec_mark(rec_t *rec, const buf_block_t &block,
mtr_t *mtr)
{
- ut_ad(block.frame == page_align(rec));
- ut_ad(!page_is_leaf(block.frame));
- ut_ad(has_prev == page_has_prev(block.frame));
+ ut_ad(block.page.frame == page_align(rec));
+ ut_ad(!page_is_leaf(block.page.frame));
+ ut_ad(has_prev == page_has_prev(block.page.frame));
rec-= page_rec_is_comp(rec) ? REC_NEW_INFO_BITS : REC_OLD_INFO_BITS;
@@ -523,13 +333,11 @@ inline void btr_set_min_rec_mark(rec_t *rec, const buf_block_t &block,
}
/** Seek to the parent page of a B-tree page.
-@param[in,out] index b-tree
-@param[in] block child page
@param[in,out] mtr mini-transaction
-@param[out] cursor cursor pointing to the x-latched parent page */
-void btr_page_get_father(dict_index_t* index, buf_block_t* block, mtr_t* mtr,
- btr_cur_t* cursor)
- MY_ATTRIBUTE((nonnull));
+@param[in,out] cursor cursor pointing to the x-latched parent page
+@return whether the cursor was successfully positioned */
+bool btr_page_get_father(mtr_t* mtr, btr_cur_t* cursor)
+ MY_ATTRIBUTE((nonnull,warn_unused_result));
#ifdef UNIV_DEBUG
/************************************************************//**
Checks that the node pointer to a page is appropriate.
@@ -551,53 +359,29 @@ level lifts the records of the page to the father page, thus reducing the
tree height. It is assumed that mtr holds an x-latch on the tree and on the
page. If cursor is on the leaf level, mtr must also hold x-latches to
the brothers, if they exist.
-@return TRUE on success */
-ibool
+@return error code
+@retval DB_FAIL if the tree could not be merged */
+dberr_t
btr_compress(
/*=========*/
btr_cur_t* cursor, /*!< in/out: cursor on the page to merge
or lift; the page must not be empty:
when deleting records, use btr_discard_page()
if the page would become empty */
- ibool adjust, /*!< in: TRUE if should adjust the
- cursor position even if compression occurs */
+ bool adjust, /*!< in: whether the cursor position should be
+ adjusted even when compression occurs */
mtr_t* mtr) /*!< in/out: mini-transaction */
- MY_ATTRIBUTE((nonnull));
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
/*************************************************************//**
Discards a page from a B-tree. This is used to remove the last record from
a B-tree page: the whole page must be removed at the same time. This cannot
be used for the root page, which is allowed to be empty. */
-void
+dberr_t
btr_discard_page(
/*=============*/
btr_cur_t* cursor, /*!< in: cursor on the page to discard: not on
the root page */
mtr_t* mtr); /*!< in: mtr */
-/**************************************************************//**
-Gets the number of pages in a B-tree.
-@return number of pages, or ULINT_UNDEFINED if the index is unavailable */
-ulint
-btr_get_size(
-/*=========*/
- const dict_index_t* index, /*!< in: index */
- ulint flag, /*!< in: BTR_N_LEAF_PAGES or BTR_TOTAL_SIZE */
- mtr_t* mtr) /*!< in/out: mini-transaction where index
- is s-latched */
- MY_ATTRIBUTE((warn_unused_result));
-/**************************************************************//**
-Gets the number of reserved and used pages in a B-tree.
-@return number of pages reserved, or ULINT_UNDEFINED if the index
-is unavailable */
-UNIV_INTERN
-ulint
-btr_get_size_and_reserved(
-/*======================*/
- dict_index_t* index, /*!< in: index */
- ulint flag, /*!< in: BTR_N_LEAF_PAGES or BTR_TOTAL_SIZE */
- ulint* used, /*!< out: number of pages used (<= reserved) */
- mtr_t* mtr) /*!< in/out: mini-transaction where index
- is s-latched */
- __attribute__((nonnull));
/**************************************************************//**
Allocates a new file page to be used in an index tree. NOTE: we assume
@@ -614,9 +398,10 @@ btr_page_alloc(
in the tree */
mtr_t* mtr, /*!< in/out: mini-transaction
for the allocation */
- mtr_t* init_mtr) /*!< in/out: mini-transaction
+ mtr_t* init_mtr, /*!< in/out: mini-transaction
for x-latching and initializing
the page */
+ dberr_t* err) /*!< out: error code */
MY_ATTRIBUTE((warn_unused_result));
/** Empty an index page (possibly the root page). @see btr_page_create().
@param[in,out] block page to be emptied
@@ -648,10 +433,11 @@ btr_page_create(
@param[in,out] index index tree
@param[in,out] block block to be freed
@param[in,out] mtr mini-transaction
-@param[in] blob whether this is freeing a BLOB page */
+@param[in] blob whether this is freeing a BLOB page
+@param[in]	space_latched	whether index->table->space->x_lock() was called */
MY_ATTRIBUTE((nonnull))
-void btr_page_free(dict_index_t* index, buf_block_t* block, mtr_t* mtr,
- bool blob = false);
+dberr_t btr_page_free(dict_index_t *index, buf_block_t *block, mtr_t *mtr,
+ bool blob= false, bool space_latched= false);
/**************************************************************//**
Gets the root node of a tree and x- or s-latches it.
@@ -659,11 +445,11 @@ Gets the root node of a tree and x- or s-latches it.
buf_block_t*
btr_root_block_get(
/*===============*/
- const dict_index_t* index, /*!< in: index tree */
+ dict_index_t* index, /*!< in: index tree */
rw_lock_type_t mode, /*!< in: either RW_S_LATCH
or RW_X_LATCH */
- mtr_t* mtr); /*!< in: mtr */
-
+ mtr_t* mtr, /*!< in: mtr */
+ dberr_t* err); /*!< out: error code */
/*************************************************************//**
Reorganizes an index page.
@@ -673,15 +459,15 @@ be done either within the same mini-transaction, or by invoking
ibuf_reset_free_bits() before mtr_commit(). On uncompressed pages,
IBUF_BITMAP_FREE is unaffected by reorganization.
-@retval true if the operation was successful
-@retval false if it is a compressed page, and recompression failed */
-bool btr_page_reorganize_block(
+@return error code
+@retval DB_FAIL if reorganizing a ROW_FORMAT=COMPRESSED page failed */
+dberr_t btr_page_reorganize_block(
ulint z_level,/*!< in: compression level to be used
if dealing with compressed page */
buf_block_t* block, /*!< in/out: B-tree page */
dict_index_t* index, /*!< in: the index tree of the page */
mtr_t* mtr) /*!< in/out: mini-transaction */
- __attribute__((nonnull));
+ __attribute__((nonnull, warn_unused_result));
#ifdef UNIV_BTR_PRINT
/*************************************************************//**
@@ -736,16 +522,15 @@ dberr_t btr_level_list_remove(const buf_block_t& block,
If page is the only on its level, this function moves its records to the
father page, thus reducing the tree height.
@return father block */
-UNIV_INTERN
buf_block_t*
btr_lift_page_up(
-/*=============*/
dict_index_t* index, /*!< in: index tree */
buf_block_t* block, /*!< in: page which is the only on its level;
must not be empty: use
btr_discard_only_page_on_level if the last
record from the page should be removed */
- mtr_t* mtr) /*!< in: mtr */
+ mtr_t* mtr, /*!< in/out: mini-transaction */
+ dberr_t* err) /*!< out: error code */
__attribute__((nonnull));
#define BTR_N_LEAF_PAGES 1
@@ -756,6 +541,3 @@ btr_lift_page_up(
/****************************************************************
Global variable controlling if scrubbing should be performed */
extern my_bool srv_immediate_scrub_data_uncompressed;
-extern Atomic_counter<uint32_t> btr_validate_index_running;
-
-#endif
diff --git a/storage/innobase/include/btr0btr.inl b/storage/innobase/include/btr0btr.inl
index 89826e8f214..9a9e39b6b4c 100644
--- a/storage/innobase/include/btr0btr.inl
+++ b/storage/innobase/include/btr0btr.inl
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2015, 2020, MariaDB Corporation.
+Copyright (c) 2015, 2022, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -24,10 +24,7 @@ The B-tree
Created 6/2/1994 Heikki Tuuri
*******************************************************/
-#include "mach0data.h"
-#include "mtr0mtr.h"
#include "mtr0log.h"
-#include "page0zip.h"
/**************************************************************//**
Gets the index id field of a page.
@@ -50,7 +47,7 @@ void btr_page_set_level(buf_block_t *block, ulint level, mtr_t *mtr)
{
ut_ad(level <= BTR_MAX_NODE_LEVEL);
constexpr uint16_t field= PAGE_HEADER + PAGE_LEVEL;
- byte *b= my_assume_aligned<2>(&block->frame[field]);
+ byte *b= my_assume_aligned<2>(&block->page.frame[field]);
if (mtr->write<2,mtr_t::MAYBE_NOP>(*block, b, level) &&
UNIV_LIKELY_NULL(block->page.zip.data))
memcpy_aligned<2>(&block->page.zip.data[field], b, 2);
@@ -63,7 +60,7 @@ void btr_page_set_level(buf_block_t *block, ulint level, mtr_t *mtr)
inline void btr_page_set_next(buf_block_t *block, ulint next, mtr_t *mtr)
{
constexpr uint16_t field= FIL_PAGE_NEXT;
- byte *b= my_assume_aligned<4>(&block->frame[field]);
+ byte *b= my_assume_aligned<4>(&block->page.frame[field]);
if (mtr->write<4,mtr_t::MAYBE_NOP>(*block, b, next) &&
UNIV_LIKELY_NULL(block->page.zip.data))
memcpy_aligned<4>(&block->page.zip.data[field], b, 4);
@@ -76,7 +73,7 @@ inline void btr_page_set_next(buf_block_t *block, ulint next, mtr_t *mtr)
inline void btr_page_set_prev(buf_block_t *block, ulint prev, mtr_t *mtr)
{
constexpr uint16_t field= FIL_PAGE_PREV;
- byte *b= my_assume_aligned<4>(&block->frame[field]);
+ byte *b= my_assume_aligned<4>(&block->page.frame[field]);
if (mtr->write<4,mtr_t::MAYBE_NOP>(*block, b, prev) &&
UNIV_LIKELY_NULL(block->page.zip.data))
memcpy_aligned<4>(&block->page.zip.data[field], b, 4);
@@ -112,38 +109,3 @@ btr_node_ptr_get_child_page_no(
return(page_no);
}
-
-/**************************************************************//**
-Releases the latches on a leaf page and bufferunfixes it. */
-UNIV_INLINE
-void
-btr_leaf_page_release(
-/*==================*/
- buf_block_t* block, /*!< in: buffer block */
- ulint latch_mode, /*!< in: BTR_SEARCH_LEAF or
- BTR_MODIFY_LEAF */
- mtr_t* mtr) /*!< in: mtr */
-{
- ut_ad(latch_mode == BTR_SEARCH_LEAF
- || latch_mode == BTR_MODIFY_LEAF
- || latch_mode == BTR_NO_LATCHES);
-
- ut_ad(!mtr->memo_contains_flagged(block, MTR_MEMO_MODIFY));
-
- mtr_memo_type_t mode;
- switch (latch_mode) {
- case BTR_SEARCH_LEAF:
- mode = MTR_MEMO_PAGE_S_FIX;
- break;
- case BTR_MODIFY_LEAF:
- mode = MTR_MEMO_PAGE_X_FIX;
- break;
- case BTR_NO_LATCHES:
- mode = MTR_MEMO_BUF_FIX;
- break;
- default:
- ut_a(0);
- }
-
- mtr->memo_release(block, mode);
-}
diff --git a/storage/innobase/include/btr0bulk.h b/storage/innobase/include/btr0bulk.h
index 943836f8759..9fcea86d95d 100644
--- a/storage/innobase/include/btr0bulk.h
+++ b/storage/innobase/include/btr0bulk.h
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 2014, 2015, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2019, 2020, MariaDB Corporation.
+Copyright (c) 2019, 2022, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -170,7 +170,7 @@ public:
inline void release();
/** Start mtr and latch block */
- inline dberr_t latch();
+ inline void latch();
/** Check if required space is available in the page for the rec
to be inserted. We check fill factor & padding here.
diff --git a/storage/innobase/include/btr0cur.h b/storage/innobase/include/btr0cur.h
index 2cc7eb726a4..f6abc9f5e52 100644
--- a/storage/innobase/include/btr0cur.h
+++ b/storage/innobase/include/btr0cur.h
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 1994, 2019, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2017, 2020, MariaDB Corporation.
+Copyright (c) 2017, 2023, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -33,6 +33,9 @@ Created 10/16/1994 Heikki Tuuri
#include "rem0types.h"
#include "gis0type.h"
#include "my_base.h"
+#ifdef BTR_CUR_HASH_ADAPT
+# include "srw_lock.h"
+#endif
/** Mode flags for btr_cur operations; these can be ORed */
enum {
@@ -60,46 +63,13 @@ enum {
BTR_KEEP_IBUF_BITMAP = 32
};
-/* btr_cur_latch_leaves() returns latched blocks and savepoints. */
-struct btr_latch_leaves_t {
- /* left block, target block and right block */
- buf_block_t* blocks[3];
- ulint savepoints[3];
-};
-
#include "que0types.h"
#include "row0types.h"
-#ifdef UNIV_DEBUG
-/*********************************************************//**
-Returns the page cursor component of a tree cursor.
-@return pointer to page cursor component */
-UNIV_INLINE
-page_cur_t*
-btr_cur_get_page_cur(
-/*=================*/
- const btr_cur_t* cursor);/*!< in: tree cursor */
-/*********************************************************//**
-Returns the buffer block on which the tree cursor is positioned.
-@return pointer to buffer block */
-UNIV_INLINE
-buf_block_t*
-btr_cur_get_block(
-/*==============*/
- const btr_cur_t* cursor);/*!< in: tree cursor */
-/*********************************************************//**
-Returns the record pointer of a tree cursor.
-@return pointer to record */
-UNIV_INLINE
-rec_t*
-btr_cur_get_rec(
-/*============*/
- const btr_cur_t* cursor);/*!< in: tree cursor */
-#else /* UNIV_DEBUG */
-# define btr_cur_get_page_cur(cursor) (&(cursor)->page_cur)
-# define btr_cur_get_block(cursor) ((cursor)->page_cur.block)
-# define btr_cur_get_rec(cursor) ((cursor)->page_cur.rec)
-#endif /* UNIV_DEBUG */
+#define btr_cur_get_page_cur(cursor) (&(cursor)->page_cur)
+#define btr_cur_get_block(cursor) ((cursor)->page_cur.block)
+#define btr_cur_get_rec(cursor) ((cursor)->page_cur.rec)
+
/*********************************************************//**
Returns the compressed page on which the tree cursor is positioned.
@return pointer to compressed page, or NULL if the page is not compressed */
@@ -120,7 +90,7 @@ btr_cur_get_page(
Returns the index of a cursor.
@param cursor b-tree cursor
@return index */
-#define btr_cur_get_index(cursor) ((cursor)->index)
+#define btr_cur_get_index(cursor) ((cursor)->index())
/*********************************************************//**
Positions a tree cursor at a given record. */
UNIV_INLINE
@@ -150,104 +120,36 @@ bool
btr_cur_instant_root_init(dict_index_t* index, const page_t* page)
ATTRIBUTE_COLD __attribute__((nonnull, warn_unused_result));
-/** Optimistically latches the leaf page or pages requested.
-@param[in] block guessed buffer block
-@param[in] modify_clock modify clock value
-@param[in,out] latch_mode BTR_SEARCH_LEAF, ...
-@param[in,out] cursor cursor
-@param[in] file file name
-@param[in] line line where called
-@param[in] mtr mini-transaction
-@return true if success */
-bool
-btr_cur_optimistic_latch_leaves(
- buf_block_t* block,
- ib_uint64_t modify_clock,
- ulint* latch_mode,
- btr_cur_t* cursor,
- const char* file,
- unsigned line,
- mtr_t* mtr);
-
-/** Searches an index tree and positions a tree cursor on a given level.
+MY_ATTRIBUTE((warn_unused_result))
+/********************************************************************//**
+Searches an index tree and positions a tree cursor on a given non-leaf level.
NOTE: n_fields_cmp in tuple must be set so that it cannot be compared
to node pointer page number fields on the upper levels of the tree!
-Note that if mode is PAGE_CUR_LE, which is used in inserts, then
cursor->up_match and cursor->low_match both will have sensible values.
-If mode is PAGE_CUR_GE, then up_match will a have a sensible value.
-@param index index
+Cursor is left at the place where an insert of the
+search tuple should be performed in the B-tree. InnoDB does an insert
+immediately after the cursor. Thus, the cursor may end up on a user record,
+or on a page infimum record.
@param level the tree level of search
@param tuple data tuple; NOTE: n_fields_cmp in tuple must be set so that
it cannot get compared to the node ptr page number field!
-@param mode PAGE_CUR_L, NOTE that if the search is made using a unique
- prefix of a record, mode should be PAGE_CUR_LE, not
- PAGE_CUR_GE, as the latter may end up on the previous page of
- the record! Inserts should always be made using PAGE_CUR_LE
- to search the position!
-@param latch_mode BTR_SEARCH_LEAF, ..., ORed with at most one of BTR_INSERT,
- BTR_DELETE_MARK, BTR_DELETE, or BTR_ESTIMATE;
- cursor->left_block is used to store a pointer to the left
- neighbor page, in the cases BTR_SEARCH_PREV and
- BTR_MODIFY_PREV; NOTE that if ahi_latch, we might not have a
- cursor page latch, we assume that ahi_latch protects the
- record!
+@param rw_latch RW_S_LATCH or RW_X_LATCH
@param cursor tree cursor; the cursor page is s- or x-latched, but see also
above!
-@param file file name
-@param line line where called
@param mtr mini-transaction
-@param autoinc PAGE_ROOT_AUTO_INC to be written (0 if none)
@return DB_SUCCESS on success or error code otherwise */
-dberr_t btr_cur_search_to_nth_level(dict_index_t *index, ulint level,
+dberr_t btr_cur_search_to_nth_level(ulint level,
const dtuple_t *tuple,
- page_cur_mode_t mode, ulint latch_mode,
- btr_cur_t *cursor, const char *file,
- unsigned line, mtr_t *mtr,
- ib_uint64_t autoinc= 0);
-
-/*****************************************************************//**
-Opens a cursor at either end of an index.
-@return DB_SUCCESS or error code */
-dberr_t
-btr_cur_open_at_index_side_func(
-/*============================*/
- bool from_left, /*!< in: true if open to the low end,
- false if to the high end */
- dict_index_t* index, /*!< in: index */
- ulint latch_mode, /*!< in: latch mode */
- btr_cur_t* cursor, /*!< in/out: cursor */
- ulint level, /*!< in: level to search for
- (0=leaf) */
- const char* file, /*!< in: file name */
- unsigned line, /*!< in: line where called */
- mtr_t* mtr) /*!< in/out: mini-transaction */
- MY_ATTRIBUTE((nonnull));
-
-#define btr_cur_open_at_index_side(f,i,l,c,lv,m) \
- btr_cur_open_at_index_side_func(f,i,l,c,lv,__FILE__,__LINE__,m)
+ rw_lock_type_t rw_latch,
+ btr_cur_t *cursor, mtr_t *mtr);
-/**********************************************************************//**
-Positions a cursor at a randomly chosen position within a B-tree.
-@return true if the index is available and we have put the cursor, false
-if the index is unavailable */
-bool
-btr_cur_open_at_rnd_pos_func(
-/*=========================*/
- dict_index_t* index, /*!< in: index */
- ulint latch_mode, /*!< in: BTR_SEARCH_LEAF, ... */
- btr_cur_t* cursor, /*!< in/out: B-tree cursor */
- const char* file, /*!< in: file name */
- unsigned line, /*!< in: line where called */
- mtr_t* mtr); /*!< in: mtr */
-#define btr_cur_open_at_rnd_pos(i,l,c,m) \
- btr_cur_open_at_rnd_pos_func(i,l,c,__FILE__,__LINE__,m)
/*************************************************************//**
Tries to perform an insert to a page in an index tree, next to cursor.
It is assumed that mtr holds an x-latch on the page. The operation does
not succeed if there is too little space on the page. If there is just
one record on the page, the insert will always succeed; this is to
prevent trying to split a page with just one record.
-@return DB_SUCCESS, DB_WAIT_LOCK, DB_FAIL, or error number */
+@return DB_SUCCESS, DB_LOCK_WAIT, DB_FAIL, or error number */
dberr_t
btr_cur_optimistic_insert(
/*======================*/
@@ -324,7 +226,6 @@ btr_cur_update_alloc_zip_func(
/*==========================*/
page_zip_des_t* page_zip,/*!< in/out: compressed page */
page_cur_t* cursor, /*!< in/out: B-tree page cursor */
- dict_index_t* index, /*!< in: the index corresponding to cursor */
#ifdef UNIV_DEBUG
rec_offs* offsets,/*!< in/out: offsets of the cursor record */
#endif /* UNIV_DEBUG */
@@ -334,11 +235,11 @@ btr_cur_update_alloc_zip_func(
mtr_t* mtr) /*!< in/out: mini-transaction */
MY_ATTRIBUTE((nonnull, warn_unused_result));
#ifdef UNIV_DEBUG
-# define btr_cur_update_alloc_zip(page_zip,cursor,index,offsets,len,cr,mtr) \
- btr_cur_update_alloc_zip_func(page_zip,cursor,index,offsets,len,cr,mtr)
+# define btr_cur_update_alloc_zip(page_zip,cursor,offsets,len,cr,mtr) \
+ btr_cur_update_alloc_zip_func(page_zip,cursor,offsets,len,cr,mtr)
#else /* UNIV_DEBUG */
-# define btr_cur_update_alloc_zip(page_zip,cursor,index,offsets,len,cr,mtr) \
- btr_cur_update_alloc_zip_func(page_zip,cursor,index,len,cr,mtr)
+# define btr_cur_update_alloc_zip(page_zip,cursor,offsets,len,cr,mtr) \
+ btr_cur_update_alloc_zip_func(page_zip,cursor,len,cr,mtr)
#endif /* UNIV_DEBUG */
/** Apply an update vector to a record. No field size changes are allowed.
@@ -468,44 +369,36 @@ that mtr holds an x-latch on the tree and on the cursor page. To avoid
deadlocks, mtr must also own x-latches to brothers of page, if those
brothers exist. NOTE: it is assumed that the caller has reserved enough
free extents so that the compression will always succeed if done!
-@return TRUE if compression occurred */
-ibool
+@return whether compression occurred */
+bool
btr_cur_compress_if_useful(
/*=======================*/
btr_cur_t* cursor, /*!< in/out: cursor on the page to compress;
- cursor does not stay valid if compression
- occurs */
- ibool adjust, /*!< in: TRUE if should adjust the
- cursor position even if compression occurs */
+ cursor does not stay valid if !adjust and
+ compression occurs */
+ bool adjust, /*!< in: whether the cursor position should be
+ adjusted even when compression occurs */
mtr_t* mtr) /*!< in/out: mini-transaction */
MY_ATTRIBUTE((nonnull));
/*******************************************************//**
Removes the record on which the tree cursor is positioned. It is assumed
that the mtr has an x-latch on the page where the cursor is positioned,
but no latch on the whole tree.
-@return TRUE if success, i.e., the page did not become too empty */
-ibool
-btr_cur_optimistic_delete_func(
-/*===========================*/
+@return error code
+@retval DB_FAIL if the page would become too empty */
+dberr_t
+btr_cur_optimistic_delete(
+/*======================*/
btr_cur_t* cursor, /*!< in: cursor on the record to delete;
cursor stays valid: if deletion succeeds,
on function exit it points to the successor
of the deleted record */
-# ifdef UNIV_DEBUG
ulint flags, /*!< in: BTR_CREATE_FLAG or 0 */
-# endif /* UNIV_DEBUG */
mtr_t* mtr) /*!< in: mtr; if this function returns
TRUE on a leaf page of a secondary
index, the mtr must be committed
before latching any further pages */
MY_ATTRIBUTE((nonnull, warn_unused_result));
-# ifdef UNIV_DEBUG
-# define btr_cur_optimistic_delete(cursor, flags, mtr) \
- btr_cur_optimistic_delete_func(cursor, flags, mtr)
-# else /* UNIV_DEBUG */
-# define btr_cur_optimistic_delete(cursor, flags, mtr) \
- btr_cur_optimistic_delete_func(cursor, mtr)
-# endif /* UNIV_DEBUG */
/*************************************************************//**
Removes the record on which the tree cursor is positioned. Tries
to compress the page if its fillfactor drops below a threshold
@@ -537,8 +430,8 @@ btr_cur_pessimistic_delete(
/** Delete the node pointer in a parent page.
@param[in,out] parent cursor pointing to parent record
@param[in,out] mtr mini-transaction */
-void btr_cur_node_ptr_delete(btr_cur_t* parent, mtr_t* mtr)
- MY_ATTRIBUTE((nonnull));
+dberr_t btr_cur_node_ptr_delete(btr_cur_t* parent, mtr_t* mtr)
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
/***********************************************************//**
Parses a redo log record of updating a record in-place.
@return end of log record or NULL */
@@ -564,47 +457,20 @@ struct btr_pos_t
page_id_t page_id; /* Out: Page where we found the tuple */
};
-/** Estimates the number of rows in a given index range.
-@param[in] index index
-@param[in/out] range_start
-@param[in/out] range_ end
-@return estimated number of rows */
-ha_rows
-btr_estimate_n_rows_in_range(
- dict_index_t* index,
- btr_pos_t* range_start,
- btr_pos_t* range_end);
-
-
-/** Statistics for one field of an index. */
-struct index_field_stats_t
-{
- ib_uint64_t n_diff_key_vals;
- ib_uint64_t n_sample_sizes;
- ib_uint64_t n_non_null_key_vals;
-
- index_field_stats_t(ib_uint64_t n_diff_key_vals= 0,
- ib_uint64_t n_sample_sizes= 0,
- ib_uint64_t n_non_null_key_vals= 0)
- : n_diff_key_vals(n_diff_key_vals), n_sample_sizes(n_sample_sizes),
- n_non_null_key_vals(n_non_null_key_vals)
- {
- }
-};
-
-/** Estimates the number of different key values in a given index, for
-each n-column prefix of the index where 1 <= n <= dict_index_get_n_unique(index).
-The estimates are stored in the array index->stat_n_diff_key_vals[] (indexed
-0..n_uniq-1) and the number of pages that were sampled is saved in
-index->stat_n_sample_sizes[].
-If innodb_stats_method is nulls_ignored, we also record the number of
-non-null values for each prefix and stored the estimates in
-array index->stat_n_non_null_key_vals.
-@param[in] index index
-@return stat vector if the index is available and we get the estimated numbers,
-empty vector if the index is unavailable. */
-std::vector<index_field_stats_t>
-btr_estimate_number_of_different_key_vals(dict_index_t* index);
+/** Estimates the number of rows in a given index range. Do search in the
+left page, then if there are pages between left and right ones, read a few
+pages to the right, if the right page is reached, fetch it and count the exact
+number of rows, otherwise count the estimated(see
+btr_estimate_n_rows_in_range_on_level() for details) number if rows, and
+fetch the right page. If leaves are reached, unlatch non-leaf pages except
+the right leaf parent. After the right leaf page is fetched, commit mtr.
+@param[in] index index
+@param[in] range_start range start
+@param[in] range_end range end
+@return estimated number of rows */
+ha_rows btr_estimate_n_rows_in_range(dict_index_t *index,
+ btr_pos_t *range_start,
+ btr_pos_t *range_end);
/** Gets the externally stored size of a record, in units of a database page.
@param[in] rec record
@@ -758,19 +624,6 @@ btr_rec_copy_externally_stored_field(
ulint* len,
mem_heap_t* heap);
-/** Latches the leaf page or pages requested.
-@param[in] block leaf page where the search converged
-@param[in] latch_mode BTR_SEARCH_LEAF, ...
-@param[in] cursor cursor
-@param[in] mtr mini-transaction
-@return blocks and savepoints which actually latched. */
-btr_latch_leaves_t
-btr_cur_latch_leaves(
- buf_block_t* block,
- ulint latch_mode,
- btr_cur_t* cursor,
- mtr_t* mtr);
-
/*######################################################################*/
/** In the pessimistic delete, if the page data size drops below this
@@ -829,24 +682,18 @@ enum btr_cur_method {
/** The tree cursor: the definition appears here only for the compiler
to know struct size! */
struct btr_cur_t {
- dict_index_t* index; /*!< index where positioned */
page_cur_t page_cur; /*!< page cursor */
purge_node_t* purge_node; /*!< purge node, for BTR_DELETE */
- buf_block_t* left_block; /*!< this field is used to store
- a pointer to the left neighbor
- page, in the cases
- BTR_SEARCH_PREV and
- BTR_MODIFY_PREV */
/*------------------------------*/
que_thr_t* thr; /*!< this field is only used
- when btr_cur_search_to_nth_level
+ when search_leaf()
is called for an index entry
insertion: the calling query
thread is passed here to be
used in the insert buffer */
/*------------------------------*/
/** The following fields are used in
- btr_cur_search_to_nth_level to pass information: */
+ search_leaf() to pass information: */
/* @{ */
enum btr_cur_method flag; /*!< Search method used */
ulint tree_height; /*!< Tree height if the search is done
@@ -855,8 +702,7 @@ struct btr_cur_t {
ulint up_match; /*!< If the search mode was PAGE_CUR_LE,
the number of matched fields to the
the first user record to the right of
- the cursor record after
- btr_cur_search_to_nth_level;
+ the cursor record after search_leaf();
for the mode PAGE_CUR_GE, the matched
fields to the first user record AT THE
CURSOR or to the right of it;
@@ -873,8 +719,7 @@ struct btr_cur_t {
ulint low_match; /*!< if search mode was PAGE_CUR_LE,
the number of matched fields to the
first user record AT THE CURSOR or
- to the left of it after
- btr_cur_search_to_nth_level;
+ to the left of it after search_leaf();
NOT defined for PAGE_CUR_GE or any
other search modes; see also the NOTE
in up_match! */
@@ -894,28 +739,45 @@ struct btr_cur_t {
information of the path through
the tree */
rtr_info_t* rtr_info; /*!< rtree search info */
- btr_cur_t():thr(NULL), rtr_info(NULL) {}
- /* default values */
- /** Zero-initialize all fields */
- void init()
- {
- index = NULL;
- memset(&page_cur, 0, sizeof page_cur);
- purge_node = NULL;
- left_block = NULL;
- thr = NULL;
- flag = btr_cur_method(0);
- tree_height = 0;
- up_match = 0;
- up_bytes = 0;
- low_match = 0;
- low_bytes = 0;
- n_fields = 0;
- n_bytes = 0;
- fold = 0;
- path_arr = NULL;
- rtr_info = NULL;
- }
+ btr_cur_t() { memset((void*) this, 0, sizeof *this); }
+
+ dict_index_t *index() const { return page_cur.index; }
+ buf_block_t *block() const { return page_cur.block; }
+
+ /** Open the cursor on the first or last record.
+ @param first true=first record, false=last record
+ @param index B-tree
+ @param latch_mode which latches to acquire
+ @param mtr mini-transaction
+ @return error code */
+ dberr_t open_leaf(bool first, dict_index_t *index, btr_latch_mode latch_mode,
+ mtr_t *mtr);
+
+ /** Search the leaf page record corresponding to a key.
+ @param tuple key to search for, with correct n_fields_cmp
+ @param mode search mode; PAGE_CUR_LE for unique prefix or for inserting
+ @param latch_mode latch mode
+ @param mtr mini-transaction
+ @return error code */
+ dberr_t search_leaf(const dtuple_t *tuple, page_cur_mode_t mode,
+ btr_latch_mode latch_mode, mtr_t *mtr);
+
+ /** Search the leaf page record corresponding to a key, exclusively latching
+ all sibling pages on the way.
+ @param tuple key to search for, with correct n_fields_cmp
+ @param mode search mode; PAGE_CUR_LE for unique prefix or for inserting
+ @param mtr mini-transaction
+ @return error code */
+ dberr_t pessimistic_search_leaf(const dtuple_t *tuple, page_cur_mode_t mode,
+ mtr_t *mtr);
+
+ /** Open the cursor at a random leaf page record.
+ @param offsets temporary memory for rec_get_offsets()
+ @param heap memory heap for rec_get_offsets()
+ @param mtr mini-transaction
+ @return error code */
+ inline dberr_t open_random_leaf(rec_offs *&offsets, mem_heap_t *& heap,
+ mtr_t &mtr);
};
/** Modify the delete-mark flag of a record.
@@ -932,9 +794,9 @@ is still a good change of success a little later. Try this many
times. */
#define BTR_CUR_RETRY_DELETE_N_TIMES 100
/** If pessimistic delete fails because of lack of file space, there
-is still a good change of success a little later. Sleep this many
-microseconds between retries. */
-#define BTR_CUR_RETRY_SLEEP_TIME 50000
+is still a good chance of success a little later. Sleep this time
+between retries. */
+static const std::chrono::milliseconds BTR_CUR_RETRY_SLEEP_TIME(50);
/** The reference in a field for which data is stored on a different page.
The reference is at the end of the 'locally' stored part of the field.
@@ -967,16 +829,16 @@ earlier version of the row. In rollback we are not allowed to free an
inherited external field. */
#define BTR_EXTERN_INHERITED_FLAG 64U
-/** Number of searches down the B-tree in btr_cur_search_to_nth_level(). */
-extern Atomic_counter<ulint> btr_cur_n_non_sea;
+#ifdef BTR_CUR_HASH_ADAPT
+/** Number of searches down the B-tree in btr_cur_t::search_leaf(). */
+extern ib_counter_t<ulint, ib_counter_element_t> btr_cur_n_non_sea;
/** Old value of btr_cur_n_non_sea. Copied by
srv_refresh_innodb_monitor_stats(). Referenced by
srv_printf_innodb_monitor(). */
extern ulint btr_cur_n_non_sea_old;
-#ifdef BTR_CUR_HASH_ADAPT
/** Number of successful adaptive hash index lookups in
-btr_cur_search_to_nth_level(). */
-extern ulint btr_cur_n_sea;
+btr_cur_t::search_leaf(). */
+extern ib_counter_t<ulint, ib_counter_element_t> btr_cur_n_sea;
/** Old value of btr_cur_n_sea. Copied by
srv_refresh_innodb_monitor_stats(). Referenced by
srv_printf_innodb_monitor(). */
diff --git a/storage/innobase/include/btr0cur.inl b/storage/innobase/include/btr0cur.inl
index 8a45b714936..955cf34288e 100644
--- a/storage/innobase/include/btr0cur.inl
+++ b/storage/innobase/include/btr0cur.inl
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 1994, 2015, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2018, 2020, MariaDB Corporation.
+Copyright (c) 2018, 2022, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -36,44 +36,6 @@ if (btr_cur_limit_optimistic_insert_debug > 1\
# define LIMIT_OPTIMISTIC_INSERT_DEBUG(NREC, CODE)
#endif /* UNIV_DEBUG */
-#ifdef UNIV_DEBUG
-/*********************************************************//**
-Returns the page cursor component of a tree cursor.
-@return pointer to page cursor component */
-UNIV_INLINE
-page_cur_t*
-btr_cur_get_page_cur(
-/*=================*/
- const btr_cur_t* cursor) /*!< in: tree cursor */
-{
- return(&((btr_cur_t*) cursor)->page_cur);
-}
-
-/*********************************************************//**
-Returns the buffer block on which the tree cursor is positioned.
-@return pointer to buffer block */
-UNIV_INLINE
-buf_block_t*
-btr_cur_get_block(
-/*==============*/
- const btr_cur_t* cursor) /*!< in: tree cursor */
-{
- return(page_cur_get_block(btr_cur_get_page_cur(cursor)));
-}
-
-/*********************************************************//**
-Returns the record pointer of a tree cursor.
-@return pointer to record */
-UNIV_INLINE
-rec_t*
-btr_cur_get_rec(
-/*============*/
- const btr_cur_t* cursor) /*!< in: tree cursor */
-{
- return(page_cur_get_rec(btr_cur_get_page_cur(cursor)));
-}
-#endif /* UNIV_DEBUG */
-
/*********************************************************//**
Returns the compressed page on which the tree cursor is positioned.
@return pointer to compressed page, or NULL if the page is not compressed */
@@ -109,11 +71,8 @@ btr_cur_position(
buf_block_t* block, /*!< in: buffer block of rec */
btr_cur_t* cursor) /*!< out: cursor */
{
- ut_ad(page_align(rec) == block->frame);
-
page_cur_position(rec, block, btr_cur_get_page_cur(cursor));
-
- cursor->index = index;
+ cursor->page_cur.index = index;
}
/*********************************************************************//**
@@ -139,14 +98,14 @@ btr_cur_compress_recommendation(
if (!page_has_siblings(page)
|| page_get_data_size(page)
- < BTR_CUR_PAGE_COMPRESS_LIMIT(cursor->index)) {
+ < BTR_CUR_PAGE_COMPRESS_LIMIT(cursor->index())) {
/* The page fillfactor has dropped below a predefined
minimum value OR the level in the B-tree contains just
one page: we recommend compression if this is not the
root page. */
- return cursor->index->page
+ return cursor->index()->page
!= btr_cur_get_block(cursor)->page.id().page_no();
}
@@ -174,14 +133,14 @@ btr_cur_can_delete_without_compress(
if (!page_has_siblings(page) || page_get_n_recs(page) < 2
|| page_get_data_size(page) - rec_size
- < BTR_CUR_PAGE_COMPRESS_LIMIT(cursor->index)) {
+ < BTR_CUR_PAGE_COMPRESS_LIMIT(cursor->index())) {
/* The page fillfactor will drop below a predefined
minimum value, OR the level in the B-tree contains just
one page, OR the page will become empty: we recommend
compression if this is not the root page. */
- return cursor->index->page
+ return cursor->index()->page
== btr_cur_get_block(cursor)->page.id().page_no();
}
diff --git a/storage/innobase/include/btr0defragment.h b/storage/innobase/include/btr0defragment.h
index a9212db0e04..0523829bdc3 100644
--- a/storage/innobase/include/btr0defragment.h
+++ b/storage/innobase/include/btr0defragment.h
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (C) 2013, 2014 Facebook, Inc. All Rights Reserved.
-Copyright (C) 2014, 2020, MariaDB Corporation.
+Copyright (C) 2014, 2021, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -43,13 +43,11 @@ Check whether the given index is in btr_defragment_wq. */
bool
btr_defragment_find_index(
dict_index_t* index); /*!< Index to find. */
-/******************************************************************//**
-Add an index to btr_defragment_wq. Return a pointer to os_event if this
-is a synchronized defragmentation. */
-os_event_t
-btr_defragment_add_index(
- dict_index_t* index, /*!< index to be added */
- dberr_t* err); /*!< out: error code */
+/** Defragment an index.
+@param pcur persistent cursor
+@param thd current session, for checking thd_killed()
+@return whether the operation was interrupted */
+bool btr_defragment_add_index(btr_pcur_t *pcur, THD *thd);
/******************************************************************//**
When table is dropped, this function is called to mark a table as removed in
btr_efragment_wq. The difference between this function and the remove_index
@@ -57,17 +55,9 @@ function is this will not NULL the event. */
void
btr_defragment_remove_table(
dict_table_t* table); /*!< Index to be removed. */
-/******************************************************************//**
-Mark an index as removed from btr_defragment_wq. */
-void
-btr_defragment_remove_index(
- dict_index_t* index); /*!< Index to be removed. */
/*********************************************************************//**
Check whether we should save defragmentation statistics to persistent storage.*/
-UNIV_INTERN
-void
-btr_defragment_save_defrag_stats_if_needed(
- dict_index_t* index); /*!< in: index */
+void btr_defragment_save_defrag_stats_if_needed(dict_index_t *index);
/* Stop defragmentation.*/
void btr_defragment_end();
diff --git a/storage/innobase/include/btr0pcur.h b/storage/innobase/include/btr0pcur.h
index 584cc143359..c66a3bfa329 100644
--- a/storage/innobase/include/btr0pcur.h
+++ b/storage/innobase/include/btr0pcur.h
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2017, 2021, MariaDB Corporation.
+Copyright (c) 2017, 2023, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -24,8 +24,7 @@ The index tree persistent cursor
Created 2/23/1996 Heikki Tuuri
*******************************************************/
-#ifndef btr0pcur_h
-#define btr0pcur_h
+#pragma once
#include "dict0dict.h"
#include "btr0cur.h"
@@ -47,13 +46,6 @@ of a scroll cursor easier */
};
/**************************************************************//**
-Allocates memory for a persistent cursor object and initializes the cursor.
-@return own: persistent cursor */
-btr_pcur_t*
-btr_pcur_create_for_mysql(void);
-/*============================*/
-
-/**************************************************************//**
Resets a persistent cursor object, freeing ::old_rec_buf if it is
allocated and resetting the other members to their initial values. */
void
@@ -62,12 +54,6 @@ btr_pcur_reset(
btr_pcur_t* cursor);/*!< in, out: persistent cursor */
/**************************************************************//**
-Frees the memory for a persistent cursor object. */
-void
-btr_pcur_free_for_mysql(
-/*====================*/
- btr_pcur_t* cursor); /*!< in, own: persistent cursor */
-/**************************************************************//**
Copies the stored position of a pcur to another pcur. */
void
btr_pcur_copy_stored_position(
@@ -84,79 +70,22 @@ btr_pcur_init(
/*==========*/
btr_pcur_t* pcur); /*!< in: persistent cursor */
-/** Free old_rec_buf.
-@param[in] pcur Persistent cursor holding old_rec to be freed. */
-UNIV_INLINE
-void
-btr_pcur_free(
- btr_pcur_t* pcur);
-
-/**************************************************************//**
-Initializes and opens a persistent cursor to an index tree. It should be
-closed with btr_pcur_close. */
-UNIV_INLINE
-dberr_t
-btr_pcur_open_low(
-/*==============*/
- dict_index_t* index, /*!< in: index */
- ulint level, /*!< in: level in the btree */
- const dtuple_t* tuple, /*!< in: tuple on which search done */
- page_cur_mode_t mode, /*!< in: PAGE_CUR_L, ...;
- NOTE that if the search is made using a unique
- prefix of a record, mode should be
- PAGE_CUR_LE, not PAGE_CUR_GE, as the latter
- may end up on the previous page from the
- record! */
- ulint latch_mode,/*!< in: BTR_SEARCH_LEAF, ... */
- btr_pcur_t* cursor, /*!< in: memory buffer for persistent cursor */
- const char* file, /*!< in: file name */
- unsigned line, /*!< in: line where called */
- ib_uint64_t autoinc,/*!< in: PAGE_ROOT_AUTO_INC to be written
- (0 if none) */
- mtr_t* mtr); /*!< in: mtr */
-#define btr_pcur_open(i,t,md,l,c,m) \
- btr_pcur_open_low(i,0,t,md,l,c,__FILE__,__LINE__,0,m)
/** Opens an persistent cursor to an index tree without initializing the
cursor.
-@param index index
@param tuple tuple on which search done
@param mode PAGE_CUR_L, ...; NOTE that if the search is made using a
unique prefix of a record, mode should be PAGE_CUR_LE, not
PAGE_CUR_GE, as the latter may end up on the previous page of
the record!
-@param latch_mode BTR_SEARCH_LEAF, ...; NOTE that if ahi_latch then we might
- not acquire a cursor page latch, but assume that the
- ahi_latch protects the record!
+@param latch_mode BTR_SEARCH_LEAF, ...
@param cursor memory buffer for persistent cursor
-@param file file name
-@param line line where called
-@param mtr mtr
+@param mtr mini-transaction
@return DB_SUCCESS on success or error code otherwise. */
-UNIV_INLINE
-dberr_t btr_pcur_open_with_no_init_func(dict_index_t *index,
- const dtuple_t *tuple,
- page_cur_mode_t mode, ulint latch_mode,
- btr_pcur_t *cursor, const char *file,
- unsigned line, mtr_t *mtr);
-# define btr_pcur_open_with_no_init(ix,t,md,l,cur,m) \
- btr_pcur_open_with_no_init_func(ix,t,md,l,cur,__FILE__,__LINE__,m)
-
-/*****************************************************************//**
-Opens a persistent cursor at either end of an index. */
-UNIV_INLINE
-dberr_t
-btr_pcur_open_at_index_side(
-/*========================*/
- bool from_left, /*!< in: true if open to the low end,
- false if to the high end */
- dict_index_t* index, /*!< in: index */
- ulint latch_mode, /*!< in: latch mode */
- btr_pcur_t* pcur, /*!< in/out: cursor */
- bool init_pcur, /*!< in: whether to initialize pcur */
- ulint level, /*!< in: level to search for
- (0=leaf) */
- mtr_t* mtr) /*!< in/out: mini-transaction */
- MY_ATTRIBUTE((nonnull));
+inline
+dberr_t btr_pcur_open_with_no_init(const dtuple_t *tuple, page_cur_mode_t mode,
+ btr_latch_mode latch_mode,
+ btr_pcur_t *cursor, mtr_t *mtr);
+
/**************************************************************//**
Gets the up_match value for a pcur after a search.
@return number of matched fields at the cursor or to the right if
@@ -175,44 +104,7 @@ ulint
btr_pcur_get_low_match(
/*===================*/
const btr_pcur_t* cursor); /*!< in: persistent cursor */
-/**************************************************************//**
-If mode is PAGE_CUR_G or PAGE_CUR_GE, opens a persistent cursor on the first
-user record satisfying the search condition, in the case PAGE_CUR_L or
-PAGE_CUR_LE, on the last user record. If no such user record exists, then
-in the first case sets the cursor after last in tree, and in the latter case
-before first in tree. The latching mode must be BTR_SEARCH_LEAF or
-BTR_MODIFY_LEAF. */
-void
-btr_pcur_open_on_user_rec_func(
-/*===========================*/
- dict_index_t* index, /*!< in: index */
- const dtuple_t* tuple, /*!< in: tuple on which search done */
- page_cur_mode_t mode, /*!< in: PAGE_CUR_L, ... */
- ulint latch_mode, /*!< in: BTR_SEARCH_LEAF or
- BTR_MODIFY_LEAF */
- btr_pcur_t* cursor, /*!< in: memory buffer for persistent
- cursor */
- const char* file, /*!< in: file name */
- unsigned line, /*!< in: line where called */
- mtr_t* mtr); /*!< in: mtr */
-#define btr_pcur_open_on_user_rec(i,t,md,l,c,m) \
- btr_pcur_open_on_user_rec_func(i,t,md,l,c,__FILE__,__LINE__,m)
-/**********************************************************************//**
-Positions a cursor at a randomly chosen position within a B-tree.
-@return true if the index is available and we have put the cursor, false
-if the index is unavailable */
-UNIV_INLINE
-bool
-btr_pcur_open_at_rnd_pos_func(
-/*==========================*/
- dict_index_t* index, /*!< in: index */
- ulint latch_mode, /*!< in: BTR_SEARCH_LEAF, ... */
- btr_pcur_t* cursor, /*!< in/out: B-tree pcur */
- const char* file, /*!< in: file name */
- unsigned line, /*!< in: line where called */
- mtr_t* mtr); /*!< in: mtr */
-#define btr_pcur_open_at_rnd_pos(i,l,c,m) \
- btr_pcur_open_at_rnd_pos_func(i,l,c,__FILE__,__LINE__,m)
+
/**************************************************************//**
Frees the possible memory heap of a persistent cursor and sets the latch
mode of the persistent cursor to BTR_NO_LATCHES.
@@ -222,9 +114,7 @@ cursor is currently positioned. The latch is acquired by the
are not allowed, you must take care (if using the cursor in S-mode) to
manually release the latch by either calling
btr_leaf_page_release(btr_pcur_get_block(&pcur), pcur.latch_mode, mtr)
-or by committing the mini-transaction right after btr_pcur_close().
-A subsequent attempt to crawl the same page in the same mtr would cause
-an assertion failure. */
+or by mtr_t::commit(). */
UNIV_INLINE
void
btr_pcur_close(
@@ -242,9 +132,6 @@ btr_pcur_store_position(
/*====================*/
btr_pcur_t* cursor, /*!< in: persistent cursor */
mtr_t* mtr); /*!< in: mtr */
-
-#define btr_pcur_restore_position(l,cur,mtr) \
- (cur)->restore_position(l,__FILE__,__LINE__,mtr)
/*********************************************************//**
Gets the rel_pos field for a cursor whose position has been stored.
@return BTR_PCUR_ON, ... */
@@ -293,13 +180,14 @@ btr_pcur_move_to_next(
/*********************************************************//**
Moves the persistent cursor to the previous record in the tree. If no records
are left, the cursor stays 'before first in tree'.
-@return TRUE if the cursor was not before first in tree */
-ibool
+@return true if the cursor was not before first in tree */
+bool
btr_pcur_move_to_prev(
/*==================*/
btr_pcur_t* cursor, /*!< in: persistent cursor; NOTE that the
function may release the page latch */
- mtr_t* mtr); /*!< in: mtr */
+ mtr_t* mtr) /*!< in: mtr */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
/*********************************************************//**
Moves the persistent cursor to the next user record in the tree. If no user
records are left, the cursor ends up 'after last in tree'.
@@ -316,60 +204,18 @@ Moves the persistent cursor to the first record on the next page.
Releases the latch on the current page, and bufferunfixes it.
Note that there must not be modifications on the current page,
as then the x-latch can be released only in mtr_commit. */
-void
+dberr_t
btr_pcur_move_to_next_page(
/*=======================*/
btr_pcur_t* cursor, /*!< in: persistent cursor; must be on the
last record of the current page */
- mtr_t* mtr); /*!< in: mtr */
-#ifdef UNIV_DEBUG
-/*********************************************************//**
-Returns the btr cursor component of a persistent cursor.
-@return pointer to btr cursor component */
-UNIV_INLINE
-btr_cur_t*
-btr_pcur_get_btr_cur(
-/*=================*/
- const btr_pcur_t* cursor); /*!< in: persistent cursor */
-/*********************************************************//**
-Returns the page cursor component of a persistent cursor.
-@return pointer to page cursor component */
-UNIV_INLINE
-page_cur_t*
-btr_pcur_get_page_cur(
-/*==================*/
- const btr_pcur_t* cursor); /*!< in: persistent cursor */
-/*********************************************************//**
-Returns the page of a persistent cursor.
-@return pointer to the page */
-UNIV_INLINE
-page_t*
-btr_pcur_get_page(
-/*==============*/
- const btr_pcur_t* cursor);/*!< in: persistent cursor */
-/*********************************************************//**
-Returns the buffer block of a persistent cursor.
-@return pointer to the block */
-UNIV_INLINE
-buf_block_t*
-btr_pcur_get_block(
-/*===============*/
- const btr_pcur_t* cursor);/*!< in: persistent cursor */
-/*********************************************************//**
-Returns the record of a persistent cursor.
-@return pointer to the record */
-UNIV_INLINE
-rec_t*
-btr_pcur_get_rec(
-/*=============*/
- const btr_pcur_t* cursor);/*!< in: persistent cursor */
-#else /* UNIV_DEBUG */
-# define btr_pcur_get_btr_cur(cursor) (&(cursor)->btr_cur)
-# define btr_pcur_get_page_cur(cursor) (&(cursor)->btr_cur.page_cur)
-# define btr_pcur_get_page(cursor) ((cursor)->btr_cur.page_cur.block->frame)
-# define btr_pcur_get_block(cursor) ((cursor)->btr_cur.page_cur.block)
-# define btr_pcur_get_rec(cursor) ((cursor)->btr_cur.page_cur.rec)
-#endif /* UNIV_DEBUG */
+ mtr_t* mtr) /*!< in: mtr */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+#define btr_pcur_get_btr_cur(cursor) (&(cursor)->btr_cur)
+#define btr_pcur_get_page_cur(cursor) (&(cursor)->btr_cur.page_cur)
+#define btr_pcur_get_page(cursor) btr_pcur_get_block(cursor)->page.frame
+
/*********************************************************//**
Checks if the persistent cursor is on a user record. */
UNIV_INLINE
@@ -401,17 +247,19 @@ static inline bool btr_pcur_is_before_first_in_tree(btr_pcur_t* cursor);
Checks if the persistent cursor is after the last user record in
the index tree. */
static inline bool btr_pcur_is_after_last_in_tree(btr_pcur_t* cursor);
+MY_ATTRIBUTE((nonnull, warn_unused_result))
/*********************************************************//**
Moves the persistent cursor to the next record on the same page. */
UNIV_INLINE
-void
+rec_t*
btr_pcur_move_to_next_on_page(
/*==========================*/
btr_pcur_t* cursor);/*!< in/out: persistent cursor */
+MY_ATTRIBUTE((nonnull, warn_unused_result))
/*********************************************************//**
Moves the persistent cursor to the previous record on the same page. */
UNIV_INLINE
-void
+rec_t*
btr_pcur_move_to_prev_on_page(
/*==========================*/
btr_pcur_t* cursor);/*!< in/out: persistent cursor */
@@ -448,103 +296,164 @@ enum pcur_pos_t {
/* The persistent B-tree cursor structure. This is used mainly for SQL
selects, updates, and deletes. */
-struct btr_pcur_t{
- /** Return value of restore_position() */
- enum restore_status {
- /** cursor position on user rec and points on the record with
- the same field values as in the stored record */
- SAME_ALL,
- /** cursor position is on user rec and points on the record with
- the same unique field values as in the stored record */
- SAME_UNIQ,
- /** cursor position is not on user rec or points on the record
- with not the same uniq field values as in the stored record */
- NOT_SAME
- };
- /** a B-tree cursor */
- btr_cur_t btr_cur;
- /** see TODO note below!
- BTR_SEARCH_LEAF, BTR_MODIFY_LEAF, BTR_MODIFY_TREE or BTR_NO_LATCHES,
- depending on the latching state of the page and tree where the cursor
- is positioned; BTR_NO_LATCHES means that the cursor is not currently
- positioned:
- we say then that the cursor is detached; it can be restored to
- attached if the old position was stored in old_rec */
- ulint latch_mode;
- /** true if old_rec is stored */
- bool old_stored;
- /** if cursor position is stored, contains an initial segment of the
- latest record cursor was positioned either on, before or after */
- rec_t* old_rec;
- /** btr_cur.index->n_core_fields when old_rec was copied */
- uint16 old_n_core_fields;
- /** number of fields in old_rec */
- uint16 old_n_fields;
- /** BTR_PCUR_ON, BTR_PCUR_BEFORE, or BTR_PCUR_AFTER, depending on
- whether cursor was on, before, or after the old_rec record */
- enum btr_pcur_pos_t rel_pos;
- /** buffer block when the position was stored */
- buf::Block_hint block_when_stored;
- /** the modify clock value of the buffer block when the cursor position
- was stored */
- ib_uint64_t modify_clock;
- /** btr_pcur_store_position() and btr_pcur_restore_position() state. */
- enum pcur_pos_t pos_state;
- /** PAGE_CUR_G, ... */
- page_cur_mode_t search_mode;
- /** the transaction, if we know it; otherwise this field is not defined;
- can ONLY BE USED in error prints in fatal assertion failures! */
- trx_t* trx_if_known;
- /*-----------------------------*/
- /* NOTE that the following fields may possess dynamically allocated
- memory which should be freed if not needed anymore! */
-
- /** NULL, or a dynamically allocated buffer for old_rec */
- byte* old_rec_buf;
- /** old_rec_buf size if old_rec_buf is not NULL */
- ulint buf_size;
-
- btr_pcur_t() :
- btr_cur(), latch_mode(RW_NO_LATCH),
- old_stored(false), old_rec(NULL),
- old_n_fields(0), rel_pos(btr_pcur_pos_t(0)),
- block_when_stored(),
- modify_clock(0), pos_state(BTR_PCUR_NOT_POSITIONED),
- search_mode(PAGE_CUR_UNSUPP), trx_if_known(NULL),
- old_rec_buf(NULL), buf_size(0)
- {
- btr_cur.init();
- }
-
- /** Return the index of this persistent cursor */
- dict_index_t* index() const { return(btr_cur.index); }
- /** Restores the stored position of a persistent cursor bufferfixing
- the page and obtaining the specified latches. If the cursor position
- was saved when the
- (1) cursor was positioned on a user record: this function restores the
- position to the last record LESS OR EQUAL to the stored record;
- (2) cursor was positioned on a page infimum record: restores the
- position to the last record LESS than the user record which was the
- successor of the page infimum;
- (3) cursor was positioned on the page supremum: restores to the first
- record GREATER than the user record which was the predecessor of the
- supremum.
- (4) cursor was positioned before the first or after the last in an
- empty tree: restores to before first or after the last in the tree.
- @param latch_mode BTR_SEARCH_LEAF, ...
- @param file file name
- @param line line where called
- @param mtr mtr
- @return btr_pcur_t::SAME_ALL cursor position on user rec and points on
- the record with the same field values as in the stored record,
- btr_pcur_t::SAME_UNIQ cursor position is on user rec and points on the
- record with the same unique field values as in the stored record,
- btr_pcur_t::NOT_SAME cursor position is not on user rec or points on
- the record with not the samebuniq field values as in the stored */
- restore_status restore_position(ulint latch_mode, const char *file,
- unsigned line, mtr_t *mtr);
+struct btr_pcur_t
+{
+ /** Return value of restore_position() */
+ enum restore_status {
+ /** cursor position on user rec and points on the record with
+ the same field values as in the stored record */
+ SAME_ALL,
+ /** cursor position is on user rec and points on the record with
+ the same unique field values as in the stored record */
+ SAME_UNIQ,
+ /** cursor position is not on user rec or points on the record
+  with not the same unique field values as in the stored record */
+ NOT_SAME,
+ /** the index tree is corrupted */
+ CORRUPTED
+ };
+ /** a B-tree cursor */
+ btr_cur_t btr_cur;
+ /** @see BTR_PCUR_WAS_POSITIONED
+ BTR_SEARCH_LEAF, BTR_MODIFY_LEAF, BTR_MODIFY_TREE or BTR_NO_LATCHES,
+ depending on the latching state of the page and tree where the cursor
+ is positioned; BTR_NO_LATCHES means that the cursor is not currently
+ positioned:
+ we say then that the cursor is detached; it can be restored to
+ attached if the old position was stored in old_rec */
+ btr_latch_mode latch_mode= BTR_NO_LATCHES;
+ /** if cursor position is stored, contains an initial segment of the
+ latest record cursor was positioned either on, before or after */
+ rec_t *old_rec= nullptr;
+ /** btr_cur.index()->n_core_fields when old_rec was copied */
+ uint16 old_n_core_fields= 0;
+ /** number of fields in old_rec */
+ uint16 old_n_fields= 0;
+ /** BTR_PCUR_ON, BTR_PCUR_BEFORE, or BTR_PCUR_AFTER, depending on
+ whether cursor was on, before, or after the old_rec record */
+ btr_pcur_pos_t rel_pos= btr_pcur_pos_t(0);
+ /** buffer block when the position was stored */
+ buf::Block_hint block_when_stored;
+ /** the modify clock value of the buffer block when the cursor position
+ was stored */
+ ib_uint64_t modify_clock= 0;
+ /** btr_pcur_store_position() and restore_position() state. */
+ enum pcur_pos_t pos_state= BTR_PCUR_NOT_POSITIONED;
+ page_cur_mode_t search_mode= PAGE_CUR_UNSUPP;
+ /** the transaction, if we know it; otherwise this field is not defined;
+ can ONLY BE USED in error prints in fatal assertion failures! */
+ trx_t *trx_if_known= nullptr;
+ /** a dynamically allocated buffer for old_rec */
+ byte *old_rec_buf= nullptr;
+ /** old_rec_buf size if old_rec_buf is not NULL */
+ ulint buf_size= 0;
+
+ /** Return the index of this persistent cursor */
+ dict_index_t *index() const { return(btr_cur.index()); }
+ MY_ATTRIBUTE((nonnull, warn_unused_result))
+ /** Restores the stored position of a persistent cursor bufferfixing
+ the page and obtaining the specified latches. If the cursor position
+ was saved when the
+ (1) cursor was positioned on a user record: this function restores the
+ position to the last record LESS OR EQUAL to the stored record;
+ (2) cursor was positioned on a page infimum record: restores the
+ position to the last record LESS than the user record which was the
+ successor of the page infimum;
+ (3) cursor was positioned on the page supremum: restores to the first
+ record GREATER than the user record which was the predecessor of the
+ supremum.
+ (4) cursor was positioned before the first or after the last in an
+ empty tree: restores to before first or after the last in the tree.
+ @param latch_mode BTR_SEARCH_LEAF, ...
+ @param mtr mini-transaction
+ @retval SAME_ALL cursor position on user rec and points on
+ the record with the same field values as in the stored record,
+ @retval SAME_UNIQ cursor position is on user rec and points on the
+ record with the same unique field values as in the stored record,
+ @retval NOT_SAME cursor position is not on user rec or points on
+  the record with not the same unique field values as in the stored
+ @retval CORRUPTED if the index is corrupted */
+ restore_status restore_position(btr_latch_mode latch_mode, mtr_t *mtr);
+
+ /** Open the cursor on the first or last record.
+ @param first true=first record, false=last record
+ @param index B-tree
+ @param latch_mode which latches to acquire
+ @param mtr mini-transaction
+ @return error code */
+ dberr_t open_leaf(bool first, dict_index_t *index, btr_latch_mode latch_mode,
+ mtr_t *mtr)
+
+ {
+ this->latch_mode= BTR_LATCH_MODE_WITHOUT_FLAGS(latch_mode);
+ search_mode= first ? PAGE_CUR_G : PAGE_CUR_L;
+ pos_state= BTR_PCUR_IS_POSITIONED;
+ old_rec= nullptr;
+
+ return btr_cur.open_leaf(first, index, this->latch_mode, mtr);
+ }
};
-#include "btr0pcur.inl"
+inline buf_block_t *btr_pcur_get_block(btr_pcur_t *cursor)
+{
+ ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
+ return cursor->btr_cur.page_cur.block;
+}
-#endif
+inline const buf_block_t *btr_pcur_get_block(const btr_pcur_t *cursor)
+{
+ ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
+ return cursor->btr_cur.page_cur.block;
+}
+
+inline rec_t *btr_pcur_get_rec(const btr_pcur_t *cursor)
+{
+ ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
+ ut_ad(cursor->latch_mode != BTR_NO_LATCHES);
+ return cursor->btr_cur.page_cur.rec;
+}
+
+/**************************************************************//**
+Initializes and opens a persistent cursor to an index tree. */
+inline
+dberr_t
+btr_pcur_open(
+ const dtuple_t* tuple, /*!< in: tuple on which search done */
+ page_cur_mode_t mode, /*!< in: PAGE_CUR_LE, ... */
+ btr_latch_mode latch_mode,/*!< in: BTR_SEARCH_LEAF, ... */
+ btr_pcur_t* cursor, /*!< in: memory buffer for persistent cursor */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ cursor->latch_mode= BTR_LATCH_MODE_WITHOUT_FLAGS(latch_mode);
+ cursor->search_mode= mode;
+ cursor->pos_state= BTR_PCUR_IS_POSITIONED;
+ cursor->trx_if_known= nullptr;
+ return cursor->btr_cur.search_leaf(tuple, mode, latch_mode, mtr);
+}
+
+/** Open a cursor on the first user record satisfying the search condition;
+in case of no match, after the last index record. */
+MY_ATTRIBUTE((nonnull, warn_unused_result))
+inline
+dberr_t
+btr_pcur_open_on_user_rec(
+ const dtuple_t* tuple, /*!< in: tuple on which search done */
+ btr_latch_mode latch_mode, /*!< in: BTR_SEARCH_LEAF or
+ BTR_MODIFY_LEAF */
+ btr_pcur_t* cursor, /*!< in: memory buffer for persistent
+ cursor */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ut_ad(latch_mode == BTR_SEARCH_LEAF || latch_mode == BTR_MODIFY_LEAF);
+ if (dberr_t err=
+ btr_pcur_open(tuple, PAGE_CUR_GE, latch_mode, cursor, mtr))
+ return err;
+ if (!btr_pcur_is_after_last_on_page(cursor) ||
+ btr_pcur_is_after_last_in_tree(cursor))
+ return DB_SUCCESS;
+ if (dberr_t err= btr_pcur_move_to_next_page(cursor, mtr))
+ return err;
+ return btr_pcur_move_to_next_on_page(cursor) ? DB_SUCCESS : DB_CORRUPTION;
+}
+
+#include "btr0pcur.inl"
diff --git a/storage/innobase/include/btr0pcur.inl b/storage/innobase/include/btr0pcur.inl
index 05f61b903ff..b827d70dc47 100644
--- a/storage/innobase/include/btr0pcur.inl
+++ b/storage/innobase/include/btr0pcur.inl
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 1996, 2015, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2015, 2020, MariaDB Corporation.
+Copyright (c) 2015, 2023, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -36,83 +36,12 @@ btr_pcur_get_rel_pos(
{
ut_ad(cursor);
ut_ad(cursor->old_rec);
- ut_ad(cursor->old_stored);
ut_ad(cursor->pos_state == BTR_PCUR_WAS_POSITIONED
|| cursor->pos_state == BTR_PCUR_IS_POSITIONED);
return(cursor->rel_pos);
}
-#ifdef UNIV_DEBUG
-/*********************************************************//**
-Returns the btr cursor component of a persistent cursor.
-@return pointer to btr cursor component */
-UNIV_INLINE
-btr_cur_t*
-btr_pcur_get_btr_cur(
-/*=================*/
- const btr_pcur_t* cursor) /*!< in: persistent cursor */
-{
- const btr_cur_t* btr_cur = &cursor->btr_cur;
- return((btr_cur_t*) btr_cur);
-}
-
-/*********************************************************//**
-Returns the page cursor component of a persistent cursor.
-@return pointer to page cursor component */
-UNIV_INLINE
-page_cur_t*
-btr_pcur_get_page_cur(
-/*==================*/
- const btr_pcur_t* cursor) /*!< in: persistent cursor */
-{
- return(btr_cur_get_page_cur(btr_pcur_get_btr_cur(cursor)));
-}
-
-/*********************************************************//**
-Returns the page of a persistent cursor.
-@return pointer to the page */
-UNIV_INLINE
-page_t*
-btr_pcur_get_page(
-/*==============*/
- const btr_pcur_t* cursor) /*!< in: persistent cursor */
-{
- ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
-
- return(btr_cur_get_page(btr_pcur_get_btr_cur(cursor)));
-}
-
-/*********************************************************//**
-Returns the buffer block of a persistent cursor.
-@return pointer to the block */
-UNIV_INLINE
-buf_block_t*
-btr_pcur_get_block(
-/*===============*/
- const btr_pcur_t* cursor) /*!< in: persistent cursor */
-{
- ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
-
- return(btr_cur_get_block(btr_pcur_get_btr_cur(cursor)));
-}
-
-/*********************************************************//**
-Returns the record of a persistent cursor.
-@return pointer to the record */
-UNIV_INLINE
-rec_t*
-btr_pcur_get_rec(
-/*=============*/
- const btr_pcur_t* cursor) /*!< in: persistent cursor */
-{
- ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
- ut_ad(cursor->latch_mode != BTR_NO_LATCHES);
-
- return(btr_cur_get_rec(btr_pcur_get_btr_cur(cursor)));
-}
-#endif /* UNIV_DEBUG */
-
/**************************************************************//**
Gets the up_match value for a pcur after a search.
@return number of matched fields at the cursor or to the right if
@@ -194,16 +123,8 @@ btr_pcur_is_on_user_rec(
/*====================*/
const btr_pcur_t* cursor) /*!< in: persistent cursor */
{
- ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
- ut_ad(cursor->latch_mode != BTR_NO_LATCHES);
-
- if (btr_pcur_is_before_first_on_page(cursor)
- || btr_pcur_is_after_last_on_page(cursor)) {
-
- return(FALSE);
- }
-
- return(TRUE);
+ return !btr_pcur_is_before_first_on_page(cursor) &&
+ !btr_pcur_is_after_last_on_page(cursor);
}
/*********************************************************//**
@@ -233,7 +154,7 @@ static inline bool btr_pcur_is_after_last_in_tree(btr_pcur_t* cursor)
/*********************************************************//**
Moves the persistent cursor to the next record on the same page. */
UNIV_INLINE
-void
+rec_t*
btr_pcur_move_to_next_on_page(
/*==========================*/
btr_pcur_t* cursor) /*!< in/out: persistent cursor */
@@ -241,25 +162,23 @@ btr_pcur_move_to_next_on_page(
ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
ut_ad(cursor->latch_mode != BTR_NO_LATCHES);
- page_cur_move_to_next(btr_pcur_get_page_cur(cursor));
-
- cursor->old_stored = false;
+ cursor->old_rec = nullptr;
+ return page_cur_move_to_next(btr_pcur_get_page_cur(cursor));
}
/*********************************************************//**
Moves the persistent cursor to the previous record on the same page. */
UNIV_INLINE
-void
+rec_t*
btr_pcur_move_to_prev_on_page(
/*==========================*/
btr_pcur_t* cursor) /*!< in/out: persistent cursor */
{
ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
ut_ad(cursor->latch_mode != BTR_NO_LATCHES);
+ cursor->old_rec = nullptr;
- page_cur_move_to_prev(btr_pcur_get_page_cur(cursor));
-
- cursor->old_stored = false;
+ return page_cur_move_to_prev(btr_pcur_get_page_cur(cursor));
}
/*********************************************************//**
@@ -276,16 +195,15 @@ btr_pcur_move_to_next_user_rec(
{
ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
ut_ad(cursor->latch_mode != BTR_NO_LATCHES);
- cursor->old_stored = false;
+ cursor->old_rec = nullptr;
loop:
if (btr_pcur_is_after_last_on_page(cursor)) {
- if (btr_pcur_is_after_last_in_tree(cursor)) {
+ if (btr_pcur_is_after_last_in_tree(cursor)
+ || btr_pcur_move_to_next_page(cursor, mtr) != DB_SUCCESS) {
return(FALSE);
}
-
- btr_pcur_move_to_next_page(cursor, mtr);
- } else {
- btr_pcur_move_to_next_on_page(cursor);
+ } else if (UNIV_UNLIKELY(!btr_pcur_move_to_next_on_page(cursor))) {
+ return false;
}
if (btr_pcur_is_on_user_rec(cursor)) {
@@ -308,22 +226,16 @@ btr_pcur_move_to_next(
function may release the page latch */
mtr_t* mtr) /*!< in: mtr */
{
- ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
- ut_ad(cursor->latch_mode != BTR_NO_LATCHES);
+ ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
+ ut_ad(cursor->latch_mode != BTR_NO_LATCHES);
- cursor->old_stored = false;
+ cursor->old_rec= nullptr;
- if (btr_pcur_is_after_last_on_page(cursor)) {
- if (btr_pcur_is_after_last_in_tree(cursor)) {
- return(FALSE);
- }
-
- btr_pcur_move_to_next_page(cursor, mtr);
- return(TRUE);
- }
-
- btr_pcur_move_to_next_on_page(cursor);
- return(TRUE);
+ if (btr_pcur_is_after_last_on_page(cursor))
+ return !btr_pcur_is_after_last_in_tree(cursor) &&
+ btr_pcur_move_to_next_page(cursor, mtr) == DB_SUCCESS;
+ else
+ return !!btr_pcur_move_to_next_on_page(cursor);
}
/**************************************************************//**
@@ -381,200 +293,33 @@ btr_pcur_init(
/*==========*/
btr_pcur_t* pcur) /*!< in: persistent cursor */
{
- pcur->old_stored = false;
pcur->old_rec_buf = NULL;
pcur->old_rec = NULL;
pcur->btr_cur.rtr_info = NULL;
}
-/** Free old_rec_buf.
-@param[in] pcur Persistent cursor holding old_rec to be freed. */
-UNIV_INLINE
-void
-btr_pcur_free(
- btr_pcur_t* pcur)
-{
- ut_free(pcur->old_rec_buf);
-}
-
-/**************************************************************//**
-Initializes and opens a persistent cursor to an index tree. It should be
-closed with btr_pcur_close. */
-UNIV_INLINE
-dberr_t
-btr_pcur_open_low(
-/*==============*/
- dict_index_t* index, /*!< in: index */
- ulint level, /*!< in: level in the btree */
- const dtuple_t* tuple, /*!< in: tuple on which search done */
- page_cur_mode_t mode, /*!< in: PAGE_CUR_L, ...;
- NOTE that if the search is made using a unique
- prefix of a record, mode should be
- PAGE_CUR_LE, not PAGE_CUR_GE, as the latter
- may end up on the previous page from the
- record! */
- ulint latch_mode,/*!< in: BTR_SEARCH_LEAF, ... */
- btr_pcur_t* cursor, /*!< in: memory buffer for persistent cursor */
- const char* file, /*!< in: file name */
- unsigned line, /*!< in: line where called */
- ib_uint64_t autoinc,/*!< in: PAGE_ROOT_AUTO_INC to be written
- (0 if none) */
- mtr_t* mtr) /*!< in: mtr */
-{
- btr_cur_t* btr_cursor;
- dberr_t err = DB_SUCCESS;
-
- /* Initialize the cursor */
-
- btr_pcur_init(cursor);
-
- cursor->latch_mode = BTR_LATCH_MODE_WITHOUT_FLAGS(latch_mode);
- cursor->search_mode = mode;
-
- /* Search with the tree cursor */
-
- btr_cursor = btr_pcur_get_btr_cur(cursor);
-
- ut_ad(!dict_index_is_spatial(index));
-
- err = btr_cur_search_to_nth_level(
- index, level, tuple, mode, latch_mode, btr_cursor,
- file, line, mtr, autoinc);
-
- if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
- ib::warn() << "btr_pcur_open_low"
- << " level: " << level
- << " called from file: "
- << file << " line: " << line
- << " table: " << index->table->name
- << " index: " << index->name
- << " error: " << err;
- }
-
- cursor->pos_state = BTR_PCUR_IS_POSITIONED;
-
- cursor->trx_if_known = NULL;
-
- return(err);
-}
-
/** Opens an persistent cursor to an index tree without initializing the
cursor.
-@param index index
@param tuple tuple on which search done
-@param mode PAGE_CUR_L, ...; NOTE that if the search is made using a
+@param mode search mode; NOTE that if the search is made using a
unique prefix of a record, mode should be PAGE_CUR_LE, not
PAGE_CUR_GE, as the latter may end up on the previous page of
the record!
-@param latch_mode BTR_SEARCH_LEAF, ...; NOTE that if ahi_latch then we might
- not acquire a cursor page latch, but assume that the
- ahi_latch protects the record!
+@param latch_mode BTR_SEARCH_LEAF, ...
@param cursor memory buffer for persistent cursor
-@param file file name
-@param line line where called
-@param mtr mtr
+@param mtr mini-transaction
@return DB_SUCCESS on success or error code otherwise. */
-UNIV_INLINE
-dberr_t btr_pcur_open_with_no_init_func(dict_index_t *index,
- const dtuple_t *tuple,
- page_cur_mode_t mode, ulint latch_mode,
- btr_pcur_t *cursor, const char *file,
- unsigned line, mtr_t *mtr)
-{
- btr_cur_t* btr_cursor;
- dberr_t err = DB_SUCCESS;
-
- cursor->latch_mode = BTR_LATCH_MODE_WITHOUT_INTENTION(latch_mode);
- cursor->search_mode = mode;
-
- /* Search with the tree cursor */
-
- btr_cursor = btr_pcur_get_btr_cur(cursor);
-
- err = btr_cur_search_to_nth_level(
- index, 0, tuple, mode, latch_mode, btr_cursor,
- file, line, mtr);
-
- cursor->pos_state = BTR_PCUR_IS_POSITIONED;
-
- cursor->old_stored = false;
-
- cursor->trx_if_known = NULL;
- return err;
-}
-
-/*****************************************************************//**
-Opens a persistent cursor at either end of an index. */
-UNIV_INLINE
-dberr_t
-btr_pcur_open_at_index_side(
-/*========================*/
- bool from_left, /*!< in: true if open to the low end,
- false if to the high end */
- dict_index_t* index, /*!< in: index */
- ulint latch_mode, /*!< in: latch mode */
- btr_pcur_t* pcur, /*!< in/out: cursor */
- bool init_pcur, /*!< in: whether to initialize pcur */
- ulint level, /*!< in: level to search for
- (0=leaf) */
- mtr_t* mtr) /*!< in/out: mini-transaction */
-{
- dberr_t err = DB_SUCCESS;
-
- pcur->latch_mode = BTR_LATCH_MODE_WITHOUT_FLAGS(latch_mode);
-
- pcur->search_mode = from_left ? PAGE_CUR_G : PAGE_CUR_L;
-
- if (init_pcur) {
- btr_pcur_init(pcur);
- }
-
- err = btr_cur_open_at_index_side(
- from_left, index, latch_mode,
- btr_pcur_get_btr_cur(pcur), level, mtr);
- pcur->pos_state = BTR_PCUR_IS_POSITIONED;
-
- pcur->old_stored = false;
-
- pcur->trx_if_known = NULL;
-
- return (err);
-}
-
-/**********************************************************************//**
-Positions a cursor at a randomly chosen position within a B-tree.
-@return true if the index is available and we have put the cursor, false
-if the index is unavailable */
-UNIV_INLINE
-bool
-btr_pcur_open_at_rnd_pos_func(
-/*==========================*/
- dict_index_t* index, /*!< in: index */
- ulint latch_mode, /*!< in: BTR_SEARCH_LEAF, ... */
- btr_pcur_t* cursor, /*!< in/out: B-tree pcur */
- const char* file, /*!< in: file name */
- unsigned line, /*!< in: line where called */
- mtr_t* mtr) /*!< in: mtr */
+inline
+dberr_t btr_pcur_open_with_no_init(const dtuple_t *tuple, page_cur_mode_t mode,
+ btr_latch_mode latch_mode,
+ btr_pcur_t *cursor, mtr_t *mtr)
{
- /* Initialize the cursor */
-
- cursor->latch_mode = latch_mode;
- cursor->search_mode = PAGE_CUR_G;
-
- btr_pcur_init(cursor);
-
- bool available;
-
- available = btr_cur_open_at_rnd_pos_func(index, latch_mode,
- btr_pcur_get_btr_cur(cursor),
- file, line, mtr);
- cursor->pos_state = BTR_PCUR_IS_POSITIONED;
- cursor->old_stored = false;
-
- cursor->trx_if_known = NULL;
-
- return(available);
+ cursor->latch_mode= BTR_LATCH_MODE_WITHOUT_INTENTION(latch_mode);
+ cursor->search_mode= mode;
+ cursor->pos_state= BTR_PCUR_IS_POSITIONED;
+ cursor->trx_if_known= nullptr;
+ return cursor->btr_cur.search_leaf(tuple, mode, latch_mode, mtr);
}
/**************************************************************//**
@@ -586,34 +331,28 @@ cursor is currently positioned. The latch is acquired by the
are not allowed, you must take care (if using the cursor in S-mode) to
manually release the latch by either calling
btr_leaf_page_release(btr_pcur_get_block(&pcur), pcur.latch_mode, mtr)
-or by committing the mini-transaction right after btr_pcur_close().
-A subsequent attempt to crawl the same page in the same mtr would cause
-an assertion failure. */
+or by mtr_t::commit(). */
UNIV_INLINE
void
btr_pcur_close(
/*===========*/
btr_pcur_t* cursor) /*!< in: persistent cursor */
{
- ut_free(cursor->old_rec_buf);
-
- if (cursor->btr_cur.rtr_info) {
- rtr_clean_rtr_info(cursor->btr_cur.rtr_info, true);
- cursor->btr_cur.rtr_info = NULL;
- }
+ ut_free(cursor->old_rec_buf);
- cursor->old_rec = NULL;
- cursor->old_rec_buf = NULL;
- cursor->btr_cur.page_cur.rec = NULL;
- cursor->btr_cur.page_cur.block = NULL;
+ if (cursor->btr_cur.rtr_info)
+ rtr_clean_rtr_info(cursor->btr_cur.rtr_info, true);
- cursor->old_rec = NULL;
- cursor->old_stored = false;
+ cursor->btr_cur.rtr_info= nullptr;
+ cursor->old_rec = nullptr;
+ cursor->old_rec_buf = nullptr;
+ cursor->btr_cur.page_cur.rec = nullptr;
+ cursor->btr_cur.page_cur.block = nullptr;
- cursor->latch_mode = BTR_NO_LATCHES;
- cursor->pos_state = BTR_PCUR_NOT_POSITIONED;
+ cursor->latch_mode = BTR_NO_LATCHES;
+ cursor->pos_state = BTR_PCUR_NOT_POSITIONED;
- cursor->trx_if_known = NULL;
+ cursor->trx_if_known = nullptr;
}
/*********************************************************//**
@@ -629,5 +368,5 @@ btr_pcur_move_before_first_on_page(
page_cur_set_before_first(btr_pcur_get_block(cursor),
btr_pcur_get_page_cur(cursor));
- cursor->old_stored = false;
+ cursor->old_rec = nullptr;
}
diff --git a/storage/innobase/include/btr0sea.h b/storage/innobase/include/btr0sea.h
index cd29e13f5bd..48e4fadab9b 100644
--- a/storage/innobase/include/btr0sea.h
+++ b/storage/innobase/include/btr0sea.h
@@ -30,7 +30,11 @@ Created 2/17/1996 Heikki Tuuri
#include "dict0dict.h"
#ifdef BTR_CUR_HASH_ADAPT
#include "ha0ha.h"
-#include "sync0sync.h"
+#include "srw_lock.h"
+
+#ifdef UNIV_PFS_RWLOCK
+extern mysql_pfs_key_t btr_search_latch_key;
+#endif /* UNIV_PFS_RWLOCK */
#define btr_search_sys_create() btr_search_sys.create()
#define btr_search_sys_free() btr_search_sys.free()
@@ -59,15 +63,9 @@ both have sensible values.
@param[in,out] info index search info
@param[in] tuple logical record
@param[in] mode PAGE_CUR_L, ....
-@param[in] latch_mode BTR_SEARCH_LEAF, ...;
- NOTE that only if has_search_latch is 0, we will
- have a latch set on the cursor page, otherwise
- we assume the caller uses his search latch
- to protect the record!
+@param[in] latch_mode BTR_SEARCH_LEAF, ...
@param[out] cursor tree cursor
-@param[in] ahi_latch the adaptive hash index latch being held,
- or NULL
-@param[in] mtr mini transaction
+@param[in] mtr mini-transaction
@return whether the search succeeded */
bool
btr_search_guess_on_hash(
@@ -111,8 +109,8 @@ void btr_search_drop_page_hash_when_freed(const page_id_t page_id);
using btr_cur_search_, and the new record has been
inserted next to the cursor.
@param[in] ahi_latch the adaptive hash index latch */
-void
-btr_search_update_hash_node_on_insert(btr_cur_t* cursor, rw_lock_t* ahi_latch);
+void btr_search_update_hash_node_on_insert(btr_cur_t *cursor,
+ srw_spin_lock *ahi_latch);
/** Updates the page hash index when a single record is inserted on a page.
@param[in,out] cursor cursor which was positioned to the
@@ -120,13 +118,13 @@ btr_search_update_hash_node_on_insert(btr_cur_t* cursor, rw_lock_t* ahi_latch);
and the new record has been inserted next
to the cursor
@param[in] ahi_latch the adaptive hash index latch */
-void
-btr_search_update_hash_on_insert(btr_cur_t* cursor, rw_lock_t* ahi_latch);
+void btr_search_update_hash_on_insert(btr_cur_t *cursor,
+ srw_spin_lock *ahi_latch);
/** Updates the page hash index when a single record is deleted from a page.
@param[in] cursor cursor which was positioned on the record to delete
using btr_cur_search_, the record is not yet deleted.*/
-void btr_search_update_hash_on_delete(btr_cur_t* cursor);
+void btr_search_update_hash_on_delete(btr_cur_t *cursor);
/** Validates the search system.
@return true if ok */
@@ -141,28 +139,13 @@ static inline void btr_search_x_unlock_all();
/** Lock all search latches in shared mode. */
static inline void btr_search_s_lock_all();
-#ifdef UNIV_DEBUG
-/** Check if thread owns all the search latches.
-@param[in] mode lock mode check
-@retval true if owns all of them
-@retval false if does not own some of them */
-static inline bool btr_search_own_all(ulint mode);
-
-/** Check if thread owns any of the search latches.
-@param[in] mode lock mode check
-@retval true if owns any of them
-@retval false if owns no search latch */
-static inline bool btr_search_own_any(ulint mode);
-
-/** @return whether this thread holds any of the search latches */
-static inline bool btr_search_own_any();
+/** Unlock all search latches from shared mode. */
+static inline void btr_search_s_unlock_all();
+# ifdef UNIV_DEBUG
/** @return if the index is marked as freed */
bool btr_search_check_marked_free_index(const buf_block_t *block);
-#endif /* UNIV_DEBUG */
-
-/** Unlock all search latches from shared mode. */
-static inline void btr_search_s_unlock_all();
+# endif /* UNIV_DEBUG */
#else /* BTR_CUR_HASH_ADAPT */
# define btr_search_sys_create()
# define btr_search_sys_free()
@@ -257,20 +240,30 @@ struct btr_search_sys_t
struct partition
{
/** latches protecting hash_table */
- rw_lock_t latch;
+ srw_spin_lock latch;
/** mapping of dtuple_fold() to rec_t* in buf_block_t::frame */
hash_table_t table;
/** memory heap for table */
mem_heap_t *heap;
- char pad[(CPU_LEVEL1_DCACHE_LINESIZE - sizeof(rw_lock_t) -
- sizeof(hash_table_t) - sizeof(mem_heap_t)) &
+#ifdef _MSC_VER
+#pragma warning(push)
+// nonstandard extension - zero sized array, if perfschema is not compiled
+#pragma warning(disable : 4200)
+#endif
+
+ char pad[(CPU_LEVEL1_DCACHE_LINESIZE - sizeof latch -
+ sizeof table - sizeof heap) &
(CPU_LEVEL1_DCACHE_LINESIZE - 1)];
+#ifdef _MSC_VER
+#pragma warning(pop)
+#endif
+
void init()
{
memset((void*) this, 0, sizeof *this);
- rw_lock_create(btr_search_latch_key, &latch, SYNC_SEARCH_SYS);
+ latch.SRW_LOCK_INIT(btr_search_latch_key);
}
void alloc(ulint hash_size)
@@ -292,7 +285,7 @@ struct btr_search_sys_t
void free()
{
- rw_lock_free(&latch);
+ latch.destroy();
if (heap)
clear();
}
@@ -316,7 +309,7 @@ struct btr_search_sys_t
}
/** Get the search latch for the adaptive hash index partition */
- rw_lock_t *get_latch(const dict_index_t &index) const
+ srw_spin_lock *get_latch(const dict_index_t &index) const
{ return &get_part(index)->latch; }
/** Create and initialize at startup */
@@ -357,14 +350,24 @@ struct btr_search_sys_t
extern btr_search_sys_t btr_search_sys;
/** @return number of leaf pages pointed to by the adaptive hash index */
-inline ulint dict_index_t::n_ahi_pages() const
+TRANSACTIONAL_INLINE inline ulint dict_index_t::n_ahi_pages() const
{
if (!btr_search_enabled)
return 0;
- rw_lock_t *latch = &btr_search_sys.get_part(*this)->latch;
- rw_lock_s_lock(latch);
+ srw_spin_lock *latch= &btr_search_sys.get_part(*this)->latch;
+#if !defined NO_ELISION && !defined SUX_LOCK_GENERIC
+ if (xbegin())
+ {
+ if (latch->is_locked())
+ xabort();
+ ulint ref_count= search_info->ref_count;
+ xend();
+ return ref_count;
+ }
+#endif
+ latch->rd_lock(SRW_LOCK_CALL);
ulint ref_count= search_info->ref_count;
- rw_lock_s_unlock(latch);
+ latch->rd_unlock();
return ref_count;
}
diff --git a/storage/innobase/include/btr0sea.inl b/storage/innobase/include/btr0sea.inl
index 40eb5d86ead..5a8d648029a 100644
--- a/storage/innobase/include/btr0sea.inl
+++ b/storage/innobase/include/btr0sea.inl
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 1996, 2015, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2018, 2020, MariaDB Corporation.
+Copyright (c) 2018, 2021, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -47,8 +47,7 @@ static inline btr_search_t* btr_search_info_create(mem_heap_t* heap)
/** Updates the search info.
@param[in,out] info search info
@param[in,out] cursor cursor which was just positioned */
-void
-btr_search_info_update_slow(btr_search_t* info, btr_cur_t* cursor);
+void btr_search_info_update_slow(btr_search_t *info, btr_cur_t *cursor);
/*********************************************************************//**
Updates the search info. */
@@ -59,10 +58,10 @@ btr_search_info_update(
dict_index_t* index, /*!< in: index of the cursor */
btr_cur_t* cursor) /*!< in: cursor which was just positioned */
{
- ut_ad(!btr_search_own_any(RW_LOCK_S));
- ut_ad(!btr_search_own_any(RW_LOCK_X));
+ ut_ad(!index->is_spatial());
+ ut_ad(!index->table->is_temporary());
- if (dict_index_is_spatial(index) || !btr_search_enabled) {
+ if (!btr_search_enabled) {
return;
}
@@ -88,7 +87,7 @@ btr_search_info_update(
static inline void btr_search_x_lock_all()
{
for (ulint i = 0; i < btr_ahi_parts; ++i) {
- rw_lock_x_lock(&btr_search_sys.parts[i].latch);
+ btr_search_sys.parts[i].latch.wr_lock(SRW_LOCK_CALL);
}
}
@@ -96,7 +95,7 @@ static inline void btr_search_x_lock_all()
static inline void btr_search_x_unlock_all()
{
for (ulint i = 0; i < btr_ahi_parts; ++i) {
- rw_lock_x_unlock(&btr_search_sys.parts[i].latch);
+ btr_search_sys.parts[i].latch.wr_unlock();
}
}
@@ -104,7 +103,7 @@ static inline void btr_search_x_unlock_all()
static inline void btr_search_s_lock_all()
{
for (ulint i = 0; i < btr_ahi_parts; ++i) {
- rw_lock_s_lock(&btr_search_sys.parts[i].latch);
+ btr_search_sys.parts[i].latch.rd_lock(SRW_LOCK_CALL);
}
}
@@ -112,49 +111,7 @@ static inline void btr_search_s_lock_all()
static inline void btr_search_s_unlock_all()
{
for (ulint i = 0; i < btr_ahi_parts; ++i) {
- rw_lock_s_unlock(&btr_search_sys.parts[i].latch);
- }
-}
-
-#ifdef UNIV_DEBUG
-/** Check if thread owns all the search latches.
-@param[in] mode lock mode check
-@retval true if owns all of them
-@retval false if does not own some of them */
-static inline bool btr_search_own_all(ulint mode)
-{
- for (ulint i = 0; i < btr_ahi_parts; ++i) {
- if (!rw_lock_own(&btr_search_sys.parts[i].latch, mode)) {
- return(false);
- }
- }
- return(true);
-}
-
-/** Check if thread owns any of the search latches.
-@param[in] mode lock mode check
-@retval true if owns any of them
-@retval false if owns no search latch */
-static inline bool btr_search_own_any(ulint mode)
-{
- for (ulint i = 0; i < btr_ahi_parts; ++i) {
- if (rw_lock_own(&btr_search_sys.parts[i].latch, mode)) {
- return(true);
- }
- }
- return(false);
-}
-
-/** @return whether this thread holds any of the search latches */
-static inline bool btr_search_own_any()
-{
- for (ulint i = btr_ahi_parts; i--; ) {
- if (rw_lock_own_flagged(&btr_search_sys.parts[i].latch,
- RW_LOCK_FLAG_X | RW_LOCK_FLAG_S)) {
- return true;
- }
+ btr_search_sys.parts[i].latch.rd_unlock();
}
- return false;
}
-#endif /* UNIV_DEBUG */
#endif /* BTR_CUR_HASH_ADAPT */
diff --git a/storage/innobase/include/btr0types.h b/storage/innobase/include/btr0types.h
index 83c374e2561..fc829e7857a 100644
--- a/storage/innobase/include/btr0types.h
+++ b/storage/innobase/include/btr0types.h
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 1996, 2015, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2018, 2019, MariaDB Corporation.
+Copyright (c) 2018, 2023, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -24,8 +24,7 @@ The index tree general types
Created 2/17/1996 Heikki Tuuri
*************************************************************************/
-#ifndef btr0types_h
-#define btr0types_h
+#pragma once
#include "page0types.h"
#include "rem0types.h"
@@ -56,4 +55,100 @@ in the index record. */
#define BTR_EXTERN_LOCAL_STORED_MAX_SIZE \
(BTR_EXTERN_FIELD_REF_SIZE * 2)
-#endif
+/** Latching modes for btr_cur_t::search_leaf(). */
+enum btr_latch_mode {
+ /** Search a record on a leaf page and S-latch it. */
+ BTR_SEARCH_LEAF = RW_S_LATCH,
+ /** (Prepare to) modify a record on a leaf page and X-latch it. */
+ BTR_MODIFY_LEAF = RW_X_LATCH,
+ /** U-latch root and X-latch a leaf page */
+ BTR_MODIFY_ROOT_AND_LEAF = RW_SX_LATCH,
+ /** Obtain no latches. */
+ BTR_NO_LATCHES = RW_NO_LATCH,
+ /** Search the previous record.
+ Used in btr_pcur_move_backward_from_page(). */
+ BTR_SEARCH_PREV = 4 | BTR_SEARCH_LEAF,
+ /** Modify the previous record.
+ Used in btr_pcur_move_backward_from_page() and ibuf_insert(). */
+ BTR_MODIFY_PREV = 4 | BTR_MODIFY_LEAF,
+ /** Start modifying the entire B-tree. */
+ BTR_MODIFY_TREE = 8 | BTR_MODIFY_LEAF,
+ /** Continue modifying the entire R-tree.
+ Only used by rtr_search_to_nth_level(). */
+ BTR_CONT_MODIFY_TREE = 4 | BTR_MODIFY_TREE,
+
+ /* BTR_INSERT, BTR_DELETE and BTR_DELETE_MARK are mutually
+ exclusive. */
+ /** The search tuple will be inserted to the secondary index
+ at the searched position. When the leaf page is not in the
+ buffer pool, try to use the change buffer. */
+ BTR_INSERT = 64,
+
+ /** Try to delete mark a secondary index leaf page record at
+ the searched position using the change buffer when the page is
+ not in the buffer pool. */
+ BTR_DELETE_MARK = 128,
+
+ /** Try to purge the record using the change buffer when the
+ secondary index leaf page is not in the buffer pool. */
+ BTR_DELETE = BTR_INSERT | BTR_DELETE_MARK,
+
+ /** The caller is already holding dict_index_t::lock S-latch. */
+ BTR_ALREADY_S_LATCHED = 256,
+ /** Search and S-latch a leaf page, assuming that the
+ dict_index_t::lock S-latch is being held. */
+ BTR_SEARCH_LEAF_ALREADY_S_LATCHED = BTR_SEARCH_LEAF
+ | BTR_ALREADY_S_LATCHED,
+ /** Search and X-latch a leaf page, assuming that the
+ dict_index_t::lock is being held in non-exclusive mode. */
+ BTR_MODIFY_LEAF_ALREADY_LATCHED = BTR_MODIFY_LEAF
+ | BTR_ALREADY_S_LATCHED,
+ /** Attempt to modify records in an x-latched tree. */
+ BTR_MODIFY_TREE_ALREADY_LATCHED = BTR_MODIFY_TREE
+ | BTR_ALREADY_S_LATCHED,
+ /** U-latch root and X-latch a leaf page, assuming that
+ dict_index_t::lock is being held in U mode. */
+ BTR_MODIFY_ROOT_AND_LEAF_ALREADY_LATCHED = BTR_MODIFY_ROOT_AND_LEAF
+ | BTR_ALREADY_S_LATCHED,
+
+ /** Attempt to delete-mark a secondary index record. */
+ BTR_DELETE_MARK_LEAF = BTR_MODIFY_LEAF | BTR_DELETE_MARK,
+ /** Attempt to delete-mark a secondary index record
+ while holding the dict_index_t::lock S-latch. */
+ BTR_DELETE_MARK_LEAF_ALREADY_S_LATCHED = BTR_DELETE_MARK_LEAF
+ | BTR_ALREADY_S_LATCHED,
+ /** Attempt to purge a secondary index record. */
+ BTR_PURGE_LEAF = BTR_MODIFY_LEAF | BTR_DELETE,
+ /** Attempt to purge a secondary index record
+ while holding the dict_index_t::lock S-latch. */
+ BTR_PURGE_LEAF_ALREADY_S_LATCHED = BTR_PURGE_LEAF
+ | BTR_ALREADY_S_LATCHED,
+
+ /** In the case of BTR_MODIFY_TREE, the caller specifies
+ the intention to delete record only. It is used to optimize
+ block->lock range.*/
+ BTR_LATCH_FOR_DELETE = 512,
+
+ /** In the case of BTR_MODIFY_TREE, the caller specifies
+ the intention to delete record only. It is used to optimize
+ block->lock range.*/
+ BTR_LATCH_FOR_INSERT = 1024,
+
+ /** Attempt to delete a record in the tree. */
+ BTR_PURGE_TREE = BTR_MODIFY_TREE | BTR_LATCH_FOR_DELETE,
+ /** Attempt to delete a record in an x-latched tree. */
+ BTR_PURGE_TREE_ALREADY_LATCHED = BTR_PURGE_TREE
+ | BTR_ALREADY_S_LATCHED,
+
+ /** Attempt to insert a record into the tree. */
+ BTR_INSERT_TREE = BTR_MODIFY_TREE | BTR_LATCH_FOR_INSERT,
+
+ /** This flag ORed to BTR_INSERT says that we can ignore possible
+ UNIQUE definition on secondary indexes when we decide if we can use
+ the insert buffer to speed up inserts */
+ BTR_IGNORE_SEC_UNIQUE = 2048,
+ /** Rollback in spatial index */
+ BTR_RTREE_UNDO_INS = 4096,
+ /** Try to delete mark a spatial index record */
+ BTR_RTREE_DELETE_MARK = 8192
+};
diff --git a/storage/innobase/include/buf0block_hint.h b/storage/innobase/include/buf0block_hint.h
index ee48e7ce6d2..d4fee7c1e99 100644
--- a/storage/innobase/include/buf0block_hint.h
+++ b/storage/innobase/include/buf0block_hint.h
@@ -56,7 +56,7 @@ public:
buf_block_t *block= m_block;
bool res= f(block);
if (block)
- buf_block_buf_fix_dec(block);
+ block->page.unfix();
return res;
}
diff --git a/storage/innobase/include/buf0buf.h b/storage/innobase/include/buf0buf.h
index a84ea047a54..2b4732a64a0 100644
--- a/storage/innobase/include/buf0buf.h
+++ b/storage/innobase/include/buf0buf.h
@@ -24,40 +24,30 @@ The database buffer pool high-level routines
Created 11/5/1995 Heikki Tuuri
*******************************************************/
-#ifndef buf0buf_h
-#define buf0buf_h
+#pragma once
/** Magic value to use instead of checksums when they are disabled */
#define BUF_NO_CHECKSUM_MAGIC 0xDEADBEEFUL
#include "fil0fil.h"
#include "mtr0types.h"
-#include "buf0types.h"
#include "span.h"
#include "assume_aligned.h"
+#include "buf0types.h"
#ifndef UNIV_INNOCHECKSUM
-#include "hash0hash.h"
#include "ut0byte.h"
#include "page0types.h"
#include "log0log.h"
#include "srv0srv.h"
+#include "transactional_lock_guard.h"
#include <ostream>
-// Forward declaration
-struct fil_addr_t;
-
/** @name Modes for buf_page_get_gen */
/* @{ */
#define BUF_GET 10 /*!< get always */
#define BUF_GET_IF_IN_POOL 11 /*!< get if in pool */
#define BUF_PEEK_IF_IN_POOL 12 /*!< get if in pool, do not make
the block young in the LRU list */
-#define BUF_GET_NO_LATCH 14 /*!< get and bufferfix, but
- set no latch; we have
- separated this case, because
- it is error-prone programming
- not to set a latch, and it
- should be used with care */
#define BUF_GET_IF_IN_POOL_OR_WATCH 15
/*!< Get the page only if it's in the
buffer pool, if not then set a watch
@@ -65,7 +55,6 @@ struct fil_addr_t;
#define BUF_GET_POSSIBLY_FREED 16
/*!< Like BUF_GET, but do not mind
if the file page has been freed. */
-#define BUF_EVICT_IF_IN_POOL 20 /*!< evict a clean block if found */
/* @} */
/** If LRU list of a buf_pool is less than this size then LRU eviction
@@ -74,22 +63,6 @@ the blocks on free list. If LRU list is very small then we can end up
in thrashing. */
#define BUF_LRU_MIN_LEN 256
-/** buf_page_t::state() values, distinguishing buf_page_t and buf_block_t */
-enum buf_page_state
-{
- /** available in buf_pool.free or buf_pool.watch */
- BUF_BLOCK_NOT_USED,
- /** allocated for something else than a file page */
- BUF_BLOCK_MEMORY,
- /** a previously allocated file page, in transit to NOT_USED */
- BUF_BLOCK_REMOVE_HASH,
- /** a buf_block_t that is also in buf_pool.LRU */
- BUF_BLOCK_FILE_PAGE,
- /** the buf_page_t of a ROW_FORMAT=COMPRESSED page
- whose uncompressed page frame has been evicted */
- BUF_BLOCK_ZIP_PAGE
-};
-
/** This structure defines information we will fetch from each buffer pool. It
will be used to print table IO stats */
struct buf_pool_info_t
@@ -170,33 +143,10 @@ operator<<(
const page_id_t page_id);
#ifndef UNIV_INNOCHECKSUM
-/*********************************************************************//**
-Gets the current size of buffer buf_pool in bytes.
-@return size in bytes */
-UNIV_INLINE
-ulint
-buf_pool_get_curr_size(void);
-/*========================*/
-
-/********************************************************************//**
-Allocates a buf_page_t descriptor. This function must succeed. In case
-of failure we assert in this function. */
-UNIV_INLINE
-buf_page_t*
-buf_page_alloc_descriptor(void)
-/*===========================*/
- MY_ATTRIBUTE((malloc));
-/********************************************************************//**
-Free a buf_page_t descriptor. */
-UNIV_INLINE
-void
-buf_page_free_descriptor(
-/*=====================*/
- buf_page_t* bpage) /*!< in: bpage descriptor to free. */
- MY_ATTRIBUTE((nonnull));
+# define buf_pool_get_curr_size() srv_buf_pool_curr_size
/** Allocate a buffer block.
-@return own: the allocated block, in state BUF_BLOCK_MEMORY */
+@return own: the allocated block, state()==MEMORY */
inline buf_block_t *buf_block_alloc();
/********************************************************************//**
Frees a buffer block which does not contain a file page. */
@@ -206,71 +156,37 @@ buf_block_free(
/*===========*/
buf_block_t* block); /*!< in, own: block to be freed */
-/**************************************************************//**
-NOTE! The following macros should be used instead of buf_page_get_gen,
-to improve debugging. Only values RW_S_LATCH and RW_X_LATCH are allowed
-in LA! */
#define buf_page_get(ID, SIZE, LA, MTR) \
- buf_page_get_gen(ID, SIZE, LA, NULL, BUF_GET, __FILE__, __LINE__, MTR)
-
-/**************************************************************//**
-Use these macros to bufferfix a page with no latching. Remember not to
-read the contents of the page unless you know it is safe. Do not modify
-the contents of the page! We have separated this case, because it is
-error-prone programming not to set a latch, and it should be used
-with care. */
-#define buf_page_get_with_no_latch(ID, SIZE, MTR) \
- buf_page_get_gen(ID, SIZE, RW_NO_LATCH, NULL, BUF_GET_NO_LATCH, \
- __FILE__, __LINE__, MTR)
-/********************************************************************//**
-This is the general function used to get optimistic access to a database
-page.
-@return TRUE if success */
-ibool
-buf_page_optimistic_get(
-/*====================*/
- ulint rw_latch,/*!< in: RW_S_LATCH, RW_X_LATCH */
- buf_block_t* block, /*!< in: guessed block */
- ib_uint64_t modify_clock,/*!< in: modify clock value */
- const char* file, /*!< in: file name */
- unsigned line, /*!< in: line where called */
- mtr_t* mtr); /*!< in: mini-transaction */
-
-/** Given a tablespace id and page number tries to get that page. If the
-page is not in the buffer pool it is not loaded and NULL is returned.
-Suitable for using when holding the lock_sys_t::mutex.
-@param[in] page_id page id
-@param[in] file file name
-@param[in] line line where called
-@param[in] mtr mini-transaction
-@return pointer to a page or NULL */
-buf_block_t*
-buf_page_try_get_func(
- const page_id_t page_id,
- const char* file,
- unsigned line,
- mtr_t* mtr);
-
-/** Tries to get a page.
-If the page is not in the buffer pool it is not loaded. Suitable for using
-when holding the lock_sys_t::mutex.
+ buf_page_get_gen(ID, SIZE, LA, NULL, BUF_GET, MTR)
+
+/** Try to acquire a page latch.
+@param rw_latch RW_S_LATCH or RW_X_LATCH
+@param block guessed block
+@param modify_clock expected value of block->modify_clock
+@param mtr mini-transaction
+@return whether the latch was acquired (the page is an allocated file page) */
+bool buf_page_optimistic_get(ulint rw_latch, buf_block_t *block,
+ uint64_t modify_clock, mtr_t *mtr);
+
+/** Try to S-latch a page.
+Suitable for using when holding the lock_sys latches (as it avoids deadlock).
@param[in] page_id page identifier
-@param[in] mtr mini-transaction
-@return the page if in buffer pool, NULL if not */
-#define buf_page_try_get(page_id, mtr) \
- buf_page_try_get_func((page_id), __FILE__, __LINE__, mtr);
+@param[in,out] mtr mini-transaction
+@return the block
+@retval nullptr if an S-latch cannot be granted immediately */
+buf_block_t *buf_page_try_get(const page_id_t page_id, mtr_t *mtr);
/** Get read access to a compressed page (usually of type
FIL_PAGE_TYPE_ZBLOB or FIL_PAGE_TYPE_ZBLOB2).
-The page must be released with buf_page_release_zip().
+The page must be released with unfix().
NOTE: the page is not protected by any latch. Mutual exclusion has to
be implemented at a higher level. In other words, all possible
accesses to a given page through this function must be protected by
the same set of mutexes or latches.
-@param[in] page_id page id
-@param[in] zip_size ROW_FORMAT=COMPRESSED page size
-@return pointer to the block */
-buf_page_t* buf_page_get_zip(const page_id_t page_id, ulint zip_size);
+@param page_id page identifier
+@param zip_size ROW_FORMAT=COMPRESSED page size in bytes
+@return pointer to the block, s-latched */
+buf_page_t *buf_page_get_zip(const page_id_t page_id, ulint zip_size);
/** Get access to a database page. Buffered redo log may be applied.
@param[in] page_id page id
@@ -278,10 +194,8 @@ buf_page_t* buf_page_get_zip(const page_id_t page_id, ulint zip_size);
@param[in] rw_latch RW_S_LATCH, RW_X_LATCH, RW_NO_LATCH
@param[in] guess guessed block or NULL
@param[in] mode BUF_GET, BUF_GET_IF_IN_POOL,
-BUF_PEEK_IF_IN_POOL, BUF_GET_NO_LATCH, or BUF_GET_IF_IN_POOL_OR_WATCH
-@param[in] file file name
-@param[in] line line where called
-@param[in] mtr mini-transaction
+BUF_PEEK_IF_IN_POOL, or BUF_GET_IF_IN_POOL_OR_WATCH
+@param[in,out] mtr mini-transaction
@param[out] err DB_SUCCESS or error code
@param[in] allow_ibuf_merge Allow change buffer merge while
reading the pages from file.
@@ -293,11 +207,10 @@ buf_page_get_gen(
ulint rw_latch,
buf_block_t* guess,
ulint mode,
- const char* file,
- unsigned line,
mtr_t* mtr,
dberr_t* err = NULL,
- bool allow_ibuf_merge = false);
+ bool allow_ibuf_merge = false)
+ MY_ATTRIBUTE((nonnull(6), warn_unused_result));
/** This is the low level function used to get access to a database page.
@param[in] page_id page id
@@ -305,10 +218,9 @@ buf_page_get_gen(
@param[in] rw_latch RW_S_LATCH, RW_X_LATCH, RW_NO_LATCH
@param[in] guess guessed block or NULL
@param[in] mode BUF_GET, BUF_GET_IF_IN_POOL,
-BUF_PEEK_IF_IN_POOL, BUF_GET_NO_LATCH, or BUF_GET_IF_IN_POOL_OR_WATCH
-@param[in] file file name
-@param[in] line line where called
-@param[in] mtr mini-transaction
+BUF_PEEK_IF_IN_POOL, or BUF_GET_IF_IN_POOL_OR_WATCH
+@param[in,out] mtr mini-transaction, or NULL if a
+ block with page_id is to be evicted
@param[out] err DB_SUCCESS or error code
@param[in] allow_ibuf_merge Allow change buffer merge to happen
while reading the page from file
@@ -322,16 +234,14 @@ buf_page_get_low(
ulint rw_latch,
buf_block_t* guess,
ulint mode,
- const char* file,
- unsigned line,
mtr_t* mtr,
dberr_t* err,
bool allow_ibuf_merge);
/** Initialize a page in the buffer pool. The page is usually not read
from a file even if it cannot be found in the buffer buf_pool. This is one
-of the functions which perform to a block a state transition NOT_USED =>
-FILE_PAGE (the other is buf_page_get_gen).
+of the functions which perform to a block a state transition NOT_USED => LRU
+(the other is buf_page_get_low()).
@param[in,out] space space object
@param[in] offset offset of the tablespace
@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
@@ -342,52 +252,24 @@ buf_block_t*
buf_page_create(fil_space_t *space, uint32_t offset,
ulint zip_size, mtr_t *mtr, buf_block_t *free_block);
-/********************************************************************//**
-Releases a compressed-only page acquired with buf_page_get_zip(). */
-UNIV_INLINE
-void
-buf_page_release_zip(
-/*=================*/
- buf_page_t* bpage); /*!< in: buffer block */
-/********************************************************************//**
-Releases a latch, if specified. */
-UNIV_INLINE
-void
-buf_page_release_latch(
-/*=====================*/
- buf_block_t* block, /*!< in: buffer block */
- ulint rw_latch); /*!< in: RW_S_LATCH, RW_X_LATCH,
- RW_NO_LATCH */
+/** Initialize a page in buffer pool while initializing the
+deferred tablespace
+@param space_id space identfier
+@param zip_size ROW_FORMAT=COMPRESSED page size or 0
+@param mtr mini-transaction
+@param free_block pre-allocated buffer block
+@return pointer to the block, page bufferfixed */
+buf_block_t*
+buf_page_create_deferred(uint32_t space_id, ulint zip_size, mtr_t *mtr,
+ buf_block_t *free_block);
+
/** Move a block to the start of the LRU list. */
void buf_page_make_young(buf_page_t *bpage);
-/** Mark the page status as FREED for the given tablespace id and
-page number. If the page is not in buffer pool then ignore it.
+/** Mark the page status as FREED for the given tablespace and page number.
@param[in,out] space tablespace
@param[in] page page number
-@param[in,out] mtr mini-transaction
-@param[in] file file name
-@param[in] line line where called */
-void buf_page_free(fil_space_t *space, uint32_t page, mtr_t *mtr,
- const char *file, unsigned line);
-
-/********************************************************************//**
-Reads the freed_page_clock of a buffer block.
-@return freed_page_clock */
-UNIV_INLINE
-unsigned
-buf_page_get_freed_page_clock(
-/*==========================*/
- const buf_page_t* bpage) /*!< in: block */
- MY_ATTRIBUTE((warn_unused_result));
-/********************************************************************//**
-Reads the freed_page_clock of a buffer block.
-@return freed_page_clock */
-UNIV_INLINE
-unsigned
-buf_block_get_freed_page_clock(
-/*===========================*/
- const buf_block_t* block) /*!< in: block */
- MY_ATTRIBUTE((warn_unused_result));
+@param[in,out] mtr mini-transaction */
+void buf_page_free(fil_space_t *space, uint32_t page, mtr_t *mtr);
/** Determine if a block is still close enough to the MRU end of the LRU list
meaning that it is not in danger of getting evicted and also implying
@@ -431,32 +313,6 @@ ib_uint64_t
buf_block_get_modify_clock(
/*=======================*/
buf_block_t* block); /*!< in: block */
-/*******************************************************************//**
-Increments the bufferfix count. */
-UNIV_INLINE
-void
-buf_block_buf_fix_inc_func(
-/*=======================*/
-# ifdef UNIV_DEBUG
- const char* file, /*!< in: file name */
- unsigned line, /*!< in: line */
-# endif /* UNIV_DEBUG */
- buf_block_t* block) /*!< in/out: block to bufferfix */
- MY_ATTRIBUTE((nonnull));
-
-# ifdef UNIV_DEBUG
-/** Increments the bufferfix count.
-@param[in,out] b block to bufferfix
-@param[in] f file name where requested
-@param[in] l line number where requested */
-# define buf_block_buf_fix_inc(b,f,l) buf_block_buf_fix_inc_func(f,l,b)
-# else /* UNIV_DEBUG */
-/** Increments the bufferfix count.
-@param[in,out] b block to bufferfix
-@param[in] f file name where requested
-@param[in] l line number where requested */
-# define buf_block_buf_fix_inc(b,f,l) buf_block_buf_fix_inc_func(b)
-# endif /* UNIV_DEBUG */
#endif /* !UNIV_INNOCHECKSUM */
/** Check if a buffer is all zeroes.
@@ -464,42 +320,6 @@ buf_block_buf_fix_inc_func(
@return whether the buffer is all zeroes */
bool buf_is_zeroes(st_::span<const byte> buf);
-/** Checks if the page is in crc32 checksum format.
-@param[in] read_buf database page
-@param[in] checksum_field1 new checksum field
-@param[in] checksum_field2 old checksum field
-@return true if the page is in crc32 checksum format. */
-bool
-buf_page_is_checksum_valid_crc32(
- const byte* read_buf,
- ulint checksum_field1,
- ulint checksum_field2)
- MY_ATTRIBUTE((nonnull(1), warn_unused_result));
-
-/** Checks if the page is in innodb checksum format.
-@param[in] read_buf database page
-@param[in] checksum_field1 new checksum field
-@param[in] checksum_field2 old checksum field
-@return true if the page is in innodb checksum format. */
-bool
-buf_page_is_checksum_valid_innodb(
- const byte* read_buf,
- ulint checksum_field1,
- ulint checksum_field2)
- MY_ATTRIBUTE((nonnull(1), warn_unused_result));
-
-/** Checks if the page is in none checksum format.
-@param[in] read_buf database page
-@param[in] checksum_field1 new checksum field
-@param[in] checksum_field2 old checksum field
-@return true if the page is in none checksum format. */
-bool
-buf_page_is_checksum_valid_none(
- const byte* read_buf,
- ulint checksum_field1,
- ulint checksum_field2)
- MY_ATTRIBUTE((nonnull(1), warn_unused_result));
-
/** Check if a page is corrupt.
@param[in] check_lsn whether the LSN should be checked
@param[in] read_buf database page
@@ -512,27 +332,6 @@ buf_page_is_corrupted(
ulint fsp_flags)
MY_ATTRIBUTE((warn_unused_result));
-inline void *aligned_malloc(size_t size, size_t align)
-{
-#ifdef _MSC_VER
- return _aligned_malloc(size, align);
-#else
- void *result;
- if (posix_memalign(&result, align, size))
- result= NULL;
- return result;
-#endif
-}
-
-inline void aligned_free(void *ptr)
-{
-#ifdef _MSC_VER
- _aligned_free(ptr);
-#else
- free(ptr);
-#endif
-}
-
/** Read the key version from the page. In full crc32 format,
key version is stored at {0-3th} bytes. In other format, it is
stored in 26th position.
@@ -631,35 +430,7 @@ void buf_pool_invalidate();
--------------------------- LOWER LEVEL ROUTINES -------------------------
=========================================================================*/
-#ifdef UNIV_DEBUG
-/*********************************************************************//**
-Adds latch level info for the rw-lock protecting the buffer frame. This
-should be called in the debug version after a successful latching of a
-page if we know the latching order level of the acquired latch. */
-UNIV_INLINE
-void
-buf_block_dbg_add_level(
-/*====================*/
- buf_block_t* block, /*!< in: buffer page
- where we have acquired latch */
- latch_level_t level); /*!< in: latching order level */
-#else /* UNIV_DEBUG */
-# define buf_block_dbg_add_level(block, level) /* nothing */
-#endif /* UNIV_DEBUG */
-
-#ifdef UNIV_DEBUG
-/*********************************************************************//**
-Gets a pointer to the memory frame of a block.
-@return pointer to the frame */
-UNIV_INLINE
-buf_frame_t*
-buf_block_get_frame(
-/*================*/
- const buf_block_t* block) /*!< in: pointer to the control block */
- MY_ATTRIBUTE((warn_unused_result));
-#else /* UNIV_DEBUG */
-# define buf_block_get_frame(block) (block)->frame
-#endif /* UNIV_DEBUG */
+#define buf_block_get_frame(block) (block)->page.frame
/*********************************************************************//**
Gets the compressed page descriptor corresponding to an uncompressed page
@@ -672,18 +443,8 @@ if applicable. */
/** Monitor the buffer page read/write activity, and increment corresponding
counter value in MONITOR_MODULE_BUF_PAGE.
@param bpage buffer page whose read or write was completed
-@param io_type BUF_IO_READ or BUF_IO_WRITE */
-ATTRIBUTE_COLD __attribute__((nonnull))
-void buf_page_monitor(const buf_page_t *bpage, buf_io_fix io_type);
-
-/** Complete a read request of a file page to buf_pool.
-@param bpage recently read page
-@param node data file
-@return whether the operation succeeded
-@retval DB_SUCCESS always when writing, or if a read page was OK
-@retval DB_PAGE_CORRUPTED if the checksum fails on a page read
-@retval DB_DECRYPTION_FAILED if the page cannot be decrypted */
-dberr_t buf_page_read_complete(buf_page_t *bpage, const fil_node_t &node);
+@param read true=read, false=write */
+ATTRIBUTE_COLD void buf_page_monitor(const buf_page_t &bpage, bool read);
/** Calculate aligned buffer pool size based on srv_buf_pool_chunk_unit,
if needed.
@@ -752,17 +513,17 @@ class buf_page_t
{
friend buf_pool_t;
friend buf_block_t;
+
/** @name General fields */
/* @{ */
public: // FIXME: fix fil_iterate()
- /** Page id. Protected by buf_pool.hash_lock_get(id) when
+ /** Page id. Protected by buf_pool.page_hash.lock_get() when
the page is in buf_pool.page_hash. */
page_id_t id_;
+ /** buf_pool.page_hash link; protected by buf_pool.page_hash.lock_get() */
+ buf_page_t *hash;
private:
- /** Count of how manyfold this block is currently bufferfixed. */
- Atomic_counter<uint32_t> buf_fix_count_;
-
/** log sequence number of the START of the log entry written of the
oldest modification to this block which has not yet been written
to the data file;
@@ -773,53 +534,64 @@ private:
(because id().space() is the temporary tablespace). */
Atomic_relaxed<lsn_t> oldest_modification_;
- /** type of pending I/O operation; protected by buf_pool.mutex
- if in_LRU_list */
- Atomic_relaxed<buf_io_fix> io_fix_;
- /** Block state. @see in_file().
- State transitions between in_file() states and to
- BUF_BLOCK_REMOVE_HASH are protected by buf_pool.hash_lock_get(id)
- when the block is in buf_pool.page_hash.
- Other transitions when in_LRU_list are protected by buf_pool.mutex. */
- buf_page_state state_;
-
public:
- /** buf_pool.page_hash link; protected by buf_pool.hash_lock_get(id) */
- buf_page_t *hash;
+ /** state() of unused block (in buf_pool.free list) */
+ static constexpr uint32_t NOT_USED= 0;
+ /** state() of block allocated as general-purpose memory */
+ static constexpr uint32_t MEMORY= 1;
+ /** state() of block that is being freed */
+ static constexpr uint32_t REMOVE_HASH= 2;
+ /** smallest state() of a buffer page that is freed in the tablespace */
+ static constexpr uint32_t FREED= 3;
+ /** smallest state() for a block that belongs to buf_pool.LRU */
+ static constexpr uint32_t UNFIXED= 1U << 29;
+ /** smallest state() of a block for which buffered changes may exist */
+ static constexpr uint32_t IBUF_EXIST= 2U << 29;
+ /** smallest state() of a (re)initialized page (no doublewrite needed) */
+ static constexpr uint32_t REINIT= 3U << 29;
+ /** smallest state() for an io-fixed block */
+ static constexpr uint32_t READ_FIX= 4U << 29;
+ /** smallest state() for a write-fixed block */
+ static constexpr uint32_t WRITE_FIX= 5U << 29;
+ /** smallest state() for a write-fixed block with buffered changes */
+ static constexpr uint32_t WRITE_FIX_IBUF= 6U << 29;
+ /** smallest state() for a write-fixed block (no doublewrite was used) */
+ static constexpr uint32_t WRITE_FIX_REINIT= 7U << 29;
+ /** buf_pool.LRU status mask in state() */
+ static constexpr uint32_t LRU_MASK= 7U << 29;
+
+ /** lock covering the contents of frame */
+ block_lock lock;
+ /** pointer to aligned, uncompressed page frame of innodb_page_size */
+ byte *frame;
/* @} */
- page_zip_des_t zip; /*!< compressed page; zip.data
- (but not the data it points to) is
- also protected by buf_pool.mutex;
- state == BUF_BLOCK_ZIP_PAGE and
- zip.data == NULL means an active
- buf_pool.watch */
-
- buf_tmp_buffer_t* slot; /*!< Slot for temporary memory
- used for encryption/compression
- or NULL */
+ /** ROW_FORMAT=COMPRESSED page; zip.data (but not the data it points to)
+ is also protected by buf_pool.mutex;
+ !frame && !zip.data means an active buf_pool.watch */
+ page_zip_des_t zip;
#ifdef UNIV_DEBUG
/** whether this->list is in buf_pool.zip_hash; protected by buf_pool.mutex */
bool in_zip_hash;
- /** whether this->LRU is in buf_pool.LRU (in_file() holds);
+ /** whether this->LRU is in buf_pool.LRU (in_file());
protected by buf_pool.mutex */
bool in_LRU_list;
- /** whether this is in buf_pool.page_hash (in_file() holds);
+ /** whether this is in buf_pool.page_hash (in_file());
protected by buf_pool.mutex */
bool in_page_hash;
- /** whether this->list is in buf_pool.free (state() == BUF_BLOCK_NOT_USED);
+ /** whether this->list is in buf_pool.free (state() == NOT_USED);
protected by buf_pool.flush_list_mutex */
bool in_free_list;
#endif /* UNIV_DEBUG */
/** list member in one of the lists of buf_pool; protected by
buf_pool.mutex or buf_pool.flush_list_mutex
- state() == BUF_BLOCK_NOT_USED: buf_pool.free or buf_pool.withdraw
+ state() == NOT_USED: buf_pool.free or buf_pool.withdraw
in_file() && oldest_modification():
buf_pool.flush_list (protected by buf_pool.flush_list_mutex)
The contents is undefined if in_file() && !oldest_modification(),
- or if state() is BUF_BLOCK_MEMORY or BUF_BLOCK_REMOVE_HASH. */
+ or if state() == MEMORY or state() == REMOVE_HASH. */
UT_LIST_NODE_T(buf_page_t) list;
/** @name LRU replacement algorithm fields.
@@ -843,7 +615,7 @@ public:
0 if the block was never accessed
in the buffer pool.
- For state==BUF_BLOCK_MEMORY
+ For state() == MEMORY
blocks, this field can be repurposed
for something else.
@@ -851,89 +623,127 @@ public:
and bytes allocated for recv_sys.pages,
the field is protected by
recv_sys_t::mutex. */
- /** Change buffer entries for the page exist.
- Protected by io_fix()==BUF_IO_READ or by buf_block_t::lock. */
- bool ibuf_exist;
-
- /** Block initialization status. Can be modified while holding io_fix()
- or buf_block_t::lock X-latch */
- enum {
- /** the page was read normally and should be flushed normally */
- NORMAL = 0,
- /** the page was (re)initialized, and the doublewrite buffer can be
- skipped on the next flush */
- INIT_ON_FLUSH,
- /** the page was freed and need to be flushed.
- For page_compressed, page flush will punch a hole to free space.
- Else if innodb_immediate_scrub_data_uncompressed, the page will
- be overwritten with zeroes. */
- FREED
- } status;
-
- buf_page_t() : id_(0)
+ buf_page_t() : id_{0}
{
- static_assert(BUF_BLOCK_NOT_USED == 0, "compatibility");
+ static_assert(NOT_USED == 0, "compatibility");
memset((void*) this, 0, sizeof *this);
}
- /** Initialize some fields */
- void init()
+ buf_page_t(const buf_page_t &b) :
+ id_(b.id_), hash(b.hash),
+ oldest_modification_(b.oldest_modification_),
+ lock() /* not copied */,
+ frame(b.frame), zip(b.zip),
+#ifdef UNIV_DEBUG
+ in_zip_hash(b.in_zip_hash), in_LRU_list(b.in_LRU_list),
+ in_page_hash(b.in_page_hash), in_free_list(b.in_free_list),
+#endif /* UNIV_DEBUG */
+ list(b.list), LRU(b.LRU), old(b.old), freed_page_clock(b.freed_page_clock),
+ access_time(b.access_time)
{
- io_fix_= BUF_IO_NONE;
- buf_fix_count_= 0;
- old= 0;
- freed_page_clock= 0;
- access_time= 0;
+ lock.init();
+ }
+
+ /** Initialize some more fields */
+ void init(uint32_t state, page_id_t id)
+ {
+ ut_ad(state < REMOVE_HASH || state >= UNFIXED);
+ id_= id;
+ zip.fix= state;
oldest_modification_= 0;
- slot= nullptr;
- ibuf_exist= false;
- status= NORMAL;
+ lock.init();
ut_d(in_zip_hash= false);
ut_d(in_free_list= false);
ut_d(in_LRU_list= false);
ut_d(in_page_hash= false);
- HASH_INVALIDATE(this, hash);
+ old= 0;
+ freed_page_clock= 0;
+ access_time= 0;
}
- /** Initialize some more fields */
- void init(buf_page_state state, page_id_t id, uint32_t buf_fix_count= 0)
+public:
+ const page_id_t &id() const { return id_; }
+ uint32_t state() const { return zip.fix; }
+ uint32_t buf_fix_count() const
{
- init();
- state_= state;
- id_= id;
- buf_fix_count_= buf_fix_count;
+ uint32_t f= state();
+ ut_ad(f >= FREED);
+ return f < UNFIXED ? (f - FREED) : (~LRU_MASK & f);
}
+ /** @return whether this block is read or write fixed;
+ read_complete() or write_complete() will always release
+ the io-fix before releasing U-lock or X-lock */
+ bool is_io_fixed() const
+ { const auto s= state(); ut_ad(s >= FREED); return s >= READ_FIX; }
+ /** @return whether this block is write fixed;
+ write_complete() will always release the write-fix before releasing U-lock */
+ bool is_write_fixed() const { return state() >= WRITE_FIX; }
+ /** @return whether this block is read fixed; this should never hold
+ when a thread is holding the block lock in any mode */
+ bool is_read_fixed() const { return is_io_fixed() && !is_write_fixed(); }
- /** Initialize some more fields */
- void init(page_id_t id, uint32_t buf_fix_count= 0)
+ /** @return if this belongs to buf_pool.unzip_LRU */
+ bool belongs_to_unzip_LRU() const
+ { return UNIV_LIKELY_NULL(zip.data) && frame; }
+
+ bool is_freed() const
+ { const auto s= state(); ut_ad(s >= FREED); return s < UNFIXED; }
+ bool is_ibuf_exist() const
{
- init();
- id_= id;
- buf_fix_count_= buf_fix_count;
+ const auto s= state();
+ ut_ad(s >= UNFIXED);
+ ut_ad(s < READ_FIX);
+ return (s & LRU_MASK) == IBUF_EXIST;
}
+ bool is_reinit() const { return !(~state() & REINIT); }
-public:
- const page_id_t &id() const { return id_; }
- buf_page_state state() const { return state_; }
- uint32_t buf_fix_count() const { return buf_fix_count_; }
- buf_io_fix io_fix() const { return io_fix_; }
- void io_unfix()
+ void set_reinit(uint32_t prev_state)
{
- ut_d(const auto old_io_fix= io_fix());
- ut_ad(old_io_fix == BUF_IO_READ || old_io_fix == BUF_IO_PIN);
- io_fix_= BUF_IO_NONE;
+ ut_ad(prev_state < READ_FIX);
+ ut_d(const auto s=) zip.fix.fetch_add(REINIT - prev_state);
+ ut_ad(s > prev_state);
+ ut_ad(s < prev_state + UNFIXED);
}
- /** @return if this belongs to buf_pool.unzip_LRU */
- bool belongs_to_unzip_LRU() const
+ void set_ibuf_exist()
+ {
+ ut_ad(lock.is_write_locked());
+ ut_ad(id() < page_id_t(SRV_SPACE_ID_UPPER_BOUND, 0));
+ const auto s= state();
+ ut_ad(s >= UNFIXED);
+ ut_ad(s < READ_FIX);
+ ut_ad(s < IBUF_EXIST || s >= REINIT);
+ zip.fix.fetch_add(IBUF_EXIST - (LRU_MASK & s));
+ }
+ void clear_ibuf_exist()
+ {
+ ut_ad(lock.is_write_locked());
+ ut_ad(id() < page_id_t(SRV_SPACE_ID_UPPER_BOUND, 0));
+ ut_d(const auto s=) zip.fix.fetch_sub(IBUF_EXIST - UNFIXED);
+ ut_ad(s >= IBUF_EXIST);
+ ut_ad(s < REINIT);
+ }
+
+ uint32_t read_unfix(uint32_t s)
{
- return zip.data && state() != BUF_BLOCK_ZIP_PAGE;
+ ut_ad(lock.is_write_locked());
+ ut_ad(s == UNFIXED + 1 || s == IBUF_EXIST + 1 || s == REINIT + 1);
+ uint32_t old_state= zip.fix.fetch_add(s - READ_FIX);
+ ut_ad(old_state >= READ_FIX);
+ ut_ad(old_state < WRITE_FIX);
+ return old_state + (s - READ_FIX);
}
- inline void add_buf_fix_count(uint32_t count);
- inline void set_buf_fix_count(uint32_t count);
- inline void set_state(buf_page_state state);
- inline void set_io_fix(buf_io_fix io_fix);
+ void set_freed(uint32_t prev_state, uint32_t count= 0)
+ {
+ ut_ad(lock.is_write_locked());
+ ut_ad(prev_state >= UNFIXED);
+ ut_ad(prev_state < READ_FIX);
+ ut_d(auto s=) zip.fix.fetch_sub((prev_state & LRU_MASK) - FREED - count);
+ ut_ad(!((prev_state ^ s) & LRU_MASK));
+ }
+
+ inline void set_state(uint32_t s);
inline void set_corrupt_id();
/** @return the log sequence number of the oldest pending modification
@@ -953,35 +763,72 @@ public:
inline void set_oldest_modification(lsn_t lsn);
/** Clear oldest_modification after removing from buf_pool.flush_list */
inline void clear_oldest_modification();
+ /** Reset the oldest_modification when marking a persistent page freed */
+ void reset_oldest_modification()
+ {
+ ut_ad(oldest_modification() > 2);
+ oldest_modification_.store(1, std::memory_order_release);
+ }
+
+ /** Complete a read of a page.
+ @param node data file
+ @return whether the operation succeeded
+ @retval DB_PAGE_CORRUPTED if the checksum fails
+ @retval DB_DECRYPTION_FAILED if the page cannot be decrypted
+ @retval DB_FAIL if the page contains the wrong ID */
+ dberr_t read_complete(const fil_node_t &node);
+
/** Note that a block is no longer dirty, while not removing
it from buf_pool.flush_list */
- inline void clear_oldest_modification(bool temporary);
+ inline void write_complete(bool temporary);
+
+ /** Write a flushable page to a file or free a freeable block.
+ @param evict whether to evict the page on write completion
+ @param space tablespace
+ @return whether a page write was initiated and buf_pool.mutex released */
+ bool flush(bool evict, fil_space_t *space);
/** Notify that a page in a temporary tablespace has been modified. */
void set_temp_modified()
{
ut_ad(fsp_is_system_temporary(id().space()));
- ut_ad(state() == BUF_BLOCK_FILE_PAGE);
- ut_ad(!oldest_modification());
+ ut_ad(in_file());
+ ut_ad((oldest_modification() | 2) == 2);
oldest_modification_= 2;
}
/** Prepare to release a file page to buf_pool.free. */
void free_file_page()
{
- ut_ad(state() == BUF_BLOCK_REMOVE_HASH);
+ ut_ad((zip.fix.fetch_sub(REMOVE_HASH - MEMORY)) == REMOVE_HASH);
/* buf_LRU_block_free_non_file_page() asserts !oldest_modification() */
ut_d(oldest_modification_= 0;)
- set_corrupt_id();
- ut_d(set_state(BUF_BLOCK_MEMORY));
+ id_= page_id_t(~0ULL);
+ }
+
+ void fix_on_recovery()
+ {
+ ut_d(const auto f=) zip.fix.fetch_sub(READ_FIX - UNFIXED - 1);
+ ut_ad(f >= READ_FIX);
+ ut_ad(f < WRITE_FIX);
+ }
+
+ uint32_t fix(uint32_t count= 1)
+ {
+ ut_ad(count);
+ ut_ad(count < IBUF_EXIST);
+ uint32_t f= zip.fix.fetch_add(count);
+ ut_ad(f >= FREED);
+ ut_ad(!((f ^ (f + 1)) & LRU_MASK));
+ return f;
}
- void fix() { buf_fix_count_++; }
uint32_t unfix()
{
- uint32_t count= buf_fix_count_--;
- ut_ad(count != 0);
- return count - 1;
+ uint32_t f= zip.fix.fetch_sub(1);
+ ut_ad(f > FREED);
+ ut_ad(!((f ^ (f - 1)) & LRU_MASK));
+ return f - 1;
}
/** @return the physical size, in bytes */
@@ -1007,27 +854,8 @@ public:
}
/** @return whether the block is mapped to a data file */
- bool in_file() const
- {
- switch (state_) {
- case BUF_BLOCK_ZIP_PAGE:
- case BUF_BLOCK_FILE_PAGE:
- return true;
- case BUF_BLOCK_NOT_USED:
- case BUF_BLOCK_MEMORY:
- case BUF_BLOCK_REMOVE_HASH:
- return false;
- }
+ bool in_file() const { return state() >= FREED; }
- ut_error;
- return false;
- }
-
- /** @return whether the block is modified and ready for flushing */
- inline bool ready_for_flush() const;
- /** @return whether the state can be changed to BUF_BLOCK_NOT_USED */
- bool ready_for_replace() const
- { return !oldest_modification() && can_relocate(); }
/** @return whether the block can be relocated in memory.
The block can be dirty, but it must not be I/O-fixed or bufferfixed. */
inline bool can_relocate() const;
@@ -1059,27 +887,18 @@ struct buf_block_t{
be the first field, so that
buf_pool.page_hash can point
to buf_page_t or buf_block_t */
- byte* frame; /*!< pointer to buffer frame which
- is of size srv_page_size, and
- aligned to an address divisible by
- srv_page_size */
- rw_lock_t lock; /*!< read-write lock of the buffer
- frame */
#ifdef UNIV_DEBUG
/** whether page.list is in buf_pool.withdraw
- ((state() == BUF_BLOCK_NOT_USED)) and the buffer pool is being shrunk;
+ ((state() == NOT_USED)) and the buffer pool is being shrunk;
protected by buf_pool.mutex */
bool in_withdraw_list;
/** whether unzip_LRU is in buf_pool.unzip_LRU
- (state() == BUF_BLOCK_FILE_PAGE and zip.data != nullptr);
+ (in_file() && frame && zip.data);
protected by buf_pool.mutex */
bool in_unzip_LRU_list;
#endif
- UT_LIST_NODE_T(buf_block_t) unzip_LRU;
- /*!< node of the decompressed LRU list;
- a block is in the unzip_LRU list
- if page.state() == BUF_BLOCK_FILE_PAGE
- and page.zip.data != NULL */
+ /** member of buf_pool.unzip_LRU (if belongs_to_unzip_LRU()) */
+ UT_LIST_NODE_T(buf_block_t) unzip_LRU;
/* @} */
/** @name Optimistic search field */
/* @{ */
@@ -1118,17 +937,15 @@ struct buf_block_t{
These 5 fields may only be modified when:
we are holding the appropriate x-latch in btr_search_latches[], and
one of the following holds:
- (1) the block state is BUF_BLOCK_FILE_PAGE, and
- we are holding an s-latch or x-latch on buf_block_t::lock, or
- (2) buf_block_t::buf_fix_count == 0, or
- (3) the block state is BUF_BLOCK_REMOVE_HASH.
+ (1) in_file(), and we are holding lock in any mode, or
+ (2) !is_read_fixed()&&(state()>=UNFIXED||state()==REMOVE_HASH).
An exception to this is when we init or create a page
in the buffer pool in buf0buf.cc.
Another exception for buf_pool_t::clear_hash_index() is that
assigning block->index = NULL (and block->n_pointers = 0)
- is allowed whenever btr_search_own_all(RW_LOCK_X).
+ is allowed whenever all AHI latches are exclusively locked.
Another exception is that ha_insert_for_fold() may
decrement n_pointers without holding the appropriate latch
@@ -1137,8 +954,8 @@ struct buf_block_t{
This implies that the fields may be read without race
condition whenever any of the following hold:
- - the btr_search_latches[] s-latch or x-latch is being held, or
- - the block state is not BUF_BLOCK_FILE_PAGE or BUF_BLOCK_REMOVE_HASH,
+ - the btr_search_sys.partition[].latch is being held, or
+ - state() == NOT_USED || state() == MEMORY,
and holding some latch prevents the state from changing to that.
Some use of assert_block_ahi_empty() or assert_block_ahi_valid()
@@ -1152,9 +969,7 @@ struct buf_block_t{
Atomic_counter<ulint>
n_pointers; /*!< used in debugging: the number of
pointers in the adaptive hash index
- pointing to this frame;
- protected by atomic memory access
- or btr_search_own_all(). */
+ pointing to this frame */
# define assert_block_ahi_empty(block) \
ut_a((block)->n_pointers == 0)
# define assert_block_ahi_empty_on_init(block) do { \
@@ -1188,24 +1003,8 @@ struct buf_block_t{
# define assert_block_ahi_empty_on_init(block) /* nothing */
# define assert_block_ahi_valid(block) /* nothing */
#endif /* BTR_CUR_HASH_ADAPT */
-# ifdef UNIV_DEBUG
- /** @name Debug fields */
- /* @{ */
- rw_lock_t* debug_latch; /*!< in the debug version, each thread
- which bufferfixes the block acquires
- an s-latch here; so we can use the
- debug utilities in sync0rw */
- /* @} */
-# endif
void fix() { page.fix(); }
- uint32_t unfix()
- {
- ut_ad(page.buf_fix_count() || page.io_fix() != BUF_IO_NONE ||
- page.state() == BUF_BLOCK_ZIP_PAGE ||
- !rw_lock_own_flagged(&lock, RW_LOCK_FLAG_X | RW_LOCK_FLAG_S |
- RW_LOCK_FLAG_SX));
- return page.unfix();
- }
+ uint32_t unfix() { return page.unfix(); }
/** @return the physical size, in bytes */
ulint physical_size() const { return page.physical_size(); }
@@ -1217,22 +1016,22 @@ struct buf_block_t{
/** Initialize the block.
@param page_id page identifier
@param zip_size ROW_FORMAT=COMPRESSED page size, or 0
- @param fix initial buf_fix_count() */
- void initialise(const page_id_t page_id, ulint zip_size, uint32_t fix= 0);
+ @param state initial state() */
+ void initialise(const page_id_t page_id, ulint zip_size, uint32_t state);
};
/**********************************************************************//**
Compute the hash fold value for blocks in buf_pool.zip_hash. */
/* @{ */
#define BUF_POOL_ZIP_FOLD_PTR(ptr) (ulint(ptr) >> srv_page_size_shift)
-#define BUF_POOL_ZIP_FOLD(b) BUF_POOL_ZIP_FOLD_PTR((b)->frame)
+#define BUF_POOL_ZIP_FOLD(b) BUF_POOL_ZIP_FOLD_PTR((b)->page.frame)
#define BUF_POOL_ZIP_FOLD_BPAGE(b) BUF_POOL_ZIP_FOLD((buf_block_t*) (b))
/* @} */
-/** A "Hazard Pointer" class used to iterate over page lists
-inside the buffer pool. A hazard pointer is a buf_page_t pointer
+/** A "Hazard Pointer" class used to iterate over buf_pool.LRU or
+buf_pool.flush_list. A hazard pointer is a buf_page_t pointer
which we intend to iterate over next and we want it remain valid
-even after we release the buffer pool mutex. */
+even after we release the mutex that protects the list. */
class HazardPointer
{
public:
@@ -1347,14 +1146,18 @@ struct buf_buddy_free_t {
/*!< Node of zip_free list */
};
-/** @brief The buffer pool statistics structure. */
+/** @brief The buffer pool statistics structure;
+protected by buf_pool.mutex unless otherwise noted. */
struct buf_pool_stat_t{
- ulint n_page_gets; /*!< number of page gets performed;
+ /** Initialize the counters */
+ void init() { memset((void*) this, 0, sizeof *this); }
+
+ ib_counter_t<ulint, ib_counter_element_t> n_page_gets;
+ /*!< number of page gets performed;
also successful searches through
the adaptive hash index are
- counted as page gets; this field
- is NOT protected by the buffer
- pool mutex */
+ counted as page gets;
+ NOT protected by buf_pool.mutex */
ulint n_pages_read; /*!< number read operations */
ulint n_pages_written;/*!< number write operations */
ulint n_pages_created;/*!< number of pages created
@@ -1372,10 +1175,9 @@ struct buf_pool_stat_t{
young because the first access
was not long enough ago, in
buf_page_peek_if_too_old() */
- /** number of waits for eviction; writes protected by buf_pool.mutex */
+ /** number of waits for eviction */
ulint LRU_waits;
ulint LRU_bytes; /*!< LRU size in bytes */
- ulint flush_list_bytes;/*!< flush_list size in bytes */
};
/** Statistics of buddy blocks of a given size. */
@@ -1415,7 +1217,7 @@ class buf_pool_t
size_t mem_size() const { return mem_pfx.m_size; }
/** Register the chunk */
- void reg() { map_reg->emplace(map::value_type(blocks->frame, this)); }
+ void reg() { map_reg->emplace(map::value_type(blocks->page.frame, this)); }
/** Allocate a chunk of buffer frames.
@param bytes requested size
@@ -1442,7 +1244,14 @@ class buf_pool_t
inline const buf_block_t *not_freed() const;
#endif /* UNIV_DEBUG */
};
-
+public:
+ /** Hash cell chain in page_hash_table */
+ struct hash_chain
+ {
+ /** pointer to the first block */
+ buf_page_t *first;
+ };
+private:
/** Withdraw blocks from the buffer pool until meeting withdraw_target.
@return whether retry is needed */
inline bool withdraw_blocks();
@@ -1494,27 +1303,27 @@ public:
{
ut_ad(is_initialised());
size_t size= 0;
- for (auto j= n_chunks; j--; )
+ for (auto j= ut_min(n_chunks_new, n_chunks); j--; )
size+= chunks[j].size;
return size;
}
/** Determine whether a frame is intended to be withdrawn during resize().
- @param ptr pointer within a buf_block_t::frame
+ @param ptr pointer within a buf_page_t::frame
@return whether the frame will be withdrawn */
bool will_be_withdrawn(const byte *ptr) const
{
- ut_ad(curr_size < old_size);
+ ut_ad(n_chunks_new < n_chunks);
#ifdef SAFE_MUTEX
- if (resizing.load(std::memory_order_relaxed))
+ if (resize_in_progress())
mysql_mutex_assert_owner(&mutex);
#endif /* SAFE_MUTEX */
for (const chunk_t *chunk= chunks + n_chunks_new,
* const echunk= chunks + n_chunks;
chunk != echunk; chunk++)
- if (ptr >= chunk->blocks->frame &&
- ptr < (chunk->blocks + chunk->size - 1)->frame + srv_page_size)
+ if (ptr >= chunk->blocks->page.frame &&
+ ptr < (chunk->blocks + chunk->size - 1)->page.frame + srv_page_size)
return true;
return false;
}
@@ -1524,9 +1333,9 @@ public:
@return whether the frame will be withdrawn */
bool will_be_withdrawn(const buf_page_t &bpage) const
{
- ut_ad(curr_size < old_size);
+ ut_ad(n_chunks_new < n_chunks);
#ifdef SAFE_MUTEX
- if (resizing.load(std::memory_order_relaxed))
+ if (resize_in_progress())
mysql_mutex_assert_owner(&mutex);
#endif /* SAFE_MUTEX */
@@ -1540,8 +1349,9 @@ public:
}
/** Release and evict a corrupted page.
- @param bpage page that was being read */
- ATTRIBUTE_COLD void corrupted_evict(buf_page_t *bpage);
+ @param bpage x-latched page that was found corrupted
+ @param state expected current state of the page */
+ ATTRIBUTE_COLD void corrupted_evict(buf_page_t *bpage, uint32_t state);
/** Release a memory block to the buffer pool. */
ATTRIBUTE_COLD void free_block(buf_block_t *block);
@@ -1576,9 +1386,6 @@ public:
inline buf_block_t *block_from_ahi(const byte *ptr) const;
#endif /* BTR_CUR_HASH_ADAPT */
- bool is_block_lock(const rw_lock_t *l) const
- { return is_block_field(static_cast<const void*>(l)); }
-
/**
@return the smallest oldest_modification lsn for any page
@retval empty_lsn if all modified persistent pages have been flushed */
@@ -1607,84 +1414,27 @@ public:
return is_block_field(reinterpret_cast<const void*>(block));
}
- /** Get the page_hash latch for a page */
- page_hash_latch *hash_lock_get(const page_id_t id) const
- {
- return page_hash.lock_get(id.fold());
- }
-
- /** Look up a block descriptor.
- @param id page identifier
- @param fold id.fold()
- @return block descriptor, possibly in watch[]
- @retval nullptr if not found*/
- buf_page_t *page_hash_get_low(const page_id_t id, const ulint fold)
- {
- ut_ad(id.fold() == fold);
-#ifdef SAFE_MUTEX
- DBUG_ASSERT(mysql_mutex_is_owner(&mutex) ||
- page_hash.lock_get(fold)->is_locked());
-#endif /* SAFE_MUTEX */
- buf_page_t *bpage;
- /* Look for the page in the hash table */
- HASH_SEARCH(hash, &page_hash, fold, buf_page_t*, bpage,
- ut_ad(bpage->in_page_hash), id == bpage->id());
- return bpage;
- }
-private:
- /** Look up a block descriptor.
- @tparam exclusive whether the latch is to be acquired exclusively
- @tparam watch whether to allow watch_is_sentinel()
- @param page_id page identifier
- @param fold page_id.fold()
- @param hash_lock pointer to the acquired latch (to be released by caller)
- @return pointer to the block
- @retval nullptr if no block was found; !lock || !*lock will also hold */
- template<bool exclusive,bool watch>
- buf_page_t *page_hash_get_locked(const page_id_t page_id, ulint fold,
- page_hash_latch **hash_lock)
+public:
+ /** @return whether the buffer pool contains a page
+ @tparam allow_watch whether to allow watch_is_sentinel()
+ @param page_id page identifier
+ @param chain hash table chain for page_id.fold() */
+ template<bool allow_watch= false>
+ TRANSACTIONAL_INLINE
+ bool page_hash_contains(const page_id_t page_id, hash_chain &chain)
{
- ut_ad(hash_lock || !exclusive);
- page_hash_latch *latch= page_hash.lock<exclusive>(fold);
- buf_page_t *bpage= page_hash_get_low(page_id, fold);
- if (!bpage || watch_is_sentinel(*bpage))
+ transactional_shared_lock_guard<page_hash_latch> g
+ {page_hash.lock_get(chain)};
+ buf_page_t *bpage= page_hash.get(page_id, chain);
+ if (bpage >= &watch[0] && bpage < &watch[UT_ARR_SIZE(watch)])
{
- latch->release<exclusive>();
- if (hash_lock)
- *hash_lock= nullptr;
- return watch ? bpage : nullptr;
+ ut_ad(!bpage->in_zip_hash);
+ ut_ad(!bpage->zip.data);
+ if (!allow_watch)
+ bpage= nullptr;
}
-
- ut_ad(bpage->in_file());
- ut_ad(page_id == bpage->id());
-
- if (hash_lock)
- *hash_lock= latch; /* to be released by the caller */
- else
- latch->release<exclusive>();
return bpage;
}
-public:
- /** Look up a block descriptor.
- @tparam exclusive whether the latch is to be acquired exclusively
- @param page_id page identifier
- @param fold page_id.fold()
- @param hash_lock pointer to the acquired latch (to be released by caller)
- @return pointer to the block
- @retval nullptr if no block was found; !lock || !*lock will also hold */
- template<bool exclusive>
- buf_page_t *page_hash_get_locked(const page_id_t page_id, ulint fold,
- page_hash_latch **hash_lock)
- { return page_hash_get_locked<exclusive,false>(page_id, fold, hash_lock); }
-
- /** @return whether the buffer pool contains a page
- @tparam watch whether to allow watch_is_sentinel()
- @param page_id page identifier */
- template<bool watch= false>
- bool page_hash_contains(const page_id_t page_id)
- {
- return page_hash_get_locked<false,watch>(page_id, page_id.fold(), nullptr);
- }
/** Determine if a block is a sentinel for a buffer pool watch.
@param bpage page descriptor
@@ -1693,17 +1443,12 @@ public:
{
#ifdef SAFE_MUTEX
DBUG_ASSERT(mysql_mutex_is_owner(&mutex) ||
- hash_lock_get(bpage.id())->is_locked());
+ page_hash.lock_get(page_hash.cell_get(bpage.id().fold())).
+ is_locked());
#endif /* SAFE_MUTEX */
ut_ad(bpage.in_file());
-
- if (&bpage < &watch[0] || &bpage >= &watch[UT_ARR_SIZE(watch)])
- {
- ut_ad(bpage.state() != BUF_BLOCK_ZIP_PAGE || bpage.zip.data);
+ if (&bpage < &watch[0] || &bpage >= &watch[array_elements(watch)])
return false;
- }
-
- ut_ad(bpage.state() == BUF_BLOCK_ZIP_PAGE);
ut_ad(!bpage.in_zip_hash);
ut_ad(!bpage.zip.data);
return true;
@@ -1713,44 +1458,55 @@ public:
This may only be called after !watch_set() and before invoking watch_unset().
@param id page identifier
@return whether the page was read to the buffer pool */
+ TRANSACTIONAL_INLINE
bool watch_occurred(const page_id_t id)
{
- const ulint fold= id.fold();
- page_hash_latch *hash_lock= page_hash.lock<false>(fold);
+ hash_chain &chain= page_hash.cell_get(id.fold());
+ transactional_shared_lock_guard<page_hash_latch> g
+ {page_hash.lock_get(chain)};
/* The page must exist because watch_set() increments buf_fix_count. */
- buf_page_t *bpage= page_hash_get_low(id, fold);
- const bool is_sentinel= watch_is_sentinel(*bpage);
- hash_lock->read_unlock();
- return !is_sentinel;
+ return !watch_is_sentinel(*page_hash.get(id, chain));
}
- /** Register a watch for a page identifier. The caller must hold an
- exclusive page hash latch. The *hash_lock may be released,
- relocated, and reacquired.
+ /** Register a watch for a page identifier.
@param id page identifier
- @param hash_lock exclusively held page_hash latch
- @return a buffer pool block corresponding to id
- @retval nullptr if the block was not present, and a watch was installed */
- inline buf_page_t *watch_set(const page_id_t id,
- page_hash_latch **hash_lock);
+ @param chain page_hash.cell_get(id.fold())
+ @return a buffer page corresponding to id
+ @retval nullptr if the block was not present in page_hash */
+ buf_page_t *watch_set(const page_id_t id, hash_chain &chain);
/** Stop watching whether a page has been read in.
watch_set(id) must have returned nullptr before.
- @param id page identifier */
- void watch_unset(const page_id_t id);
+ @param id page identifier
+ @param chain unlocked hash table chain */
+ void watch_unset(const page_id_t id, hash_chain &chain);
/** Remove the sentinel block for the watch before replacing it with a
real block. watch_unset() or watch_occurred() will notice
that the block has been replaced with the real block.
- @param watch sentinel */
- inline void watch_remove(buf_page_t *watch);
+ @param w sentinel
+ @param chain locked hash table chain
+ @return w->state() */
+ inline uint32_t watch_remove(buf_page_t *w, hash_chain &chain);
/** @return whether less than 1/4 of the buffer pool is available */
+ TPOOL_SUPPRESS_TSAN
bool running_out() const
{
return !recv_recovery_is_on() &&
- UNIV_UNLIKELY(UT_LIST_GET_LEN(free) + UT_LIST_GET_LEN(LRU) <
- std::min(curr_size, old_size) / 4);
+ UT_LIST_GET_LEN(free) + UT_LIST_GET_LEN(LRU) <
+ n_chunks_new / 4 * chunks->size;
+ }
+
+ /** @return whether the buffer pool has run out */
+ TPOOL_SUPPRESS_TSAN
+ bool ran_out() const
+ { return UNIV_UNLIKELY(!try_LRU_scan || !UT_LIST_GET_LEN(free)); }
+
+ /** @return whether the buffer pool is shrinking */
+ inline bool is_shrinking() const
+ {
+ return n_chunks_new < n_chunks;
}
#ifdef UNIV_DEBUG
@@ -1783,18 +1539,11 @@ public:
static constexpr uint32_t READ_AHEAD_PAGES= 64;
/** Buffer pool mutex */
- MY_ALIGNED(CPU_LEVEL1_DCACHE_LINESIZE) mysql_mutex_t mutex;
- /** Number of pending LRU flush; protected by mutex. */
- ulint n_flush_LRU_;
- /** broadcast when n_flush_LRU reaches 0; protected by mutex */
- pthread_cond_t done_flush_LRU;
- /** Number of pending flush_list flush; protected by mutex */
- ulint n_flush_list_;
- /** broadcast when n_flush_list reaches 0; protected by mutex */
- pthread_cond_t done_flush_list;
-
- TPOOL_SUPPRESS_TSAN ulint n_flush_LRU() const { return n_flush_LRU_; }
- TPOOL_SUPPRESS_TSAN ulint n_flush_list() const { return n_flush_list_; }
+ alignas(CPU_LEVEL1_DCACHE_LINESIZE) mysql_mutex_t mutex;
+ /** current statistics; protected by mutex */
+ buf_pool_stat_t stat;
+ /** old statistics; protected by mutex */
+ buf_pool_stat_t old_stat;
/** @name General fields */
/* @{ */
@@ -1809,30 +1558,35 @@ public:
ut_allocator<unsigned char> allocator; /*!< Allocator used for
allocating memory for the the "chunks"
member. */
- volatile ulint n_chunks; /*!< number of buffer pool chunks */
- volatile ulint n_chunks_new; /*!< new number of buffer pool chunks */
+ ulint n_chunks; /*!< number of buffer pool chunks */
+ ulint n_chunks_new; /*!< new number of buffer pool chunks.
+ both n_chunks{,new} are protected under
+ mutex */
chunk_t* chunks; /*!< buffer pool chunks */
chunk_t* chunks_old; /*!< old buffer pool chunks to be freed
after resizing buffer pool */
/** current pool size in pages */
Atomic_counter<ulint> curr_size;
- /** previous pool size in pages */
- Atomic_counter<ulint> old_size;
/** read-ahead request size in pages */
Atomic_counter<uint32_t> read_ahead_area;
- /** Hash table with singly-linked overflow lists. @see hash_table_t */
+ /** Hash table with singly-linked overflow lists */
struct page_hash_table
{
+ static_assert(CPU_LEVEL1_DCACHE_LINESIZE >= 64, "less than 64 bytes");
+ static_assert(!(CPU_LEVEL1_DCACHE_LINESIZE & 63),
+ "not a multiple of 64 bytes");
+
/** Number of array[] elements per page_hash_latch.
Must be one less than a power of 2. */
- static constexpr size_t ELEMENTS_PER_LATCH= CPU_LEVEL1_DCACHE_LINESIZE /
- sizeof(void*) - 1;
+ static constexpr size_t ELEMENTS_PER_LATCH= 64 / sizeof(void*) - 1;
+ static constexpr size_t EMPTY_SLOTS_PER_LATCH=
+ ((CPU_LEVEL1_DCACHE_LINESIZE / 64) - 1) * (64 / sizeof(void*));
/** number of payload elements in array[] */
Atomic_relaxed<ulint> n_cells;
/** the hash table, with pad(n_cells) elements, aligned to L1 cache size */
- hash_cell_t *array;
+ hash_chain *array;
/** Create the hash table.
@param n the lower bound of n_cells */
@@ -1844,7 +1598,12 @@ public:
/** @return the index of an array element */
ulint calc_hash(ulint fold) const { return calc_hash(fold, n_cells); }
/** @return raw array index converted to padded index */
- static ulint pad(ulint h) { return 1 + (h / ELEMENTS_PER_LATCH) + h; }
+ static ulint pad(ulint h)
+ {
+ ulint latches= h / ELEMENTS_PER_LATCH;
+ ulint empty_slots= latches * EMPTY_SLOTS_PER_LATCH;
+ return 1 + latches + empty_slots + h;
+ }
private:
/** @return the hash value before any ELEMENTS_PER_LATCH padding */
static ulint hash(ulint fold, ulint n) { return ut_hash_ulint(fold, n); }
@@ -1854,29 +1613,72 @@ public:
{
return pad(hash(fold, n_cells));
}
- /** Get a page_hash latch. */
- page_hash_latch *lock_get(ulint fold, ulint n) const
+ public:
+ /** @return the latch covering a hash table chain */
+ static page_hash_latch &lock_get(hash_chain &chain)
{
static_assert(!((ELEMENTS_PER_LATCH + 1) & ELEMENTS_PER_LATCH),
"must be one less than a power of 2");
- return reinterpret_cast<page_hash_latch*>
- (&array[calc_hash(fold, n) & ~ELEMENTS_PER_LATCH]);
+ const size_t addr= reinterpret_cast<size_t>(&chain);
+ ut_ad(addr & (ELEMENTS_PER_LATCH * sizeof chain));
+ return *reinterpret_cast<page_hash_latch*>
+ (addr & ~(ELEMENTS_PER_LATCH * sizeof chain));
}
- public:
- /** Get a page_hash latch. */
- page_hash_latch *lock_get(ulint fold) const
- { return lock_get(fold, n_cells); }
-
- /** Acquire an array latch.
- @tparam exclusive whether the latch is to be acquired exclusively
- @param fold hash bucket key */
- template<bool exclusive> page_hash_latch *lock(ulint fold)
+
+ /** Get a hash table slot. */
+ hash_chain &cell_get(ulint fold) const
+ { return array[calc_hash(fold, n_cells)]; }
+
+ /** Append a block descriptor to a hash bucket chain. */
+ void append(hash_chain &chain, buf_page_t *bpage)
+ {
+ ut_ad(!bpage->in_page_hash);
+ ut_ad(!bpage->hash);
+ ut_d(bpage->in_page_hash= true);
+ buf_page_t **prev= &chain.first;
+ while (*prev)
+ {
+ ut_ad((*prev)->in_page_hash);
+ prev= &(*prev)->hash;
+ }
+ *prev= bpage;
+ }
+
+ /** Remove a block descriptor from a hash bucket chain. */
+ void remove(hash_chain &chain, buf_page_t *bpage)
{
- page_hash_latch *latch= lock_get(fold, n_cells);
- latch->acquire<exclusive>();
- return latch;
+ ut_ad(bpage->in_page_hash);
+ buf_page_t **prev= &chain.first;
+ while (*prev != bpage)
+ {
+ ut_ad((*prev)->in_page_hash);
+ prev= &(*prev)->hash;
+ }
+ *prev= bpage->hash;
+ ut_d(bpage->in_page_hash= false);
+ bpage->hash= nullptr;
}
+ /** Replace a block descriptor with another. */
+ void replace(hash_chain &chain, buf_page_t *old, buf_page_t *bpage)
+ {
+ ut_ad(old->in_page_hash);
+ ut_ad(bpage->in_page_hash);
+ ut_d(old->in_page_hash= false);
+ ut_ad(bpage->hash == old->hash);
+ old->hash= nullptr;
+ buf_page_t **prev= &chain.first;
+ while (*prev != old)
+ {
+ ut_ad((*prev)->in_page_hash);
+ prev= &(*prev)->hash;
+ }
+ *prev= bpage;
+ }
+
+ /** Look up a page in a hash bucket chain. */
+ inline buf_page_t *get(const page_id_t id, const hash_chain &chain) const;
+
    /** Exclusively acquire all latches */
inline void write_lock_all();
@@ -1891,8 +1693,6 @@ public:
/** map of block->frame to buf_block_t blocks that belong
to buf_buddy_alloc(); protected by buf_pool.mutex */
hash_table_t zip_hash;
- /** number of pending read operations */
- Atomic_counter<ulint> n_pend_reads;
Atomic_counter<ulint>
n_pend_unzip; /*!< number of pending decompressions */
@@ -1902,44 +1702,90 @@ public:
buf_buddy_stat_t buddy_stat[BUF_BUDDY_SIZES_MAX + 1];
/*!< Statistics of buddy system,
indexed by block size */
- buf_pool_stat_t stat; /*!< current statistics */
- buf_pool_stat_t old_stat; /*!< old statistics */
/* @} */
+ /** number of index page splits */
+ Atomic_counter<ulint> pages_split;
+
/** @name Page flushing algorithm fields */
/* @{ */
/** mutex protecting flush_list, buf_page_t::set_oldest_modification()
and buf_page_t::list pointers when !oldest_modification() */
- MY_ALIGNED(CPU_LEVEL1_DCACHE_LINESIZE) mysql_mutex_t flush_list_mutex;
+ alignas(CPU_LEVEL1_DCACHE_LINESIZE) mysql_mutex_t flush_list_mutex;
/** "hazard pointer" for flush_list scans; protected by flush_list_mutex */
FlushHp flush_hp;
- /** modified blocks (a subset of LRU) */
+ /** flush_list size in bytes; protected by flush_list_mutex */
+ ulint flush_list_bytes;
+ /** possibly modified persistent pages (a subset of LRU);
+ os_aio_pending_writes() is approximately COUNT(is_write_fixed()) */
UT_LIST_BASE_NODE_T(buf_page_t) flush_list;
private:
- /** whether the page cleaner needs wakeup from indefinite sleep */
- bool page_cleaner_is_idle;
+ static constexpr unsigned PAGE_CLEANER_IDLE= 1;
+ static constexpr unsigned FLUSH_LIST_ACTIVE= 2;
+ static constexpr unsigned LRU_FLUSH= 4;
+
+ /** Number of pending LRU flush * LRU_FLUSH +
+ PAGE_CLEANER_IDLE + FLUSH_LIST_ACTIVE flags */
+ unsigned page_cleaner_status;
/** track server activity count for signaling idle flushing */
ulint last_activity_count;
public:
/** signalled to wake up the page_cleaner; protected by flush_list_mutex */
pthread_cond_t do_flush_list;
+ /** broadcast when !n_flush(); protected by flush_list_mutex */
+ pthread_cond_t done_flush_LRU;
+ /** broadcast when a batch completes; protected by flush_list_mutex */
+ pthread_cond_t done_flush_list;
+
+ /** @return number of pending LRU flush */
+ unsigned n_flush() const
+ {
+ mysql_mutex_assert_owner(&flush_list_mutex);
+ return page_cleaner_status / LRU_FLUSH;
+ }
+
+ /** Increment the number of pending LRU flush */
+ inline void n_flush_inc();
+
+ /** Decrement the number of pending LRU flush */
+ inline void n_flush_dec();
+
+ /** @return whether flush_list flushing is active */
+ bool flush_list_active() const
+ {
+ mysql_mutex_assert_owner(&flush_list_mutex);
+ return page_cleaner_status & FLUSH_LIST_ACTIVE;
+ }
+
+ void flush_list_set_active()
+ {
+ ut_ad(!flush_list_active());
+ page_cleaner_status+= FLUSH_LIST_ACTIVE;
+ }
+ void flush_list_set_inactive()
+ {
+ ut_ad(flush_list_active());
+ page_cleaner_status-= FLUSH_LIST_ACTIVE;
+ }
/** @return whether the page cleaner must sleep due to being idle */
bool page_cleaner_idle() const
{
mysql_mutex_assert_owner(&flush_list_mutex);
- return page_cleaner_is_idle;
+ return page_cleaner_status & PAGE_CLEANER_IDLE;
}
- /** Wake up the page cleaner if needed */
- void page_cleaner_wakeup();
+ /** Wake up the page cleaner if needed.
+ @param for_LRU whether to wake up for LRU eviction */
+ void page_cleaner_wakeup(bool for_LRU= false);
/** Register whether an explicit wakeup of the page cleaner is needed */
void page_cleaner_set_idle(bool deep_sleep)
{
mysql_mutex_assert_owner(&flush_list_mutex);
- page_cleaner_is_idle= deep_sleep;
+ page_cleaner_status= (page_cleaner_status & ~PAGE_CLEANER_IDLE) |
+ (PAGE_CLEANER_IDLE * deep_sleep);
}
/** Update server last activity count */
@@ -1949,9 +1795,6 @@ public:
last_activity_count= activity_count;
}
- // n_flush_LRU() + n_flush_list()
- // is approximately COUNT(io_fix()==BUF_IO_WRITE) in flush_list
-
unsigned freed_page_clock;/*!< a sequence number used
to count the number of buffer
blocks removed from the end of
@@ -1961,16 +1804,10 @@ public:
to read this for heuristic
purposes without holding any
mutex or latch */
- bool try_LRU_scan; /*!< Cleared when an LRU
- scan for free block fails. This
- flag is used to avoid repeated
- scans of LRU list when we know
- that there is no free block
- available in the scan depth for
- eviction. Set whenever
- we flush a batch from the
- buffer pool. Protected by the
- buf_pool.mutex */
+ /** Cleared when buf_LRU_get_free_block() fails.
+ Set whenever the free list grows, along with a broadcast of done_free.
+ Protected by buf_pool.mutex. */
+ Atomic_relaxed<bool> try_LRU_scan;
/* @} */
/** @name LRU replacement algorithm fields */
@@ -1979,7 +1816,8 @@ public:
UT_LIST_BASE_NODE_T(buf_page_t) free;
/*!< base node of the free
block list */
- /** signaled each time when the free list grows; protected by mutex */
+ /** broadcast each time when the free list grows or try_LRU_scan is set;
+ protected by mutex */
pthread_cond_t done_free;
UT_LIST_BASE_NODE_T(buf_page_t) withdraw;
@@ -2034,34 +1872,13 @@ public:
/** Reserve a buffer. */
buf_tmp_buffer_t *io_buf_reserve() { return io_buf.reserve(); }
- /** @return whether any I/O is pending */
- bool any_io_pending()
- {
- if (n_pend_reads)
- return true;
- mysql_mutex_lock(&mutex);
- const bool any_pending{n_flush_LRU_ || n_flush_list_};
- mysql_mutex_unlock(&mutex);
- return any_pending;
- }
- /** @return total amount of pending I/O */
- ulint io_pending() const
- {
- return n_pend_reads + n_flush_LRU() + n_flush_list();
- }
-
private:
/** Remove a block from the flush list. */
inline void delete_from_flush_list_low(buf_page_t *bpage);
- /** Remove a block from flush_list.
- @param bpage buffer pool page
- @param clear whether to invoke buf_page_t::clear_oldest_modification() */
- void delete_from_flush_list(buf_page_t *bpage, bool clear);
public:
/** Remove a block from flush_list.
@param bpage buffer pool page */
- void delete_from_flush_list(buf_page_t *bpage)
- { delete_from_flush_list(bpage, true); }
+ void delete_from_flush_list(buf_page_t *bpage);
/** Insert a modified block into the flush list.
@param block modified block
@@ -2069,7 +1886,7 @@ public:
void insert_into_flush_list(buf_block_t *block, lsn_t lsn);
/** Free a page whose underlying file page has been freed. */
- inline void release_freed_page(buf_page_t *bpage);
+ ATTRIBUTE_COLD void release_freed_page(buf_page_t *bpage);
private:
/** Temporary memory for page_compressed and encrypted I/O */
@@ -2080,34 +1897,12 @@ private:
/** array of slots */
buf_tmp_buffer_t *slots;
- void create(ulint n_slots)
- {
- this->n_slots= n_slots;
- slots= static_cast<buf_tmp_buffer_t*>
- (ut_malloc_nokey(n_slots * sizeof *slots));
- memset((void*) slots, 0, n_slots * sizeof *slots);
- }
+ void create(ulint n_slots);
- void close()
- {
- for (buf_tmp_buffer_t *s= slots, *e= slots + n_slots; s != e; s++)
- {
- aligned_free(s->crypt_buf);
- aligned_free(s->comp_buf);
- }
- ut_free(slots);
- slots= nullptr;
- n_slots= 0;
- }
+ void close();
/** Reserve a buffer */
- buf_tmp_buffer_t *reserve()
- {
- for (buf_tmp_buffer_t *s= slots, *e= slots + n_slots; s != e; s++)
- if (s->acquire())
- return s;
- return nullptr;
- }
+ buf_tmp_buffer_t *reserve();
} io_buf;
/** whether resize() is in the critical path */
@@ -2117,64 +1912,46 @@ private:
/** The InnoDB buffer pool */
extern buf_pool_t buf_pool;
-inline void page_hash_latch::read_lock()
+inline buf_page_t *buf_pool_t::page_hash_table::get(const page_id_t id,
+ const hash_chain &chain)
+ const
+{
+#ifdef SAFE_MUTEX
+ DBUG_ASSERT(mysql_mutex_is_owner(&buf_pool.mutex) ||
+ lock_get(const_cast<hash_chain&>(chain)).is_locked());
+#endif /* SAFE_MUTEX */
+ for (buf_page_t *bpage= chain.first; bpage; bpage= bpage->hash)
+ {
+ ut_ad(bpage->in_page_hash);
+ ut_ad(bpage->in_file());
+ if (bpage->id() == id)
+ return bpage;
+ }
+ return nullptr;
+}
+
+#ifdef SUX_LOCK_GENERIC
+inline void page_hash_latch::lock_shared()
{
mysql_mutex_assert_not_owner(&buf_pool.mutex);
if (!read_trylock())
read_lock_wait();
}
-inline void page_hash_latch::write_lock()
+inline void page_hash_latch::lock()
{
if (!write_trylock())
write_lock_wait();
}
+#endif /* SUX_LOCK_GENERIC */
-inline void buf_page_t::add_buf_fix_count(uint32_t count)
-{
- mysql_mutex_assert_owner(&buf_pool.mutex);
- buf_fix_count_+= count;
-}
-
-inline void buf_page_t::set_buf_fix_count(uint32_t count)
-{
- mysql_mutex_assert_owner(&buf_pool.mutex);
- buf_fix_count_= count;
-}
-
-inline void buf_page_t::set_state(buf_page_state state)
-{
- mysql_mutex_assert_owner(&buf_pool.mutex);
-#ifdef UNIV_DEBUG
- switch (state) {
- case BUF_BLOCK_REMOVE_HASH:
- /* buf_pool_t::corrupted_evict() invokes set_corrupt_id()
- before buf_LRU_free_one_page(), so we cannot assert that
- we are holding the hash_lock. */
- break;
- case BUF_BLOCK_MEMORY:
- if (!in_file()) break;
- /* fall through */
- case BUF_BLOCK_FILE_PAGE:
- ut_ad(buf_pool.hash_lock_get(id_)->is_write_locked());
- break;
- case BUF_BLOCK_NOT_USED:
- if (!in_file()) break;
- /* fall through */
- case BUF_BLOCK_ZIP_PAGE:
- ut_ad(buf_pool.hash_lock_get(id_)->is_write_locked() ||
- (this >= &buf_pool.watch[0] &&
- this <= &buf_pool.watch[UT_ARR_SIZE(buf_pool.watch)]));
- break;
- }
-#endif
- state_= state;
-}
-
-inline void buf_page_t::set_io_fix(buf_io_fix io_fix)
+inline void buf_page_t::set_state(uint32_t s)
{
mysql_mutex_assert_owner(&buf_pool.mutex);
- io_fix_= io_fix;
+ ut_ad(s <= REMOVE_HASH || s >= UNFIXED);
+ ut_ad(s < WRITE_FIX);
+ ut_ad(s <= READ_FIX || zip.fix == READ_FIX);
+ zip.fix= s;
}
inline void buf_page_t::set_corrupt_id()
@@ -2191,19 +1968,15 @@ inline void buf_page_t::set_corrupt_id()
default:
ut_ad("block is dirty" == 0);
}
- switch (state()) {
- case BUF_BLOCK_REMOVE_HASH:
- break;
- case BUF_BLOCK_ZIP_PAGE:
- case BUF_BLOCK_FILE_PAGE:
- ut_ad(buf_pool.hash_lock_get(id_)->is_write_locked());
- break;
- case BUF_BLOCK_NOT_USED:
- case BUF_BLOCK_MEMORY:
- ut_ad("invalid state" == 0);
+ const auto f= state();
+ if (f != REMOVE_HASH)
+ {
+ ut_ad(f >= UNFIXED);
+ ut_ad(buf_pool.page_hash.lock_get(buf_pool.page_hash.cell_get(id_.fold())).
+ is_write_locked());
}
#endif
- id_= page_id_t(~0ULL);
+ id_.set_corrupted();
}
/** Set oldest_modification when adding to buf_pool.flush_list */
@@ -2218,10 +1991,12 @@ inline void buf_page_t::set_oldest_modification(lsn_t lsn)
/** Clear oldest_modification after removing from buf_pool.flush_list */
inline void buf_page_t::clear_oldest_modification()
{
- mysql_mutex_assert_owner(&buf_pool.flush_list_mutex);
- ut_d(const auto state= state_);
- ut_ad(state == BUF_BLOCK_FILE_PAGE || state == BUF_BLOCK_ZIP_PAGE ||
- state == BUF_BLOCK_REMOVE_HASH);
+#ifdef SAFE_MUTEX
+ if (oldest_modification() != 2)
+ mysql_mutex_assert_owner(&buf_pool.flush_list_mutex);
+#endif /* SAFE_MUTEX */
+ ut_d(const auto s= state());
+ ut_ad(s >= REMOVE_HASH);
ut_ad(oldest_modification());
ut_ad(!list.prev);
ut_ad(!list.next);
@@ -2231,46 +2006,16 @@ inline void buf_page_t::clear_oldest_modification()
oldest_modification_.store(0, std::memory_order_release);
}
-/** Note that a block is no longer dirty, while not removing
-it from buf_pool.flush_list */
-inline void buf_page_t::clear_oldest_modification(bool temporary)
-{
- ut_ad(temporary == fsp_is_system_temporary(id().space()));
- if (temporary)
- {
- ut_ad(oldest_modification() == 2);
- oldest_modification_= 0;
- }
- else
- {
- /* We use release memory order to guarantee that callers of
- oldest_modification_acquire() will observe the block as
- being detached from buf_pool.flush_list, after reading the value 0. */
- ut_ad(oldest_modification() > 2);
- oldest_modification_.store(1, std::memory_order_release);
- }
-}
-
-/** @return whether the block is modified and ready for flushing */
-inline bool buf_page_t::ready_for_flush() const
-{
- mysql_mutex_assert_owner(&buf_pool.mutex);
- ut_ad(in_LRU_list);
- ut_a(in_file());
- ut_ad(fsp_is_system_temporary(id().space())
- ? oldest_modification() == 2
- : oldest_modification() > 2);
- return io_fix_ == BUF_IO_NONE;
-}
-
/** @return whether the block can be relocated in memory.
The block can be dirty, but it must not be I/O-fixed or bufferfixed. */
inline bool buf_page_t::can_relocate() const
{
mysql_mutex_assert_owner(&buf_pool.mutex);
- ut_ad(in_file());
+ const auto f= state();
+ ut_ad(f >= FREED);
ut_ad(in_LRU_list);
- return io_fix_ == BUF_IO_NONE && !buf_fix_count_;
+ return (f == FREED || (f < READ_FIX && !(f & ~LRU_MASK))) &&
+ !lock.is_locked_or_waiting();
}
/** @return whether the block has been flagged old in buf_pool.LRU */
@@ -2331,41 +2076,26 @@ inline void buf_page_t::set_old(bool old)
/**********************************************************************
Let us list the consistency conditions for different control block states.
-NOT_USED: is in free list, not in LRU list, not in flush list, nor
- page hash table
-MEMORY: is not in free list, LRU list, or flush list, nor page
- hash table
-FILE_PAGE: space and offset are defined, is in page hash table
- if io_fix == BUF_IO_WRITE,
- buf_pool.n_flush_LRU() || buf_pool.n_flush_list()
-
- (1) if buf_fix_count == 0, then
- is in LRU list, not in free list
- is in flush list,
- if and only if oldest_modification > 0
- is x-locked,
- if and only if io_fix == BUF_IO_READ
- is s-locked,
- if and only if io_fix == BUF_IO_WRITE
-
- (2) if buf_fix_count > 0, then
- is not in LRU list, not in free list
- is in flush list,
- if and only if oldest_modification > 0
- if io_fix == BUF_IO_READ,
- is x-locked
- if io_fix == BUF_IO_WRITE,
- is s-locked
+NOT_USED: is in free list, not LRU, not flush_list, nor page_hash
+MEMORY: is not in any of free, LRU, flush_list, page_hash
+in_file(): is not in free list, is in LRU list, id() is defined,
+ is in page_hash (not necessarily if is_read_fixed())
+
+ is in buf_pool.flush_list, if and only
+ if oldest_modification == 1 || oldest_modification > 2
+
+ (1) if is_write_fixed(): is u-locked
+ (2) if is_read_fixed(): is x-locked
State transitions:
NOT_USED => MEMORY
-MEMORY => FILE_PAGE
MEMORY => NOT_USED
-FILE_PAGE => NOT_USED NOTE: This transition is allowed if and only if
- (1) buf_fix_count == 0,
- (2) oldest_modification == 0, and
- (3) io_fix == 0.
+MEMORY => UNFIXED
+UNFIXED => in_file()
+in_file() => UNFIXED or FREED
+UNFIXED or FREED => REMOVE_HASH
+REMOVE_HASH => NOT_USED (if and only if !oldest_modification())
*/
/** Select from where to start a scan. If we have scanned
@@ -2427,5 +2157,3 @@ struct CheckUnzipLRUAndLRUList {
#include "buf0buf.inl"
#endif /* !UNIV_INNOCHECKSUM */
-
-#endif
diff --git a/storage/innobase/include/buf0buf.inl b/storage/innobase/include/buf0buf.inl
index 364f04d3f69..3c4da98f83b 100644
--- a/storage/innobase/include/buf0buf.inl
+++ b/storage/innobase/include/buf0buf.inl
@@ -2,7 +2,7 @@
Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved.
Copyright (c) 2008, Google Inc.
-Copyright (c) 2014, 2020, MariaDB Corporation.
+Copyright (c) 2014, 2021, MariaDB Corporation.
Portions of this file contain modifications contributed and copyrighted by
Google, Inc. Those modifications are gratefully acknowledged and are described
@@ -37,42 +37,6 @@ Created 11/5/1995 Heikki Tuuri
#include "buf0rea.h"
#include "fsp0types.h"
-/*********************************************************************//**
-Gets the current size of buffer buf_pool in bytes.
-@return size in bytes */
-UNIV_INLINE
-ulint
-buf_pool_get_curr_size(void)
-/*========================*/
-{
- return(srv_buf_pool_curr_size);
-}
-
-/********************************************************************//**
-Reads the freed_page_clock of a buffer block.
-@return freed_page_clock */
-UNIV_INLINE
-unsigned
-buf_page_get_freed_page_clock(
-/*==========================*/
- const buf_page_t* bpage) /*!< in: block */
-{
- /* This is sometimes read without holding buf_pool.mutex. */
- return(bpage->freed_page_clock);
-}
-
-/********************************************************************//**
-Reads the freed_page_clock of a buffer block.
-@return freed_page_clock */
-UNIV_INLINE
-unsigned
-buf_block_get_freed_page_clock(
-/*===========================*/
- const buf_block_t* block) /*!< in: block */
-{
- return(buf_page_get_freed_page_clock(&block->page));
-}
-
/** Determine if a block is still close enough to the MRU end of the LRU list
meaning that it is not in danger of getting evicted and also implying
that it has been accessed recently.
@@ -122,67 +86,6 @@ inline bool buf_page_peek_if_too_old(const buf_page_t *bpage)
}
}
-#ifdef UNIV_DEBUG
-/*********************************************************************//**
-Gets a pointer to the memory frame of a block.
-@return pointer to the frame */
-UNIV_INLINE
-buf_frame_t*
-buf_block_get_frame(
-/*================*/
- const buf_block_t* block) /*!< in: pointer to the control block */
-{
- if (!block) {
- return NULL;
- }
-
- switch (block->page.state()) {
- case BUF_BLOCK_ZIP_PAGE:
- case BUF_BLOCK_NOT_USED:
- ut_error;
- break;
- case BUF_BLOCK_FILE_PAGE:
- ut_a(block->page.buf_fix_count());
- /* fall through */
- case BUF_BLOCK_MEMORY:
- case BUF_BLOCK_REMOVE_HASH:
- goto ok;
- }
- ut_error;
-ok:
- return((buf_frame_t*) block->frame);
-}
-#endif /* UNIV_DEBUG */
-
-/********************************************************************//**
-Allocates a buf_page_t descriptor. This function must succeed. In case
-of failure we assert in this function.
-@return: the allocated descriptor. */
-UNIV_INLINE
-buf_page_t*
-buf_page_alloc_descriptor(void)
-/*===========================*/
-{
- buf_page_t* bpage;
-
- bpage = (buf_page_t*) ut_zalloc_nokey(sizeof *bpage);
- ut_ad(bpage);
- MEM_UNDEFINED(bpage, sizeof *bpage);
-
- return(bpage);
-}
-
-/********************************************************************//**
-Free a buf_page_t descriptor. */
-UNIV_INLINE
-void
-buf_page_free_descriptor(
-/*=====================*/
- buf_page_t* bpage) /*!< in: bpage descriptor to free. */
-{
- ut_free(bpage);
-}
-
/** Allocate a buffer block.
@return own: the allocated block, in state BUF_BLOCK_MEMORY */
inline buf_block_t *buf_block_alloc()
@@ -214,18 +117,11 @@ buf_block_modify_clock_inc(
buf_block_t* block) /*!< in: block */
{
#ifdef SAFE_MUTEX
- /* No latch is acquired for the shared temporary tablespace. */
- ut_ad(fsp_is_system_temporary(block->page.id().space())
- || (mysql_mutex_is_owner(&buf_pool.mutex)
- && !block->page.buf_fix_count())
- || rw_lock_own_flagged(&block->lock,
- RW_LOCK_FLAG_X | RW_LOCK_FLAG_SX));
+ ut_ad((mysql_mutex_is_owner(&buf_pool.mutex)
+ && !block->page.buf_fix_count())
+ || block->page.lock.have_u_or_x());
#else /* SAFE_MUTEX */
- /* No latch is acquired for the shared temporary tablespace. */
- ut_ad(fsp_is_system_temporary(block->page.id().space())
- || !block->page.buf_fix_count()
- || rw_lock_own_flagged(&block->lock,
- RW_LOCK_FLAG_X | RW_LOCK_FLAG_SX));
+ ut_ad(!block->page.buf_fix_count() || block->page.lock.have_u_or_x());
#endif /* SAFE_MUTEX */
assert_block_ahi_valid(block);
@@ -242,162 +138,7 @@ buf_block_get_modify_clock(
/*=======================*/
buf_block_t* block) /*!< in: block */
{
-#ifdef UNIV_DEBUG
- /* No latch is acquired for the shared temporary tablespace. */
- if (!fsp_is_system_temporary(block->page.id().space())) {
- ut_ad(rw_lock_own(&(block->lock), RW_LOCK_S)
- || rw_lock_own(&(block->lock), RW_LOCK_X)
- || rw_lock_own(&(block->lock), RW_LOCK_SX));
- }
-#endif /* UNIV_DEBUG */
-
+ ut_ad(block->page.lock.have_any());
return(block->modify_clock);
}
-/*******************************************************************//**
-Increments the bufferfix count. */
-UNIV_INLINE
-void
-buf_block_buf_fix_inc_func(
-/*=======================*/
-#ifdef UNIV_DEBUG
- const char* file, /*!< in: file name */
- unsigned line, /*!< in: line */
-#endif /* UNIV_DEBUG */
- buf_block_t* block) /*!< in/out: block to bufferfix */
-{
-#ifdef UNIV_DEBUG
- /* No debug latch is acquired if block belongs to system temporary.
- Debug latch is not of much help if access to block is single
- threaded. */
- if (!fsp_is_system_temporary(block->page.id().space())) {
- ibool ret;
- ret = rw_lock_s_lock_nowait(block->debug_latch, file, line);
- ut_a(ret);
- }
-#endif /* UNIV_DEBUG */
-
- block->fix();
-}
-
-/*******************************************************************//**
-Decrements the bufferfix count. */
-UNIV_INLINE
-void
-buf_block_buf_fix_dec(
-/*==================*/
- buf_block_t* block) /*!< in/out: block to bufferunfix */
-{
-#ifdef UNIV_DEBUG
- /* No debug latch is acquired if block belongs to system temporary.
- Debug latch is not of much help if access to block is single
- threaded. */
- if (!fsp_is_system_temporary(block->page.id().space())) {
- rw_lock_s_unlock(block->debug_latch);
- }
-#endif /* UNIV_DEBUG */
-
- block->unfix();
-}
-
-/********************************************************************//**
-Releases a compressed-only page acquired with buf_page_get_zip(). */
-UNIV_INLINE
-void
-buf_page_release_zip(
-/*=================*/
- buf_page_t* bpage) /*!< in: buffer block */
-{
- ut_ad(bpage);
- ut_a(bpage->buf_fix_count());
-
- switch (bpage->state()) {
- case BUF_BLOCK_FILE_PAGE:
-#ifdef UNIV_DEBUG
- {
- /* No debug latch is acquired if block belongs to system
- temporary. Debug latch is not of much help if access to block
- is single threaded. */
- buf_block_t* block = reinterpret_cast<buf_block_t*>(bpage);
- if (!fsp_is_system_temporary(block->page.id().space())) {
- rw_lock_s_unlock(block->debug_latch);
- }
- }
-#endif /* UNIV_DEBUG */
- /* Fall through */
- case BUF_BLOCK_ZIP_PAGE:
- reinterpret_cast<buf_block_t*>(bpage)->unfix();
- return;
-
- case BUF_BLOCK_NOT_USED:
- case BUF_BLOCK_MEMORY:
- case BUF_BLOCK_REMOVE_HASH:
- break;
- }
-
- ut_error;
-}
-
-/********************************************************************//**
-Releases a latch, if specified. */
-UNIV_INLINE
-void
-buf_page_release_latch(
-/*===================*/
- buf_block_t* block, /*!< in: buffer block */
- ulint rw_latch) /*!< in: RW_S_LATCH, RW_X_LATCH,
- RW_NO_LATCH */
-{
-#ifdef UNIV_DEBUG
- /* No debug latch is acquired if block belongs to system
- temporary. Debug latch is not of much help if access to block
- is single threaded. */
- if (!fsp_is_system_temporary(block->page.id().space())) {
- rw_lock_s_unlock(block->debug_latch);
- }
-#endif /* UNIV_DEBUG */
-
- if (rw_latch == RW_S_LATCH) {
- rw_lock_s_unlock(&block->lock);
- } else if (rw_latch == RW_SX_LATCH) {
- rw_lock_sx_unlock(&block->lock);
- } else if (rw_latch == RW_X_LATCH) {
- rw_lock_x_unlock(&block->lock);
- }
-}
-
-#ifdef UNIV_DEBUG
-/*********************************************************************//**
-Adds latch level info for the rw-lock protecting the buffer frame. This
-should be called in the debug version after a successful latching of a
-page if we know the latching order level of the acquired latch. */
-UNIV_INLINE
-void
-buf_block_dbg_add_level(
-/*====================*/
- buf_block_t* block, /*!< in: buffer page
- where we have acquired latch */
- latch_level_t level) /*!< in: latching order level */
-{
- sync_check_lock(&block->lock, level);
-}
-#endif /* UNIV_DEBUG */
-
-/********************************************************************//**
-Get buf frame. */
-UNIV_INLINE
-void *
-buf_page_get_frame(
-/*===============*/
- const buf_page_t* bpage) /*!< in: buffer pool page */
-{
- /* In encryption/compression buffer pool page may contain extra
- buffer where result is stored. */
- if (bpage->slot && bpage->slot->out_buf) {
- return bpage->slot->out_buf;
- } else if (bpage->zip.data) {
- return bpage->zip.data;
- } else {
- return ((buf_block_t*) bpage)->frame;
- }
-}
diff --git a/storage/innobase/include/buf0checksum.h b/storage/innobase/include/buf0checksum.h
index 8dc25f91d59..d9f03177812 100644
--- a/storage/innobase/include/buf0checksum.h
+++ b/storage/innobase/include/buf0checksum.h
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2017, 2019, MariaDB Corporation.
+Copyright (c) 2017, 2021, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -24,9 +24,7 @@ Buffer pool checksum functions, also linked from /extra/innochecksum.cc
Created Aug 11, 2011 Vasil Dimov
*******************************************************/
-#ifndef buf0checksum_h
-#define buf0checksum_h
-
+#pragma once
#include "buf0types.h"
/** Calculate the CRC32 checksum of a page. The value is stored to the page
@@ -37,6 +35,7 @@ architectures.
@return CRC-32C */
uint32_t buf_calc_page_crc32(const byte* page);
+#ifndef UNIV_INNOCHECKSUM
/** Calculate a checksum which is stored to the page when it is written
to a file. Note that we must be careful to calculate the same value on
32-bit and 64-bit architectures.
@@ -55,13 +54,4 @@ because this takes that field as an input!
@return checksum */
uint32_t
buf_calc_page_old_checksum(const byte* page);
-
-/** Return a printable string describing the checksum algorithm.
-@param[in] algo algorithm
-@return algorithm name */
-const char*
-buf_checksum_algorithm_name(srv_checksum_algorithm_t algo);
-
-extern ulong srv_checksum_algorithm;
-
-#endif /* buf0checksum_h */
+#endif /* !UNIV_INNOCHECKSUM */
diff --git a/storage/innobase/include/buf0dblwr.h b/storage/innobase/include/buf0dblwr.h
index fb9df55504c..92b840d2f4c 100644
--- a/storage/innobase/include/buf0dblwr.h
+++ b/storage/innobase/include/buf0dblwr.h
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 1995, 2017, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2017, 2020, MariaDB Corporation.
+Copyright (c) 2017, 2022, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -54,9 +54,9 @@ class buf_dblwr_t
};
/** the page number of the first doublewrite block (block_size() pages) */
- page_id_t block1= page_id_t(0, 0);
+ page_id_t block1{0, 0};
/** the page number of the second doublewrite block (block_size() pages) */
- page_id_t block2= page_id_t(0, 0);
+ page_id_t block2{0, 0};
/** mutex protecting the data members below */
mysql_mutex_t mutex;
@@ -74,9 +74,9 @@ class buf_dblwr_t
ulint pages_written;
slot slots[2];
- slot *active_slot= &slots[0];
+ slot *active_slot;
- /** Initialize the doublewrite buffer data structure.
+ /** Initialise the persistent storage of the doublewrite buffer.
@param header doublewrite page header in the TRX_SYS page */
inline void init(const byte *header);
@@ -84,6 +84,8 @@ class buf_dblwr_t
bool flush_buffered_writes(const ulint size);
public:
+ /** Initialise the doublewrite buffer data structures. */
+ void init();
/** Create or restore the doublewrite buffer in the TRX_SYS page.
@return whether the operation succeeded */
bool create();
@@ -137,14 +139,14 @@ public:
@param size payload size in bytes */
void add_to_batch(const IORequest &request, size_t size);
- /** Determine whether the doublewrite buffer is initialized */
- bool is_initialised() const
+ /** Determine whether the doublewrite buffer has been created */
+ bool is_created() const
{ return UNIV_LIKELY(block1 != page_id_t(0, 0)); }
/** @return whether a page identifier is part of the doublewrite buffer */
bool is_inside(const page_id_t id) const
{
- if (!is_initialised())
+ if (!is_created())
return false;
ut_ad(block1 < block2);
if (id < block1)
@@ -156,13 +158,10 @@ public:
/** Wait for flush_buffered_writes() to be fully completed */
void wait_flush_buffered_writes()
{
- if (is_initialised())
- {
- mysql_mutex_lock(&mutex);
- while (batch_running)
- my_cond_wait(&cond, &mutex.m_mutex);
- mysql_mutex_unlock(&mutex);
- }
+ mysql_mutex_lock(&mutex);
+ while (batch_running)
+ my_cond_wait(&cond, &mutex.m_mutex);
+ mysql_mutex_unlock(&mutex);
}
};
diff --git a/storage/innobase/include/buf0flu.h b/storage/innobase/include/buf0flu.h
index c772f84147d..13a9363922b 100644
--- a/storage/innobase/include/buf0flu.h
+++ b/storage/innobase/include/buf0flu.h
@@ -24,21 +24,20 @@ The database buffer pool flush algorithm
Created 11/5/1995 Heikki Tuuri
*******************************************************/
-#ifndef buf0flu_h
-#define buf0flu_h
+#pragma once
#include "ut0byte.h"
#include "log0log.h"
-#include "buf0types.h"
+#include "buf0buf.h"
-/** Number of pages flushed. Protected by buf_pool.mutex. */
-extern ulint buf_flush_page_count;
/** Number of pages flushed via LRU. Protected by buf_pool.mutex.
-Also included in buf_flush_page_count. */
+Also included in buf_pool.stat.n_pages_written. */
extern ulint buf_lru_flush_page_count;
+/** Number of pages freed without flushing. Protected by buf_pool.mutex. */
+extern ulint buf_lru_freed_page_count;
/** Flag indicating if the page_cleaner is in active state. */
-extern bool buf_page_cleaner_is_active;
+extern Atomic_relaxed<bool> buf_page_cleaner_is_active;
/** Remove all dirty pages belonging to a given tablespace when we are
deleting the data file of that tablespace.
@@ -85,15 +84,18 @@ buf_flush_init_for_writing(
bool buf_flush_list_space(fil_space_t *space, ulint *n_flushed= nullptr)
MY_ATTRIBUTE((warn_unused_result));
-/** Write out dirty blocks from buf_pool.LRU.
+/** Write out dirty blocks from buf_pool.LRU,
+and move clean blocks to buf_pool.free.
+The caller must invoke buf_dblwr.flush_buffered_writes()
+after releasing buf_pool.mutex.
@param max_n wished maximum mumber of blocks flushed
-@return the number of processed pages
+@param evict whether to evict pages after flushing
+@return evict ? number of processed pages : number of pages written
@retval 0 if a buf_pool.LRU batch is already running */
-ulint buf_flush_LRU(ulint max_n);
+ulint buf_flush_LRU(ulint max_n, bool evict);
-/** Wait until a flush batch ends.
-@param lru true=buf_pool.LRU; false=buf_pool.flush_list */
-void buf_flush_wait_batch_end(bool lru);
+/** Wait until a LRU flush batch ends. */
+void buf_flush_wait_LRU_batch_end();
/** Wait until all persistent pages are flushed up to a limit.
@param sync_lsn buf_pool.get_oldest_modification(LSN_MAX) to wait for */
ATTRIBUTE_COLD void buf_flush_wait_flushed(lsn_t sync_lsn);
@@ -106,22 +108,30 @@ ATTRIBUTE_COLD void buf_flush_ahead(lsn_t lsn, bool furious);
This function should be called at a mini-transaction commit, if a page was
modified in it. Puts the block to the list of modified blocks, if it not
already in it. */
-UNIV_INLINE
-void
-buf_flush_note_modification(
-/*========================*/
- buf_block_t* block, /*!< in: block which is modified */
- lsn_t start_lsn, /*!< in: start lsn of the first mtr in a
- set of mtr's */
- lsn_t end_lsn); /*!< in: end lsn of the last mtr in the
- set of mtr's */
+inline void buf_flush_note_modification(buf_block_t *b, lsn_t start, lsn_t end)
+{
+ ut_ad(!srv_read_only_mode);
+ ut_d(const auto s= b->page.state());
+ ut_ad(s > buf_page_t::FREED);
+ ut_ad(s < buf_page_t::READ_FIX);
+ ut_ad(mach_read_from_8(b->page.frame + FIL_PAGE_LSN) <= end);
+ mach_write_to_8(b->page.frame + FIL_PAGE_LSN, end);
+ if (UNIV_LIKELY_NULL(b->page.zip.data))
+ memcpy_aligned<8>(FIL_PAGE_LSN + b->page.zip.data,
+ FIL_PAGE_LSN + b->page.frame, 8);
+
+ const lsn_t oldest_modification= b->page.oldest_modification();
+
+ if (oldest_modification > 1)
+ ut_ad(oldest_modification <= start);
+ else
+ buf_pool.insert_into_flush_list(b, start);
+ srv_stats.buf_pool_write_requests.inc();
+}
/** Initialize page_cleaner. */
ATTRIBUTE_COLD void buf_flush_page_cleaner_init();
-/** Wait for pending flushes to complete. */
-void buf_flush_wait_batch_end_acquiring_mutex(bool lru);
-
/** Flush the buffer pool on shutdown. */
ATTRIBUTE_COLD void buf_flush_buffer_pool();
@@ -137,7 +147,3 @@ void buf_flush_sync_batch(lsn_t lsn);
/** Synchronously flush dirty blocks.
NOTE: The calling thread is not allowed to hold any buffer page latches! */
void buf_flush_sync();
-
-#include "buf0flu.inl"
-
-#endif
diff --git a/storage/innobase/include/buf0lru.h b/storage/innobase/include/buf0lru.h
index 540c14a49c9..aec08e77f54 100644
--- a/storage/innobase/include/buf0lru.h
+++ b/storage/innobase/include/buf0lru.h
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2017, 2020, MariaDB Corporation.
+Copyright (c) 2017, 2021, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -24,11 +24,10 @@ The database buffer pool LRU replacement algorithm
Created 11/5/1995 Heikki Tuuri
*******************************************************/
-#ifndef buf0lru_h
-#define buf0lru_h
+#pragma once
-#include "ut0byte.h"
#include "buf0types.h"
+#include "hash0hash.h"
// Forward declaration
struct trx_t;
@@ -132,14 +131,6 @@ policy at the end of each interval. */
void
buf_LRU_stat_update();
-/** Remove one page from LRU list and put it to free list.
-@param bpage file page to be freed
-@param id page identifier
-@param hash_lock buf_pool.page_hash latch (will be released here) */
-void buf_LRU_free_one_page(buf_page_t *bpage, const page_id_t id,
- page_hash_latch *hash_lock)
- MY_ATTRIBUTE((nonnull));
-
#ifdef UNIV_DEBUG
/** Validate the LRU list. */
void buf_LRU_validate();
@@ -200,5 +191,3 @@ Increments the I/O counter in buf_LRU_stat_cur. */
/********************************************************************//**
Increments the page_zip_decompress() counter in buf_LRU_stat_cur. */
#define buf_LRU_stat_inc_unzip() buf_LRU_stat_cur.unzip++
-
-#endif
diff --git a/storage/innobase/include/buf0rea.h b/storage/innobase/include/buf0rea.h
index 8d6b28194dc..d898c5efc63 100644
--- a/storage/innobase/include/buf0rea.h
+++ b/storage/innobase/include/buf0rea.h
@@ -33,10 +33,11 @@ Created 11/5/1995 Heikki Tuuri
buffer buf_pool if it is not already there. Sets the io_fix flag and sets
an exclusive lock on the buffer frame. The flag is cleared and the x-lock
released by the i/o-handler thread.
-@param[in] page_id page id
-@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
-@retval DB_SUCCESS if the page was read and is not corrupted,
-@retval DB_PAGE_CORRUPTED if page based on checksum check is corrupted,
+@param page_id page id
+@param zip_size ROW_FORMAT=COMPRESSED page size, or 0
+@retval DB_SUCCESS if the page was read and is not corrupted
+@retval DB_SUCCESS_LOCKED_REC if the page was not read
+@retval DB_PAGE_CORRUPTED if page based on checksum check is corrupted
@retval DB_DECRYPTION_FAILED if page post encryption checksum matches but
after decryption normal page checksum does not match.
@retval DB_TABLESPACE_DELETED if tablespace .ibd file is missing */
diff --git a/storage/innobase/include/buf0types.h b/storage/innobase/include/buf0types.h
index 327e2e2498e..c69c07d66e1 100644
--- a/storage/innobase/include/buf0types.h
+++ b/storage/innobase/include/buf0types.h
@@ -39,57 +39,29 @@ struct buf_buddy_stat_t;
/** A buffer frame. @see page_t */
typedef byte buf_frame_t;
-/** Flags for io_fix types */
-enum buf_io_fix {
- BUF_IO_NONE = 0, /**< no pending I/O */
- BUF_IO_READ, /**< read pending */
- BUF_IO_WRITE, /**< write pending */
- BUF_IO_PIN /**< disallow relocation of
- block and its removal of from
- the flush_list */
-};
-
/** Alternatives for srv_checksum_algorithm, which can be changed by
setting innodb_checksum_algorithm */
enum srv_checksum_algorithm_t {
- SRV_CHECKSUM_ALGORITHM_CRC32, /*!< Write crc32, allow crc32,
- innodb or none when reading */
- SRV_CHECKSUM_ALGORITHM_STRICT_CRC32, /*!< Write crc32, allow crc32
- when reading */
- SRV_CHECKSUM_ALGORITHM_INNODB, /*!< Write innodb, allow crc32,
- innodb or none when reading */
- SRV_CHECKSUM_ALGORITHM_STRICT_INNODB, /*!< Write innodb, allow
- innodb when reading */
- SRV_CHECKSUM_ALGORITHM_NONE, /*!< Write none, allow crc32,
- innodb or none when reading */
- SRV_CHECKSUM_ALGORITHM_STRICT_NONE, /*!< Write none, allow none
- when reading */
-
- /** For new files, always compute CRC-32C for the whole page.
- For old files, allow crc32, innodb or none when reading. */
- SRV_CHECKSUM_ALGORITHM_FULL_CRC32,
-
- /** For new files, always compute CRC-32C for the whole page.
- For old files, allow crc32 when reading. */
- SRV_CHECKSUM_ALGORITHM_STRICT_FULL_CRC32
+ /** Write crc32; allow full_crc32,crc32,innodb,none when reading */
+ SRV_CHECKSUM_ALGORITHM_CRC32,
+  /** Write crc32; allow full_crc32,crc32 when reading */
+ SRV_CHECKSUM_ALGORITHM_STRICT_CRC32,
+ /** For new files, always compute CRC-32C for the whole page.
+ For old files, allow crc32, innodb or none when reading. */
+ SRV_CHECKSUM_ALGORITHM_FULL_CRC32,
+ /** For new files, always compute CRC-32C for the whole page.
+ For old files, allow crc32 when reading. */
+ SRV_CHECKSUM_ALGORITHM_STRICT_FULL_CRC32
};
-inline
-bool
-is_checksum_strict(srv_checksum_algorithm_t algo)
+inline bool is_checksum_strict(srv_checksum_algorithm_t algo)
{
- return(algo == SRV_CHECKSUM_ALGORITHM_STRICT_CRC32
- || algo == SRV_CHECKSUM_ALGORITHM_STRICT_INNODB
- || algo == SRV_CHECKSUM_ALGORITHM_STRICT_NONE);
+ return algo == SRV_CHECKSUM_ALGORITHM_STRICT_CRC32;
}
-inline
-bool
-is_checksum_strict(ulint algo)
+inline bool is_checksum_strict(ulint algo)
{
- return(algo == SRV_CHECKSUM_ALGORITHM_STRICT_CRC32
- || algo == SRV_CHECKSUM_ALGORITHM_STRICT_INNODB
- || algo == SRV_CHECKSUM_ALGORITHM_STRICT_NONE);
+ return algo == SRV_CHECKSUM_ALGORITHM_STRICT_CRC32;
}
/** Parameters of binary buddy system for compressed pages (buf0buddy.h) */
@@ -176,6 +148,12 @@ public:
constexpr ulonglong raw() const { return m_id; }
+ /** Flag the page identifier as corrupted. */
+ void set_corrupted() { m_id= ~0ULL; }
+
+ /** @return whether the page identifier belongs to a corrupted page */
+ constexpr bool is_corrupted() const { return m_id == ~0ULL; }
+
private:
/** The page identifier */
uint64_t m_id;
@@ -189,39 +167,69 @@ extern const byte *field_ref_zero;
#ifndef UNIV_INNOCHECKSUM
-#include "ut0mutex.h"
-#include "sync0rw.h"
-#include "rw_lock.h"
+/** Latch types */
+enum rw_lock_type_t
+{
+ RW_S_LATCH= 1 << 0,
+ RW_X_LATCH= 1 << 1,
+ RW_SX_LATCH= 1 << 2,
+ RW_NO_LATCH= 1 << 3
+};
-class page_hash_latch : public rw_lock
+#include "sux_lock.h"
+
+#ifdef SUX_LOCK_GENERIC
+class page_hash_latch : private rw_lock
{
-public:
/** Wait for a shared lock */
void read_lock_wait();
/** Wait for an exclusive lock */
void write_lock_wait();
-
+public:
/** Acquire a shared lock */
- inline void read_lock();
+ inline void lock_shared();
/** Acquire an exclusive lock */
- inline void write_lock();
+ inline void lock();
- /** Acquire a lock */
- template<bool exclusive> void acquire()
- {
- if (exclusive)
- write_lock();
- else
- read_lock();
- }
- /** Release a lock */
- template<bool exclusive> void release()
- {
- if (exclusive)
- write_unlock();
- else
- read_unlock();
- }
+ /** @return whether an exclusive lock is being held by any thread */
+ bool is_write_locked() const { return rw_lock::is_write_locked(); }
+
+ /** @return whether any lock is being held by any thread */
+ bool is_locked() const { return rw_lock::is_locked(); }
+ /** @return whether any lock is being held or waited for by any thread */
+ bool is_locked_or_waiting() const { return rw_lock::is_locked_or_waiting(); }
+
+ /** Release a shared lock */
+ void unlock_shared() { read_unlock(); }
+ /** Release an exclusive lock */
+ void unlock() { write_unlock(); }
+};
+#elif defined _WIN32 || SIZEOF_SIZE_T >= 8
+class page_hash_latch
+{
+ srw_spin_lock_low lk;
+public:
+ void lock_shared() { lk.rd_lock(); }
+ void unlock_shared() { lk.rd_unlock(); }
+ void lock() { lk.wr_lock(); }
+ void unlock() { lk.wr_unlock(); }
+ bool is_write_locked() const { return lk.is_write_locked(); }
+ bool is_locked() const { return lk.is_locked(); }
+ bool is_locked_or_waiting() const { return lk.is_locked_or_waiting(); }
+};
+#else
+class page_hash_latch
+{
+ srw_spin_mutex lk;
+public:
+ void lock_shared() { lock(); }
+ void unlock_shared() { unlock(); }
+ void lock() { lk.wr_lock(); }
+ void unlock() { lk.wr_unlock(); }
+ bool is_locked() const { return lk.is_locked(); }
+ bool is_write_locked() const { return is_locked(); }
+ bool is_locked_or_waiting() const { return is_locked(); }
};
+#endif
#endif /* !UNIV_INNOCHECKSUM */
diff --git a/storage/innobase/include/data0data.h b/storage/innobase/include/data0data.h
index c2b8c3e00b6..5eaad5bf552 100644
--- a/storage/innobase/include/data0data.h
+++ b/storage/innobase/include/data0data.h
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2017, 2019, 2020 MariaDB Corporation.
+Copyright (c) 2017, 2022, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -581,6 +581,10 @@ struct dtuple_t {
/** @return whether this is a hidden metadata record
for instant ADD COLUMN or ALTER TABLE */
bool is_metadata() const { return is_metadata(info_bits); }
+
+ /** Copy type information from index fields.
+ @param index index field to be copied */
+ inline void copy_field_types(const dict_index_t &index);
};
inline ulint dtuple_get_n_fields(const dtuple_t* tuple)
diff --git a/storage/innobase/include/data0type.h b/storage/innobase/include/data0type.h
index 9528443e7a8..3d63ddb767c 100644
--- a/storage/innobase/include/data0type.h
+++ b/storage/innobase/include/data0type.h
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2017, 2020, MariaDB Corporation.
+Copyright (c) 2017, 2022, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -24,9 +24,7 @@ Data types
Created 1/16/1996 Heikki Tuuri
*******************************************************/
-#ifndef data0type_h
-#define data0type_h
-
+#pragma once
#include "univ.i"
/** Special length indicating a missing instantly added column */
@@ -196,9 +194,6 @@ constexpr uint8_t DATA_MBR_LEN= uint8_t(SPDIMS * 2 * sizeof(double));
/** system-versioned user data column */
#define DATA_VERSIONED (DATA_VERS_START|DATA_VERS_END)
-/** Check whether locking is disabled (never). */
-#define dict_table_is_locking_disabled(table) false
-
/*-------------------------------------------*/
/* This many bytes we need to store the type information affecting the
@@ -325,7 +320,6 @@ dtype_get_prtype(
/*********************************************************************//**
Compute the mbminlen and mbmaxlen members of a data type structure. */
-UNIV_INLINE
void
dtype_get_mblen(
/*============*/
@@ -480,19 +474,6 @@ dtype_new_read_for_order_and_null_size(
const byte* buf); /*!< in: buffer for stored type order info */
/*********************************************************************//**
-Returns the type's SQL name (e.g. BIGINT UNSIGNED) from mtype,prtype,len
-@return the SQL type name */
-UNIV_INLINE
-char*
-dtype_sql_name(
-/*===========*/
- unsigned mtype, /*!< in: mtype */
- unsigned prtype, /*!< in: prtype */
- unsigned len, /*!< in: len */
- char* name, /*!< out: SQL name */
- unsigned name_sz);/*!< in: size of the name buffer */
-
-/*********************************************************************//**
Validates a data type structure.
@return TRUE if ok */
ibool
@@ -507,6 +488,8 @@ dtype_print(
const dtype_t* type);
#endif /* UNIV_DEBUG */
+struct dict_col_t;
+
/* Structure for an SQL data type.
If you add fields to this structure, be sure to initialize them everywhere.
This structure is initialized in the following functions:
@@ -562,6 +545,10 @@ struct dtype_t{
mbminlen = 0;
mbmaxlen = 0;
}
+
+ /** Copy the type information from a column.
+ @param col column type to be copied */
+ void assign(const dict_col_t &col);
};
/** The DB_TRX_ID,DB_ROLL_PTR values for "no history is available" */
@@ -602,5 +589,3 @@ static const byte REC_INFO_METADATA_ALTER
= REC_INFO_METADATA_ADD | REC_INFO_DELETED_FLAG;
#include "data0type.inl"
-
-#endif
diff --git a/storage/innobase/include/data0type.inl b/storage/innobase/include/data0type.inl
index b81b68e69e9..329cee5d190 100644
--- a/storage/innobase/include/data0type.inl
+++ b/storage/innobase/include/data0type.inl
@@ -68,30 +68,6 @@ dtype_get_mysql_type(
Compute the mbminlen and mbmaxlen members of a data type structure. */
UNIV_INLINE
void
-dtype_get_mblen(
-/*============*/
- ulint mtype, /*!< in: main type */
- ulint prtype, /*!< in: precise type (and collation) */
- unsigned*mbminlen, /*!< out: minimum length of a
- multi-byte character */
- unsigned*mbmaxlen) /*!< out: maximum length of a
- multi-byte character */
-{
- if (dtype_is_string_type(mtype)) {
- innobase_get_cset_width(dtype_get_charset_coll(prtype),
- mbminlen, mbmaxlen);
- ut_ad(*mbminlen <= *mbmaxlen);
- ut_ad(*mbminlen < DATA_MBMAX);
- ut_ad(*mbmaxlen < DATA_MBMAX);
- } else {
- *mbminlen = *mbmaxlen = 0;
- }
-}
-
-/*********************************************************************//**
-Compute the mbminlen and mbmaxlen members of a data type structure. */
-UNIV_INLINE
-void
dtype_set_mblen(
/*============*/
dtype_t* type) /*!< in/out: type */
@@ -327,103 +303,6 @@ dtype_new_read_for_order_and_null_size(
dtype_set_mblen(type);
}
-/*********************************************************************//**
-Returns the type's SQL name (e.g. BIGINT UNSIGNED) from mtype,prtype,len
-@return the SQL type name */
-UNIV_INLINE
-char*
-dtype_sql_name(
-/*===========*/
- unsigned mtype, /*!< in: mtype */
- unsigned prtype, /*!< in: prtype */
- unsigned len, /*!< in: len */
- char* name, /*!< out: SQL name */
- unsigned name_sz)/*!< in: size of the name buffer */
-{
-
-#define APPEND_UNSIGNED() \
- do { \
- if (prtype & DATA_UNSIGNED) { \
- snprintf(name + strlen(name), \
- name_sz - strlen(name), \
- " UNSIGNED"); \
- } \
- } while (0)
-
- snprintf(name, name_sz, "UNKNOWN");
-
- switch (mtype) {
- case DATA_INT:
- switch (len) {
- case 1:
- snprintf(name, name_sz, "TINYINT");
- break;
- case 2:
- snprintf(name, name_sz, "SMALLINT");
- break;
- case 3:
- snprintf(name, name_sz, "MEDIUMINT");
- break;
- case 4:
- snprintf(name, name_sz, "INT");
- break;
- case 8:
- snprintf(name, name_sz, "BIGINT");
- break;
- }
- APPEND_UNSIGNED();
- break;
- case DATA_FLOAT:
- snprintf(name, name_sz, "FLOAT");
- APPEND_UNSIGNED();
- break;
- case DATA_DOUBLE:
- snprintf(name, name_sz, "DOUBLE");
- APPEND_UNSIGNED();
- break;
- case DATA_FIXBINARY:
- snprintf(name, name_sz, "BINARY(%u)", len);
- break;
- case DATA_CHAR:
- case DATA_MYSQL:
- snprintf(name, name_sz, "CHAR(%u)", len);
- break;
- case DATA_VARCHAR:
- case DATA_VARMYSQL:
- snprintf(name, name_sz, "VARCHAR(%u)", len);
- break;
- case DATA_BINARY:
- snprintf(name, name_sz, "VARBINARY(%u)", len);
- break;
- case DATA_GEOMETRY:
- snprintf(name, name_sz, "GEOMETRY");
- break;
- case DATA_BLOB:
- switch (len) {
- case 9:
- snprintf(name, name_sz, "TINYBLOB");
- break;
- case 10:
- snprintf(name, name_sz, "BLOB");
- break;
- case 11:
- snprintf(name, name_sz, "MEDIUMBLOB");
- break;
- case 12:
- snprintf(name, name_sz, "LONGBLOB");
- break;
- }
- }
-
- if (prtype & DATA_NOT_NULL) {
- snprintf(name + strlen(name),
- name_sz - strlen(name),
- " NOT NULL");
- }
-
- return(name);
-}
-
/***********************************************************************//**
Returns the size of a fixed size data type, 0 if not a fixed size type.
@return fixed size, or 0 */
@@ -471,16 +350,6 @@ dtype_get_fixed_size_low(
} else if (!comp) {
return static_cast<unsigned>(len);
} else {
-#ifdef UNIV_DEBUG
- unsigned i_mbminlen, i_mbmaxlen;
-
- innobase_get_cset_width(
- dtype_get_charset_coll(prtype),
- &i_mbminlen, &i_mbmaxlen);
-
- ut_ad(i_mbminlen == mbminlen);
- ut_ad(i_mbmaxlen == mbmaxlen);
-#endif /* UNIV_DEBUG */
if (mbminlen == mbmaxlen) {
return static_cast<unsigned>(len);
}
diff --git a/storage/innobase/include/db0err.h b/storage/innobase/include/db0err.h
index 51d116d5ede..64182aabc38 100644
--- a/storage/innobase/include/db0err.h
+++ b/storage/innobase/include/db0err.h
@@ -49,9 +49,6 @@ enum dberr_t {
rollback segment */
DB_CLUSTER_NOT_FOUND = 30,
DB_TABLE_NOT_FOUND,
- DB_MUST_GET_MORE_FILE_SPACE, /*!< the database has to be stopped
- and restarted with more file space */
- DB_TABLE_IS_BEING_USED,
DB_TOO_BIG_RECORD, /*!< a record in an index would not fit
on a compressed page, or it would
become bigger than 1/2 free space in
@@ -121,8 +118,6 @@ enum dberr_t {
DB_READ_ONLY, /*!< Update operation attempted in
a read-only transaction */
DB_FTS_INVALID_DOCID, /* FTS Doc ID cannot be zero */
- DB_TABLE_IN_FK_CHECK, /* table is being used in foreign
- key check */
DB_ONLINE_LOG_TOO_BIG, /*!< Modification log grew too big
during online index creation */
diff --git a/storage/innobase/include/dict0boot.h b/storage/innobase/include/dict0boot.h
index 186fd30f89f..3e14e0ace69 100644
--- a/storage/innobase/include/dict0boot.h
+++ b/storage/innobase/include/dict0boot.h
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2018, 2020, MariaDB Corporation.
+Copyright (c) 2018, 2022, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -33,8 +33,6 @@ Created 4/18/1996 Heikki Tuuri
#include "buf0buf.h"
#include "dict0dict.h"
-/** @return the DICT_HDR block, x-latched */
-buf_block_t *dict_hdr_get(mtr_t* mtr);
/**********************************************************************//**
Returns a new table, index, or space id. */
void
@@ -46,27 +44,39 @@ dict_hdr_get_new_id(
(not assigned if NULL) */
ulint* space_id); /*!< out: space id
(not assigned if NULL) */
-/**********************************************************************//**
-Writes the current value of the row id counter to the dictionary header file
-page. */
-void
-dict_hdr_flush_row_id(void);
-/*=======================*/
-/**********************************************************************//**
-Returns a new row id.
-@return the new id */
-UNIV_INLINE
-row_id_t
-dict_sys_get_new_row_id(void);
-/*=========================*/
+/** Update dict_sys.row_id in the dictionary header file page. */
+void dict_hdr_flush_row_id(row_id_t id);
+/** @return A new value for GEN_CLUST_INDEX(DB_ROW_ID) */
+inline row_id_t dict_sys_t::get_new_row_id()
+{
+ row_id_t id= row_id.fetch_add(1);
+ if (!(id % ROW_ID_WRITE_MARGIN))
+ dict_hdr_flush_row_id(id);
+ return id;
+}
+
+/** Ensure that row_id is not smaller than id, on IMPORT TABLESPACE */
+inline void dict_sys_t::update_row_id(row_id_t id)
+{
+ row_id_t sys_id= row_id;
+ while (id >= sys_id)
+ {
+ if (!row_id.compare_exchange_strong(sys_id, id))
+ continue;
+ if (!(id % ROW_ID_WRITE_MARGIN))
+ dict_hdr_flush_row_id(id);
+ break;
+ }
+}
+
/**********************************************************************//**
Writes a row id to a record or other 6-byte stored form. */
-UNIV_INLINE
-void
-dict_sys_write_row_id(
-/*==================*/
- byte* field, /*!< in: record field */
- row_id_t row_id);/*!< in: row id */
+inline void dict_sys_write_row_id(byte *field, row_id_t row_id)
+{
+ static_assert(DATA_ROW_ID_LEN == 6, "compatibility");
+ mach_write_to_6(field, row_id);
+}
+
/*****************************************************************//**
Initializes the data dictionary memory structures when the database is
started. This function is also called when the data dictionary is created.
@@ -87,12 +97,7 @@ dict_create(void)
/*********************************************************************//**
Check if a table id belongs to system table.
@return true if the table id belongs to a system table. */
-UNIV_INLINE
-bool
-dict_is_sys_table(
-/*==============*/
- table_id_t id) /*!< in: table id to check */
- MY_ATTRIBUTE((warn_unused_result));
+inline bool dict_is_sys_table(table_id_t id) { return id < DICT_HDR_FIRST_ID; }
/* Space id and page no where the dictionary header resides */
#define DICT_HDR_SPACE 0 /* the SYSTEM tablespace */
@@ -267,37 +272,6 @@ enum dict_fld_sys_foreign_cols_enum {
DICT_FLD__SYS_FOREIGN_COLS__REF_COL_NAME = 5,
DICT_NUM_FIELDS__SYS_FOREIGN_COLS = 6
};
-/* The columns in SYS_TABLESPACES */
-enum dict_col_sys_tablespaces_enum {
- DICT_COL__SYS_TABLESPACES__SPACE = 0,
- DICT_COL__SYS_TABLESPACES__NAME = 1,
- DICT_COL__SYS_TABLESPACES__FLAGS = 2,
- DICT_NUM_COLS__SYS_TABLESPACES = 3
-};
-/* The field numbers in the SYS_TABLESPACES clustered index */
-enum dict_fld_sys_tablespaces_enum {
- DICT_FLD__SYS_TABLESPACES__SPACE = 0,
- DICT_FLD__SYS_TABLESPACES__DB_TRX_ID = 1,
- DICT_FLD__SYS_TABLESPACES__DB_ROLL_PTR = 2,
- DICT_FLD__SYS_TABLESPACES__NAME = 3,
- DICT_FLD__SYS_TABLESPACES__FLAGS = 4,
- DICT_NUM_FIELDS__SYS_TABLESPACES = 5
-};
-/* The columns in SYS_DATAFILES */
-enum dict_col_sys_datafiles_enum {
- DICT_COL__SYS_DATAFILES__SPACE = 0,
- DICT_COL__SYS_DATAFILES__PATH = 1,
- DICT_NUM_COLS__SYS_DATAFILES = 2
-};
-/* The field numbers in the SYS_DATAFILES clustered index */
-enum dict_fld_sys_datafiles_enum {
- DICT_FLD__SYS_DATAFILES__SPACE = 0,
- DICT_FLD__SYS_DATAFILES__DB_TRX_ID = 1,
- DICT_FLD__SYS_DATAFILES__DB_ROLL_PTR = 2,
- DICT_FLD__SYS_DATAFILES__PATH = 3,
- DICT_NUM_FIELDS__SYS_DATAFILES = 4
-};
-
/* The columns in SYS_VIRTUAL */
enum dict_col_sys_virtual_enum {
DICT_COL__SYS_VIRTUAL__TABLE_ID = 0,
@@ -320,11 +294,4 @@ length of those fields. */
#define DICT_FLD_LEN_SPACE 4
#define DICT_FLD_LEN_FLAGS 4
-/* When a row id which is zero modulo this number (which must be a power of
-two) is assigned, the field DICT_HDR_ROW_ID on the dictionary header page is
-updated */
-#define DICT_HDR_ROW_ID_WRITE_MARGIN 256
-
-#include "dict0boot.inl"
-
#endif
diff --git a/storage/innobase/include/dict0crea.h b/storage/innobase/include/dict0crea.h
index 50f7f34a8e8..c40df12babe 100644
--- a/storage/innobase/include/dict0crea.h
+++ b/storage/innobase/include/dict0crea.h
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2017, 2020, MariaDB Corporation.
+Copyright (c) 2017, 2021, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -41,14 +41,14 @@ tab_create_graph_create(
/*====================*/
dict_table_t* table, /*!< in: table to create, built as
a memory data structure */
- mem_heap_t* heap, /*!< in: heap where created */
- fil_encryption_t mode, /*!< in: encryption mode */
- uint32_t key_id); /*!< in: encryption key_id */
+ mem_heap_t* heap); /*!< in: heap where created */
/** Creates an index create graph.
@param[in] index index to create, built as a memory data structure
@param[in] table table name
@param[in,out] heap heap where created
+@param[in] mode encryption mode (for creating a table)
+@param[in] key_id encryption key identifier (for creating a table)
@param[in] add_v new virtual columns added in the same clause with
add index
@return own: index create node */
@@ -57,6 +57,8 @@ ind_create_graph_create(
dict_index_t* index,
const char* table,
mem_heap_t* heap,
+ fil_encryption_t mode,
+ uint32_t key_id,
const dict_add_v_col_t* add_v = NULL);
/***********************************************************//**
@@ -99,29 +101,22 @@ dict_create_index_tree(
/** Drop the index tree associated with a row in SYS_INDEXES table.
@param[in,out] pcur persistent cursor on rec
@param[in,out] trx dictionary transaction
-@param[in,out] mtr mini-transaction */
-void dict_drop_index_tree(btr_pcur_t* pcur, trx_t* trx, mtr_t* mtr)
- MY_ATTRIBUTE((nonnull));
+@param[in,out] mtr mini-transaction
+@return tablespace ID to drop (if this is the clustered index)
+@retval 0 if no tablespace is to be dropped */
+uint32_t dict_drop_index_tree(btr_pcur_t *pcur, trx_t *trx, mtr_t *mtr)
+ MY_ATTRIBUTE((nonnull(1,3), warn_unused_result));
/***************************************************************//**
Creates an index tree for the index if it is not a member of a cluster.
Don't update SYSTEM TABLES.
-@return DB_SUCCESS or DB_OUT_OF_FILE_SPACE */
+@return error code */
dberr_t
dict_create_index_tree_in_mem(
/*==========================*/
dict_index_t* index, /*!< in/out: index */
const trx_t* trx); /*!< in: InnoDB transaction handle */
-/****************************************************************//**
-Creates the foreign key constraints system tables inside InnoDB
-at server bootstrap or server start if they are not found or are
-not of the right form.
-@return DB_SUCCESS or error code */
-dberr_t
-dict_create_or_check_foreign_constraint_tables(void);
-/*================================================*/
-
/********************************************************************//**
Generate a foreign key constraint name when it was not named by the user.
A generated constraint has a name of the format dbname/tablename_ibfk_NUMBER,
@@ -167,37 +162,6 @@ dict_foreigns_has_s_base_col(
const dict_foreign_set& local_fk_set,
const dict_table_t* table);
-/****************************************************************//**
-Creates the tablespaces and datafiles system tables inside InnoDB
-at server bootstrap or server start if they are not found or are
-not of the right form.
-@return DB_SUCCESS or error code */
-dberr_t
-dict_create_or_check_sys_tablespace(void);
-/*=====================================*/
-/** Creates the virtual column system tables inside InnoDB
-at server bootstrap or server start if they are not found or are
-not of the right form.
-@return DB_SUCCESS or error code */
-dberr_t
-dict_create_or_check_sys_virtual();
-
-/** Put a tablespace definition into the data dictionary,
-replacing what was there previously.
-@param[in] space Tablespace id
-@param[in] name Tablespace name
-@param[in] flags Tablespace flags
-@param[in] path Tablespace path
-@param[in] trx Transaction
-@return error code or DB_SUCCESS */
-dberr_t
-dict_replace_tablespace_in_dictionary(
- ulint space_id,
- const char* name,
- ulint flags,
- const char* path,
- trx_t* trx);
-
/********************************************************************//**
Add a foreign key definition to the data dictionary tables.
@return error code or DB_SUCCESS */
@@ -209,16 +173,6 @@ dict_create_add_foreign_to_dictionary(
trx_t* trx) /*!< in/out: dictionary transaction */
MY_ATTRIBUTE((nonnull, warn_unused_result));
-/********************************************************************//**
-Construct foreign key constraint defintion from data dictionary information.
-*/
-UNIV_INTERN
-char*
-dict_foreign_def_get(
-/*=================*/
- dict_foreign_t* foreign,/*!< in: foreign */
- trx_t* trx); /*!< in: trx */
-
/* Table create node structure */
struct tab_node_t{
que_common_t common; /*!< node type: QUE_NODE_TABLE_CREATE */
@@ -240,8 +194,6 @@ struct tab_node_t{
/* Local storage for this graph node */
ulint state; /*!< node execution state */
ulint col_no; /*!< next column definition to insert */
- uint key_id; /*!< encryption key_id */
- fil_encryption_t mode; /*!< encryption mode */
ulint base_col_no; /*!< next base column to insert */
mem_heap_t* heap; /*!< memory heap used as auxiliary
storage */
@@ -273,11 +225,12 @@ struct ind_node_t{
/* Local storage for this graph node */
ulint state; /*!< node execution state */
uint32_t page_no; /* root page number of the index */
- dict_table_t* table; /*!< table which owns the index */
dtuple_t* ind_row; /* index definition row built */
ulint field_no; /* next field definition to insert */
mem_heap_t* heap; /*!< memory heap used as auxiliary
storage */
+ uint key_id; /*!< encryption key_id */
+ fil_encryption_t mode; /*!< encryption mode */
const dict_add_v_col_t*
add_v; /*!< new virtual columns that being
added along with an add index call */
diff --git a/storage/innobase/include/dict0defrag_bg.h b/storage/innobase/include/dict0defrag_bg.h
index 3aea41b0bb8..679484ad64e 100644
--- a/storage/innobase/include/dict0defrag_bg.h
+++ b/storage/innobase/include/dict0defrag_bg.h
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 2016, 2020, MariaDB Corporation.
+Copyright (c) 2016, 2021, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -80,21 +80,16 @@ dict_stats_defrag_pool_del(
all entries for the table */
const dict_index_t* index); /*!< in: index to remove */
-/*****************************************************************//**
+/**
Get the first index that has been added for updating persistent defrag
stats and eventually save its stats. */
-void
-dict_defrag_process_entries_from_defrag_pool();
-/*===========================================*/
+void dict_defrag_process_entries_from_defrag_pool(THD *thd);
/*********************************************************************//**
Save defragmentation result.
@return DB_SUCCESS or error code */
-dberr_t
-dict_stats_save_defrag_summary(
-/*============================*/
- dict_index_t* index) /*!< in: index */
- MY_ATTRIBUTE((warn_unused_result));
+dberr_t dict_stats_save_defrag_summary(dict_index_t *index, THD *thd)
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
/*********************************************************************//**
Save defragmentation stats for a given index.
diff --git a/storage/innobase/include/dict0dict.h b/storage/innobase/include/dict0dict.h
index 65b88a65185..e54a138cc02 100644
--- a/storage/innobase/include/dict0dict.h
+++ b/storage/innobase/include/dict0dict.h
@@ -31,11 +31,11 @@ Created 1/8/1996 Heikki Tuuri
#include "data0data.h"
#include "dict0mem.h"
#include "fsp0fsp.h"
+#include "srw_lock.h"
+#include <my_sys.h>
#include <deque>
class MDL_ticket;
-extern bool innodb_table_stats_not_found;
-extern bool innodb_index_stats_not_found;
/** the first table or index ID for other than hard-coded system tables */
constexpr uint8_t DICT_HDR_FIRST_ID= 10;
@@ -132,7 +132,7 @@ enum dict_table_op_t {
@param[in] table_op operation to perform when opening
@return table object after locking MDL shared
@retval NULL if the table is not readable, or if trylock && MDL blocked */
-template<bool trylock>
+template<bool trylock, bool purge_thd= false>
dict_table_t*
dict_acquire_mdl_shared(dict_table_t *table,
THD *thd,
@@ -140,7 +140,6 @@ dict_acquire_mdl_shared(dict_table_t *table,
dict_table_op_t table_op= DICT_TABLE_OP_NORMAL);
/** Look up a table by numeric identifier.
-@tparam purge_thd Whether the function is called by purge thread
@param[in] table_id table identifier
@param[in] dict_locked data dictionary locked
@param[in] table_op operation to perform when opening
@@ -154,11 +153,12 @@ dict_table_open_on_id(table_id_t table_id, bool dict_locked,
MDL_ticket **mdl= nullptr)
MY_ATTRIBUTE((warn_unused_result));
+/** Decrement the count of open handles */
+void dict_table_close(dict_table_t *table);
+
/** Decrements the count of open handles of a table.
@param[in,out] table table
-@param[in] dict_locked data dictionary locked
-@param[in] try_drop try to drop any orphan indexes after
- an aborted online index creation
+@param[in] dict_locked whether dict_sys.latch is being held
@param[in] thd thread to release MDL
@param[in] mdl metadata lock or NULL if the thread is a
foreground one. */
@@ -166,22 +166,10 @@ void
dict_table_close(
dict_table_t* table,
bool dict_locked,
- bool try_drop,
THD* thd = NULL,
MDL_ticket* mdl = NULL);
/*********************************************************************//**
-Closes the only open handle to a table and drops a table while assuring
-that dict_sys.mutex is held the whole time. This assures that the table
-is not evicted after the close when the count of open handles goes to zero.
-Because dict_sys.mutex is held, we do not need to call prevent_eviction(). */
-void
-dict_table_close_and_drop(
-/*======================*/
- trx_t* trx, /*!< in: data dictionary transaction */
- dict_table_t* table); /*!< in/out: table */
-
-/*********************************************************************//**
Gets the minimum number of bytes per character.
@return minimum multi-byte char size, in bytes */
UNIV_INLINE
@@ -381,12 +369,8 @@ dberr_t
dict_table_rename_in_cache(
/*=======================*/
dict_table_t* table, /*!< in/out: table */
- const char* new_name, /*!< in: new name */
- bool rename_also_foreigns,
- /*!< in: in ALTER TABLE we want
- to preserve the original table name
- in constraints which reference it */
- bool replace_new_file = false)
+ span<const char> new_name, /*!< in: new name */
+ bool replace_new_file)
/*!< in: whether to replace the
file with the new name
(as part of rolling back TRUNCATE) */
@@ -437,14 +421,6 @@ dict_foreign_add_to_cache(
dict_err_ignore_t ignore_err)
/*!< in: error to be ignored */
MY_ATTRIBUTE((nonnull(1), warn_unused_result));
-/*********************************************************************//**
-Checks if a table is referenced by foreign keys.
-@return TRUE if table is referenced by a foreign key */
-ibool
-dict_table_is_referenced_by_foreign_key(
-/*====================================*/
- const dict_table_t* table) /*!< in: InnoDB table */
- MY_ATTRIBUTE((nonnull, warn_unused_result));
/**********************************************************************//**
Replace the index passed in with another equivalent index in the
foreign key lists of the table.
@@ -480,16 +456,14 @@ NOTE! This is a high-level function to be used mainly from outside the
'dict' directory. Inside this directory dict_table_get_low
is usually the appropriate function.
@param[in] table_name Table name
-@param[in] dict_locked TRUE=data dictionary locked
-@param[in] try_drop TRUE=try to drop any orphan indexes after
- an aborted online index creation
+@param[in] dict_locked whether dict_sys.latch is being held exclusively
@param[in] ignore_err error to be ignored when loading the table
-@return table, NULL if does not exist */
+@return table
+@retval nullptr if does not exist */
dict_table_t*
dict_table_open_on_name(
const char* table_name,
- ibool dict_locked,
- ibool try_drop,
+ bool dict_locked,
dict_err_ignore_t ignore_err)
MY_ATTRIBUTE((warn_unused_result));
@@ -656,19 +630,6 @@ dict_table_get_next_index(
# define dict_table_get_next_index(index) UT_LIST_GET_NEXT(indexes, index)
#endif /* UNIV_DEBUG */
-/* Skip corrupted index */
-#define dict_table_skip_corrupt_index(index) \
- while (index && index->is_corrupted()) { \
- index = dict_table_get_next_index(index); \
- }
-
-/* Get the next non-corrupt index */
-#define dict_table_next_uncorrupted_index(index) \
-do { \
- index = dict_table_get_next_index(index); \
- dict_table_skip_corrupt_index(index); \
-} while (0)
-
#define dict_index_is_clust(index) (index)->is_clust()
#define dict_index_is_auto_gen_clust(index) (index)->is_gen_clust()
#define dict_index_is_unique(index) (index)->is_unique()
@@ -946,17 +907,6 @@ dict_table_copy_types(
dtuple_t* tuple, /*!< in/out: data tuple */
const dict_table_t* table) /*!< in: table */
MY_ATTRIBUTE((nonnull));
-/**********************************************************************//**
-Make room in the table cache by evicting an unused table. The unused table
-should not be part of FK relationship and currently not used in any user
-transaction. There is no guarantee that it will remove a table.
-@return number of tables evicted. */
-ulint
-dict_make_room_in_cache(
-/*====================*/
- ulint max_tables, /*!< in: max tables allowed in cache */
- ulint pct_check); /*!< in: max percent to check */
-
/** Adds an index to the dictionary cache, with possible indexing newly
added column.
@param[in,out] index index; NOTE! The index memory
@@ -1159,7 +1109,6 @@ dict_field_get_col(
/**********************************************************************//**
Returns an index object if it is found in the dictionary cache.
-Assumes that dict_sys.mutex is already being held.
@return index, NULL if not found */
dict_index_t*
dict_index_get_if_in_cache_low(
@@ -1246,15 +1195,6 @@ dict_index_get_page(
/*================*/
const dict_index_t* tree) /*!< in: index */
MY_ATTRIBUTE((nonnull, warn_unused_result));
-/*********************************************************************//**
-Gets the read-write lock of the index tree.
-@return read-write lock */
-UNIV_INLINE
-rw_lock_t*
-dict_index_get_lock(
-/*================*/
- const dict_index_t* index) /*!< in: index */
- MY_ATTRIBUTE((nonnull, warn_unused_result));
/********************************************************************//**
Returns free space reserved for future updates of records. This is
relevant only in the case of many consecutive inserts, as updates
@@ -1306,9 +1246,6 @@ dict_index_calc_min_rec_len(
const dict_index_t* index) /*!< in: index */
MY_ATTRIBUTE((nonnull, warn_unused_result));
-#define dict_mutex_enter_for_mysql() mutex_enter(&dict_sys.mutex)
-#define dict_mutex_exit_for_mysql() mutex_exit(&dict_sys.mutex)
-
/********************************************************************//**
Checks if the database name in two table names is the same.
@return TRUE if same db name */
@@ -1372,105 +1309,134 @@ constraint */
/* Buffers for storing detailed information about the latest foreign key
and unique key errors */
extern FILE* dict_foreign_err_file;
-extern ib_mutex_t dict_foreign_err_mutex; /* mutex protecting the
- foreign key error messages */
+extern mysql_mutex_t dict_foreign_err_mutex;
/** InnoDB data dictionary cache */
class dict_sys_t
{
+ /** The my_hrtime_coarse().val of the oldest lock_wait() start, or 0 */
+ std::atomic<ulonglong> latch_ex_wait_start;
+
+ /** the rw-latch protecting the data dictionary cache */
+ alignas(CPU_LEVEL1_DCACHE_LINESIZE) srw_lock latch;
+#ifdef UNIV_DEBUG
+ /** whether latch is being held in exclusive mode (by any thread) */
+ Atomic_relaxed<pthread_t> latch_ex;
+ /** number of S-latch holders */
+ Atomic_counter<uint32_t> latch_readers;
+#endif
public:
- DictSysMutex mutex; /*!< mutex protecting the data
- dictionary; protects also the
- disk-based dictionary system tables;
- this mutex serializes CREATE TABLE
- and DROP TABLE, as well as reading
- the dictionary data for a table from
- system tables */
- /** @brief the data dictionary rw-latch protecting dict_sys
-
- Table create, drop, etc. reserve this in X-mode; implicit or
- backround operations purge, rollback, foreign key checks reserve this
- in S-mode; not all internal InnoDB operations are covered by MDL.
-
- This latch also prevents lock waits when accessing the InnoDB
- data dictionary tables. @see trx_t::dict_operation_lock_mode */
- rw_lock_t latch;
- row_id_t row_id; /*!< the next row id to assign;
- NOTE that at a checkpoint this
- must be written to the dict system
- header and flushed to a file; in
- recovery this must be derived from
- the log records */
- hash_table_t table_hash; /*!< hash table of the tables, based
- on name */
- /** hash table of persistent table IDs */
- hash_table_t table_id_hash;
- dict_table_t* sys_tables; /*!< SYS_TABLES table */
- dict_table_t* sys_columns; /*!< SYS_COLUMNS table */
- dict_table_t* sys_indexes; /*!< SYS_INDEXES table */
- dict_table_t* sys_fields; /*!< SYS_FIELDS table */
- dict_table_t* sys_virtual; /*!< SYS_VIRTUAL table */
-
- /*=============================*/
- UT_LIST_BASE_NODE_T(dict_table_t)
- table_LRU; /*!< List of tables that can be evicted
- from the cache */
- UT_LIST_BASE_NODE_T(dict_table_t)
- table_non_LRU; /*!< List of tables that can't be
- evicted from the cache */
+ /** Indexes of SYS_TABLE[] */
+ enum
+ {
+ SYS_TABLES= 0,
+ SYS_INDEXES,
+ SYS_COLUMNS,
+ SYS_FIELDS,
+ SYS_FOREIGN,
+ SYS_FOREIGN_COLS,
+ SYS_VIRTUAL
+ };
+ /** System table names */
+ static const span<const char> SYS_TABLE[];
+
+ /** all tables (persistent and temporary), hashed by name */
+ hash_table_t table_hash;
+ /** hash table of persistent table IDs */
+ hash_table_t table_id_hash;
+
+ /** the SYS_TABLES table */
+ dict_table_t *sys_tables;
+ /** the SYS_COLUMNS table */
+ dict_table_t *sys_columns;
+ /** the SYS_INDEXES table */
+ dict_table_t *sys_indexes;
+ /** the SYS_FIELDS table */
+ dict_table_t *sys_fields;
+ /** the SYS_FOREIGN table */
+ dict_table_t *sys_foreign;
+ /** the SYS_FOREIGN_COLS table */
+ dict_table_t *sys_foreign_cols;
+ /** the SYS_VIRTUAL table */
+ dict_table_t *sys_virtual;
+
+ /** @return whether all non-hard-coded system tables exist */
+ bool sys_tables_exist() const
+ { return UNIV_LIKELY(sys_foreign && sys_foreign_cols && sys_virtual); }
+
+ /** list of persistent tables that can be evicted */
+ UT_LIST_BASE_NODE_T(dict_table_t) table_LRU;
+ /** list of persistent tables that cannot be evicted */
+ UT_LIST_BASE_NODE_T(dict_table_t) table_non_LRU;
+
private:
- bool m_initialised;
- /** the sequence of temporary table IDs */
- std::atomic<table_id_t> temp_table_id;
- /** hash table of temporary table IDs */
- hash_table_t temp_id_hash;
+ bool m_initialised= false;
+ /** the sequence of temporary table IDs */
+ std::atomic<table_id_t> temp_table_id{DICT_HDR_FIRST_ID};
+ /** hash table of temporary table IDs */
+ hash_table_t temp_id_hash;
+ /** the next value of DB_ROW_ID, backed by DICT_HDR_ROW_ID
+ (FIXME: remove this, and move to dict_table_t) */
+ Atomic_relaxed<row_id_t> row_id;
+ /** The synchronization interval of row_id */
+ static constexpr size_t ROW_ID_WRITE_MARGIN= 256;
public:
- /** @return a new temporary table ID */
- table_id_t get_temporary_table_id() {
- return temp_table_id.fetch_add(1, std::memory_order_relaxed);
- }
+ /** Diagnostic message for exceeding the lock_wait() timeout */
+ static const char fatal_msg[];
- /** Look up a temporary table.
- @param id temporary table ID
- @return temporary table
- @retval NULL if the table does not exist
- (should only happen during the rollback of CREATE...SELECT) */
- dict_table_t* get_temporary_table(table_id_t id)
- {
- ut_ad(mutex_own(&mutex));
- dict_table_t* table;
- ulint fold = ut_fold_ull(id);
- HASH_SEARCH(id_hash, &temp_id_hash, fold, dict_table_t*, table,
- ut_ad(table->cached), table->id == id);
- if (UNIV_LIKELY(table != NULL)) {
- DBUG_ASSERT(table->is_temporary());
- DBUG_ASSERT(table->id >= DICT_HDR_FIRST_ID);
- table->acquire();
- }
- return table;
- }
+ /** @return A new value for GEN_CLUST_INDEX(DB_ROW_ID) */
+ inline row_id_t get_new_row_id();
- /** Look up a persistent table.
- @param id table ID
- @return table
- @retval NULL if not cached */
- dict_table_t* get_table(table_id_t id)
- {
- ut_ad(mutex_own(&mutex));
- dict_table_t* table;
- ulint fold = ut_fold_ull(id);
- HASH_SEARCH(id_hash, &table_id_hash, fold, dict_table_t*,
- table,
- ut_ad(table->cached), table->id == id);
- DBUG_ASSERT(!table || !table->is_temporary());
- return table;
- }
+ /** Ensure that row_id is not smaller than id, on IMPORT TABLESPACE */
+ inline void update_row_id(row_id_t id);
- /**
- Constructor. Further initialisation happens in create().
- */
+ /** Recover the global DB_ROW_ID sequence on database startup */
+ void recover_row_id(row_id_t id)
+ {
+ row_id= ut_uint64_align_up(id, ROW_ID_WRITE_MARGIN) + ROW_ID_WRITE_MARGIN;
+ }
- dict_sys_t() : m_initialised(false), temp_table_id(DICT_HDR_FIRST_ID) {}
+ /** @return a new temporary table ID */
+ table_id_t acquire_temporary_table_id()
+ {
+ return temp_table_id.fetch_add(1, std::memory_order_relaxed);
+ }
+
+ /** Look up a temporary table.
+ @param id temporary table ID
+ @return temporary table
+ @retval nullptr if the table does not exist
+ (should only happen during the rollback of CREATE...SELECT) */
+ dict_table_t *acquire_temporary_table(table_id_t id)
+ {
+ ut_ad(frozen());
+ dict_table_t *table;
+ ulint fold = ut_fold_ull(id);
+ HASH_SEARCH(id_hash, &temp_id_hash, fold, dict_table_t*, table,
+ ut_ad(table->cached), table->id == id);
+ if (UNIV_LIKELY(table != nullptr))
+ {
+ DBUG_ASSERT(table->is_temporary());
+ DBUG_ASSERT(table->id >= DICT_HDR_FIRST_ID);
+ table->acquire();
+ }
+ return table;
+ }
+
+ /** Look up a persistent table.
+ @param id table ID
+ @return table
+ @retval nullptr if not cached */
+ dict_table_t *find_table(table_id_t id)
+ {
+ ut_ad(frozen());
+ dict_table_t *table;
+ ulint fold= ut_fold_ull(id);
+ HASH_SEARCH(id_hash, &table_id_hash, fold, dict_table_t*, table,
+ ut_ad(table->cached), table->id == id);
+ DBUG_ASSERT(!table || !table->is_temporary());
+ return table;
+ }
bool is_initialised() const { return m_initialised; }
@@ -1493,14 +1459,13 @@ public:
#ifdef UNIV_DEBUG
/** Find a table */
- template <bool in_lru> bool find(dict_table_t* table)
+ template <bool in_lru> bool find(const dict_table_t *table)
{
ut_ad(table);
ut_ad(table->can_be_evicted == in_lru);
- ut_ad(mutex_own(&mutex));
- for (const dict_table_t* t = UT_LIST_GET_FIRST(in_lru
- ? table_LRU : table_non_LRU);
- t; t = UT_LIST_GET_NEXT(table_LRU, t))
+ ut_ad(frozen());
+ for (const dict_table_t* t= in_lru ? table_LRU.start : table_non_LRU.start;
+ t; t = UT_LIST_GET_NEXT(table_LRU, t))
{
if (t == table) return true;
ut_ad(t->can_be_evicted == in_lru);
@@ -1508,128 +1473,146 @@ public:
return false;
}
/** Find a table */
- bool find(dict_table_t* table)
+ bool find(const dict_table_t *table)
{
return table->can_be_evicted ? find<true>(table) : find<false>(table);
}
#endif
/** Move a table to the non-LRU list from the LRU list. */
- void prevent_eviction(dict_table_t* table)
+ void prevent_eviction(dict_table_t *table)
{
+ ut_d(locked());
ut_ad(find(table));
- if (table->can_be_evicted)
- {
- table->can_be_evicted = FALSE;
- UT_LIST_REMOVE(table_LRU, table);
- UT_LIST_ADD_LAST(table_non_LRU, table);
- }
+ if (!table->can_be_evicted)
+ return;
+ table->can_be_evicted= false;
+ UT_LIST_REMOVE(table_LRU, table);
+ UT_LIST_ADD_LAST(table_non_LRU, table);
}
- /** Acquire a reference to a cached table. */
- inline void acquire(dict_table_t* table);
#ifdef UNIV_DEBUG
- /** Assert that the data dictionary is locked */
- void assert_locked()
- {
- ut_ad(mutex_own(&mutex));
- ut_ad(rw_lock_own(&latch, RW_LOCK_X));
- }
+ /** @return whether any thread (not necessarily the current thread)
+ is holding the latch; that is, this check may return false
+ positives */
+ bool frozen() const { return latch_readers || latch_ex; }
+ /** @return whether any thread (not necessarily the current thread)
+ is holding a shared latch */
+ bool frozen_not_locked() const { return latch_readers; }
+ /** @return whether the current thread holds the exclusive latch */
+ bool locked() const { return latch_ex == pthread_self(); }
#endif
- /** Lock the data dictionary cache. */
- void lock(const char* file, unsigned line)
+private:
+ /** Acquire the exclusive latch */
+ ATTRIBUTE_NOINLINE
+ void lock_wait(SRW_LOCK_ARGS(const char *file, unsigned line));
+public:
+ /** @return the my_hrtime_coarse().val of the oldest lock_wait() start,
+ assuming that requests are served on a FIFO basis */
+ ulonglong oldest_wait() const
+ { return latch_ex_wait_start.load(std::memory_order_relaxed); }
+
+ /** Exclusively lock the dictionary cache. */
+ void lock(SRW_LOCK_ARGS(const char *file, unsigned line))
{
- rw_lock_x_lock_func(&latch, 0, file, line);
- mutex_enter_loc(&mutex, file, line);
+ if (latch.wr_lock_try())
+ {
+ ut_ad(!latch_readers);
+ ut_ad(!latch_ex);
+ ut_d(latch_ex= pthread_self());
+ }
+ else
+ lock_wait(SRW_LOCK_ARGS(file, line));
}
+#ifdef UNIV_PFS_RWLOCK
+ /** Unlock the data dictionary cache. */
+ ATTRIBUTE_NOINLINE void unlock();
+ /** Acquire a shared lock on the dictionary cache. */
+ ATTRIBUTE_NOINLINE void freeze(const char *file, unsigned line);
+ /** Release a shared lock on the dictionary cache. */
+ ATTRIBUTE_NOINLINE void unfreeze();
+#else
/** Unlock the data dictionary cache. */
void unlock()
{
- mutex_exit(&mutex);
- rw_lock_x_unlock(&latch);
+ ut_ad(latch_ex == pthread_self());
+ ut_ad(!latch_readers);
+ ut_d(latch_ex= 0);
+ latch.wr_unlock();
}
+ /** Acquire a shared lock on the dictionary cache. */
+ void freeze()
+ {
+ latch.rd_lock();
+ ut_ad(!latch_ex);
+ ut_d(latch_readers++);
+ }
+ /** Release a shared lock on the dictionary cache. */
+ void unfreeze()
+ {
+ ut_ad(!latch_ex);
+ ut_ad(latch_readers--);
+ latch.rd_unlock();
+ }
+#endif
/** Estimate the used memory occupied by the data dictionary
table and index objects.
@return number of bytes occupied */
- ulint rough_size() const
+ TPOOL_SUPPRESS_TSAN ulint rough_size() const
{
- /* No mutex; this is a very crude approximation anyway */
+ /* No latch; this is a very crude approximation anyway */
ulint size = UT_LIST_GET_LEN(table_LRU) + UT_LIST_GET_LEN(table_non_LRU);
size *= sizeof(dict_table_t)
+ sizeof(dict_index_t) * 2
+ (sizeof(dict_col_t) + sizeof(dict_field_t)) * 10
+ sizeof(dict_field_t) * 5 /* total number of key fields */
+ 200; /* arbitrary, covering names and overhead */
- size += (table_hash.n_cells + table_id_hash.n_cells
- + temp_id_hash.n_cells) * sizeof(hash_cell_t);
+ size += (table_hash.n_cells + table_id_hash.n_cells +
+ temp_id_hash.n_cells) * sizeof(hash_cell_t);
return size;
}
-};
-/** the data dictionary cache */
-extern dict_sys_t dict_sys;
+ /** Evict unused, unlocked tables from table_LRU.
+ @param half whether to consider half the tables only (instead of all)
+ @return number of tables evicted */
+ ulint evict_table_LRU(bool half);
-#define dict_table_prevent_eviction(table) dict_sys.prevent_eviction(table)
-#define dict_sys_lock() dict_sys.lock(__FILE__, __LINE__)
-#define dict_sys_unlock() dict_sys.unlock()
-
-/* Auxiliary structs for checking a table definition @{ */
-
-/* This struct is used to specify the name and type that a column must
-have when checking a table's schema. */
-struct dict_col_meta_t {
- const char* name; /* column name */
- ulint mtype; /* required column main type */
- ulint prtype_mask; /* required column precise type mask;
- if this is non-zero then all the
- bits it has set must also be set
- in the column's prtype */
- ulint len; /* required column length */
-};
+ /** Look up a table in the dictionary cache.
+ @param name table name
+ @return table handle
+ @retval nullptr if not found */
+ dict_table_t *find_table(const span<const char> &name) const
+ {
+ ut_ad(frozen());
+ for (dict_table_t *table= static_cast<dict_table_t*>
+ (HASH_GET_FIRST(&table_hash, table_hash.calc_hash
+ (my_crc32c(0, name.data(), name.size()))));
+ table; table= table->name_hash)
+ if (strlen(table->name.m_name) == name.size() &&
+ !memcmp(table->name.m_name, name.data(), name.size()))
+ return table;
+ return nullptr;
+ }
-/* This struct is used for checking whether a given table exists and
-whether it has a predefined schema (number of columns and column names
-and types) */
-struct dict_table_schema_t {
- const char* table_name; /* the name of the table whose
- structure we are checking */
- ulint n_cols; /* the number of columns the
- table must have */
- dict_col_meta_t* columns; /* metadata for the columns;
- this array has n_cols
- elements */
- ulint n_foreign; /* number of foreign keys this
- table has, pointing to other
- tables (where this table is
- FK child) */
- ulint n_referenced; /* number of foreign keys other
- tables have, pointing to this
- table (where this table is
- parent) */
+ /** Look up or load a table definition
+ @param name table name
+ @param ignore errors to ignore when loading the table definition
+ @return table handle
+ @retval nullptr if not found */
+ dict_table_t *load_table(const span<const char> &name,
+ dict_err_ignore_t ignore= DICT_ERR_IGNORE_NONE);
+
+ /** Attempt to load the system tables on startup
+ @return whether any discrepancy with the expected definition was found */
+ bool load_sys_tables();
+ /** Create or check system tables on startup */
+ dberr_t create_or_check_sys_tables();
};
-/* @} */
-/*********************************************************************//**
-Checks whether a table exists and whether it has the given structure.
-The table must have the same number of columns with the same names and
-types. The order of the columns does not matter.
-The caller must own the dictionary mutex.
-dict_table_schema_check() @{
-@return DB_SUCCESS if the table exists and contains the necessary columns */
-dberr_t
-dict_table_schema_check(
-/*====================*/
- dict_table_schema_t* req_schema, /*!< in/out: required table
- schema */
- char* errstr, /*!< out: human readable error
- message if != DB_SUCCESS and
- != DB_TABLE_NOT_FOUND is
- returned */
- size_t errstr_sz) /*!< in: errstr size */
- MY_ATTRIBUTE((nonnull, warn_unused_result));
-/* @} */
+/** the data dictionary cache */
+extern dict_sys_t dict_sys;
/*********************************************************************//**
Converts a database and table name from filesystem encoding
@@ -1647,43 +1630,12 @@ dict_fs2utf8(
size_t table_utf8_size)/*!< in: table_utf8 size */
MY_ATTRIBUTE((nonnull));
-/**********************************************************************//**
-Check whether the table is corrupted.
-@return nonzero for corrupted table, zero for valid tables */
-UNIV_INLINE
-ulint
-dict_table_is_corrupted(
-/*====================*/
- const dict_table_t* table) /*!< in: table */
- MY_ATTRIBUTE((nonnull, warn_unused_result));
-
-/**********************************************************************//**
-Flags an index and table corrupted both in the data dictionary cache
-and in the system table SYS_INDEXES. */
-void
-dict_set_corrupted(
-/*===============*/
- dict_index_t* index, /*!< in/out: index */
- trx_t* trx, /*!< in/out: transaction */
- const char* ctx) /*!< in: context */
- ATTRIBUTE_COLD __attribute__((nonnull));
-
-/** Flags an index corrupted in the data dictionary cache only. This
-is used mostly to mark a corrupted index when index's own dictionary
-is corrupted, and we force to load such index for repair purpose
-@param[in,out] index index that is corrupted */
-void
-dict_set_corrupted_index_cache_only(
- dict_index_t* index);
-
-/**********************************************************************//**
-Flags a table with specified space_id corrupted in the table dictionary
-cache.
-@return TRUE if successful */
-bool dict_set_corrupted_by_space(const fil_space_t* space);
-
-/** Flag a table encrypted in the data dictionary cache. */
-void dict_set_encrypted_by_space(const fil_space_t* space);
+/** Flag an index corrupted both in the data dictionary cache
+and in the system table SYS_INDEXES.
+@param index index to be flagged as corrupted
+@param ctx context (for error log reporting) */
+void dict_set_corrupted(dict_index_t *index, const char *ctx)
+ ATTRIBUTE_COLD __attribute__((nonnull));
/** Sets merge_threshold in the SYS_INDEXES
@param[in,out] index index
diff --git a/storage/innobase/include/dict0dict.inl b/storage/innobase/include/dict0dict.inl
index eda639ba7c1..a210c839020 100644
--- a/storage/innobase/include/dict0dict.inl
+++ b/storage/innobase/include/dict0dict.inl
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 1996, 2017, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2013, 2020, MariaDB Corporation.
+Copyright (c) 2013, 2022, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -907,20 +907,6 @@ dict_index_get_page(
return(index->page);
}
-/*********************************************************************//**
-Gets the read-write lock of the index tree.
-@return read-write lock */
-UNIV_INLINE
-rw_lock_t*
-dict_index_get_lock(
-/*================*/
- const dict_index_t* index) /*!< in: index */
-{
- ut_ad(index->magic_n == DICT_INDEX_MAGIC_N);
-
- return(&(index->lock));
-}
-
/********************************************************************//**
Returns free space reserved for future updates of records. This is
relevant only in the case of many consecutive inserts, as updates
@@ -977,7 +963,7 @@ dict_index_set_online_status(
enum online_index_status status) /*!< in: status */
{
ut_ad(!(index->type & DICT_FTS));
- ut_ad(rw_lock_own(dict_index_get_lock(index), RW_LOCK_X));
+ ut_ad(index->lock.have_x());
#ifdef UNIV_DEBUG
switch (dict_index_get_online_status(index)) {
@@ -1114,19 +1100,6 @@ dict_max_v_field_len_store_undo(
return(max_log_len);
}
-/********************************************************************//**
-Check whether the table is corrupted.
-@return nonzero for corrupted table, zero for valid tables */
-UNIV_INLINE
-ulint
-dict_table_is_corrupted(
-/*====================*/
- const dict_table_t* table) /*!< in: table */
-{
- ut_ad(table->magic_n == DICT_TABLE_MAGIC_N);
- return(table->corrupted);
-}
-
/** Check if the table is found is a file_per_table tablespace.
This test does not use table flags2 since some REDUNDANT tables in the
system tablespace may have garbage in the MIX_LEN field where flags2 is
@@ -1153,12 +1126,10 @@ dict_table_is_file_per_table(
}
/** Acquire the table handle. */
-inline
-void
-dict_table_t::acquire()
+inline void dict_table_t::acquire()
{
- ut_ad(mutex_own(&dict_sys.mutex));
- n_ref_count++;
+ ut_ad(dict_sys.frozen());
+ n_ref_count++;
}
/** Release the table handle.
diff --git a/storage/innobase/include/dict0load.h b/storage/innobase/include/dict0load.h
index f067571ca5b..f7d33d5b43b 100644
--- a/storage/innobase/include/dict0load.h
+++ b/storage/innobase/include/dict0load.h
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2017, 2020, MariaDB Corporation.
+Copyright (c) 2017, 2022, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -39,30 +39,12 @@ Created 4/24/1996 Heikki Tuuri
/** A stack of table names related through foreign key constraints */
typedef std::deque<const char*, ut_allocator<const char*> > dict_names_t;
-/** enum that defines all system table IDs. @see SYSTEM_TABLE_NAME[] */
-enum dict_system_id_t {
- SYS_TABLES = 0,
- SYS_INDEXES,
- SYS_COLUMNS,
- SYS_FIELDS,
- SYS_FOREIGN,
- SYS_FOREIGN_COLS,
- SYS_TABLESPACES,
- SYS_DATAFILES,
- SYS_VIRTUAL,
-
- /* This must be last item. Defines the number of system tables. */
- SYS_NUM_SYSTEM_TABLES
-};
-
/** Check each tablespace found in the data dictionary.
-Look at each table defined in SYS_TABLES that has a space_id > 0.
-If the tablespace is not yet in the fil_system cache, look up the
-tablespace in SYS_DATAFILES to ensure the correct path.
+Then look at each table defined in SYS_TABLES that has a space_id > 0
+to find all the file-per-table tablespaces.
In a crash recovery we already have some tablespace objects created from
-processing the REDO log. Any other tablespace in SYS_TABLESPACES not
-previously used in recovery will be opened here. We will compare the
+processing the REDO log. We will compare the
space_id information in the data dictionary to what we find in the
tablespace file. In addition, more validation will be done if recovery
was needed and force_recovery is not set.
@@ -70,35 +52,9 @@ was needed and force_recovery is not set.
We also scan the biggest space id, and store it to fil_system. */
void dict_check_tablespaces_and_store_max_id();
-/********************************************************************//**
-Finds the first table name in the given database.
-@return own: table name, NULL if does not exist; the caller must free
-the memory in the string! */
-char*
-dict_get_first_table_name_in_db(
-/*============================*/
- const char* name); /*!< in: database name which ends to '/' */
-
/** Make sure the data_file_name is saved in dict_table_t if needed.
-Try to read it from the fil_system first, then from SYS_DATAFILES.
-@param[in] table Table object
-@param[in] dict_mutex_own true if dict_sys.mutex is owned already */
-void
-dict_get_and_save_data_dir_path(
- dict_table_t* table,
- bool dict_mutex_own);
-
-/** Loads a table definition and also all its index definitions, and also
-the cluster definition if the table is a member in a cluster. Also loads
-all foreign key constraints where the foreign key is in the table or where
-a foreign key references columns in this table.
-@param[in] name Table name in the dbname/tablename format
-@param[in] ignore_err Error to be ignored when loading
- table and its index definition
-@return table, NULL if does not exist; if the table is stored in an
-.ibd file, but the file does not exist, then we set the file_unreadable
-flag in the table object we return. */
-dict_table_t* dict_load_table(const char* name, dict_err_ignore_t ignore_err);
+@param[in,out] table Table object */
+void dict_get_and_save_data_dir_path(dict_table_t* table);
/***********************************************************************//**
Loads a table object based on the table id.
@@ -133,7 +89,8 @@ dict_load_foreigns(
const char* table_name, /*!< in: table name */
const char** col_names, /*!< in: column names, or NULL
to use table->col_names */
- bool check_recursive,/*!< in: Whether to check
+ trx_id_t trx_id, /*!< in: DDL transaction id,
+ or 0 to check
recursive load of tables
chained by FK */
bool check_charsets, /*!< in: whether to check
@@ -143,7 +100,7 @@ dict_load_foreigns(
which must be loaded
subsequently to load all the
foreign key constraints. */
- MY_ATTRIBUTE((nonnull(1), warn_unused_result));
+ MY_ATTRIBUTE((nonnull(1)));
/********************************************************************//**
This function opens a system table, and return the first record.
@@ -154,7 +111,7 @@ dict_startscan_system(
btr_pcur_t* pcur, /*!< out: persistent cursor to
the record */
mtr_t* mtr, /*!< in: the mini-transaction */
- dict_system_id_t system_id); /*!< in: which system table to open */
+ dict_table_t* table); /*!< in: system table */
/********************************************************************//**
This function get the next system table record as we scan the table.
@return the record if found, NULL if end of scan. */
@@ -164,19 +121,19 @@ dict_getnext_system(
btr_pcur_t* pcur, /*!< in/out: persistent cursor
to the record */
mtr_t* mtr); /*!< in: the mini-transaction */
-/********************************************************************//**
-This function processes one SYS_TABLES record and populate the dict_table_t
-struct for the table.
-@return error message, or NULL on success */
-const char*
-dict_process_sys_tables_rec_and_mtr_commit(
-/*=======================================*/
- mem_heap_t* heap, /*!< in: temporary memory heap */
- const rec_t* rec, /*!< in: SYS_TABLES record */
- dict_table_t** table, /*!< out: dict_table_t to fill */
- bool cached, /*!< in: whether to load from cache */
- mtr_t* mtr); /*!< in/out: mini-transaction,
- will be committed */
+
+/** Load a table definition from a SYS_TABLES record to dict_table_t.
+Do not load any columns or indexes.
+@param[in,out] mtr mini-transaction
+@param[in] uncommitted whether to use READ UNCOMMITTED isolation level
+@param[in] rec SYS_TABLES record
+@param[out,own] table table, or nullptr
+@return error message
+@retval nullptr on success */
+const char *dict_load_table_low(mtr_t *mtr, bool uncommitted,
+ const rec_t *rec, dict_table_t **table)
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+
/********************************************************************//**
This function parses a SYS_INDEXES record and populate a dict_index_t
structure with the information from the record. For detail information
@@ -259,51 +216,5 @@ dict_process_sys_foreign_col_rec(
const char** ref_col_name, /*!< out: referenced column name
in referenced table */
ulint* pos); /*!< out: column position */
-/********************************************************************//**
-This function parses a SYS_TABLESPACES record, extracts necessary
-information from the record and returns to caller.
-@return error message, or NULL on success */
-const char*
-dict_process_sys_tablespaces(
-/*=========================*/
- mem_heap_t* heap, /*!< in/out: heap memory */
- const rec_t* rec, /*!< in: current SYS_TABLESPACES rec */
- uint32_t* space, /*!< out: tablespace identifier */
- const char** name, /*!< out: tablespace name */
- ulint* flags); /*!< out: tablespace flags */
-/********************************************************************//**
-This function parses a SYS_DATAFILES record, extracts necessary
-information from the record and returns to caller.
-@return error message, or NULL on success */
-const char*
-dict_process_sys_datafiles(
-/*=======================*/
- mem_heap_t* heap, /*!< in/out: heap memory */
- const rec_t* rec, /*!< in: current SYS_DATAFILES rec */
- uint32_t* space, /*!< out: tablespace identifier */
- const char** path); /*!< out: datafile path */
-
-/** Update the record for space_id in SYS_TABLESPACES to this filepath.
-@param[in] space_id Tablespace ID
-@param[in] filepath Tablespace filepath
-@return DB_SUCCESS if OK, dberr_t if the insert failed */
-dberr_t
-dict_update_filepath(
- ulint space_id,
- const char* filepath);
-
-/** Replace records in SYS_TABLESPACES and SYS_DATAFILES associated with
-the given space_id using an independent transaction.
-@param[in] space_id Tablespace ID
-@param[in] name Tablespace name
-@param[in] filepath First filepath
-@param[in] fsp_flags Tablespace flags
-@return DB_SUCCESS if OK, dberr_t if the insert failed */
-dberr_t
-dict_replace_tablespace_and_filepath(
- ulint space_id,
- const char* name,
- const char* filepath,
- ulint fsp_flags);
#endif
diff --git a/storage/innobase/include/dict0mem.h b/storage/innobase/include/dict0mem.h
index 0a28a6a9868..c469b9da1c2 100644
--- a/storage/innobase/include/dict0mem.h
+++ b/storage/innobase/include/dict0mem.h
@@ -2,7 +2,7 @@
Copyright (c) 1996, 2017, Oracle and/or its affiliates. All Rights Reserved.
Copyright (c) 2012, Facebook Inc.
-Copyright (c) 2013, 2021, MariaDB Corporation.
+Copyright (c) 2013, 2023, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -28,14 +28,14 @@ Created 1/8/1996 Heikki Tuuri
#ifndef dict0mem_h
#define dict0mem_h
+#include "dict0types.h"
#include "data0type.h"
#include "mem0mem.h"
#include "row0types.h"
-#include "rem0types.h"
#include "btr0types.h"
#include "lock0types.h"
#include "que0types.h"
-#include "sync0rw.h"
+#include "sux_lock.h"
#include "ut0mem.h"
#include "ut0rnd.h"
#include "ut0byte.h"
@@ -298,17 +298,6 @@ parent table will fail, and user has to drop excessive foreign constraint
before proceeds. */
#define FK_MAX_CASCADE_DEL 15
-/** Create a table memory object.
-@param name table name
-@param space tablespace
-@param n_cols total number of columns (both virtual and non-virtual)
-@param n_v_cols number of virtual columns
-@param flags table flags
-@param flags2 table flags2
-@return own: table object */
-dict_table_t *dict_mem_table_create(const char *name, fil_space_t *space,
- ulint n_cols, ulint n_v_cols, ulint flags,
- ulint flags2);
/****************************************************************/ /**
Free a table memory object. */
void
@@ -977,6 +966,26 @@ const char innobase_index_reserve_name[] = "GEN_CLUST_INDEX";
/** Data structure for an index. Most fields will be
initialized to 0, NULL or FALSE in dict_mem_index_create(). */
struct dict_index_t {
+ /** Columns whose character-set collation is being changed */
+ struct col_info
+ {
+ /** number of columns whose charset-collation is being changed */
+ unsigned n_cols;
+ /** columns with changed charset-collation */
+ dict_col_t *cols;
+
+ /** Add a column with changed collation. */
+ dict_col_t *add(mem_heap_t *heap, const dict_col_t &col, unsigned offset)
+ {
+ ut_ad(offset < n_cols);
+ if (!cols)
+ cols= static_cast<dict_col_t*>
+ (mem_heap_alloc(heap, n_cols * sizeof col));
+ new (&cols[offset]) dict_col_t(col);
+ return &cols[offset];
+ }
+ };
+
/** Maximum number of fields */
static constexpr unsigned MAX_N_FIELDS= (1U << 10) - 1;
@@ -1011,15 +1020,6 @@ struct dict_index_t {
representation we add more columns */
unsigned nulls_equal:1;
/*!< if true, SQL NULL == SQL NULL */
-#ifdef BTR_CUR_HASH_ADAPT
-#ifdef MYSQL_INDEX_DISABLE_AHI
- unsigned disable_ahi:1;
- /*!< whether to disable the
- adaptive hash index.
- Maybe this could be disabled for
- temporary tables? */
-#endif
-#endif /* BTR_CUR_HASH_ADAPT */
unsigned n_uniq:10;/*!< number of fields from the beginning
which are enough to determine an index
entry uniquely */
@@ -1046,8 +1046,7 @@ struct dict_index_t {
/*!< enum online_index_status.
Transitions from ONLINE_INDEX_COMPLETE (to
ONLINE_INDEX_CREATION) are protected
- by dict_sys.latch and
- dict_sys.mutex. Other changes are
+ by dict_sys.latch. Other changes are
protected by index->lock. */
unsigned uncommitted:1;
/*!< a flag that is set for secondary indexes
@@ -1072,6 +1071,16 @@ struct dict_index_t {
It should use heap from dict_index_t. It should be freed
while removing the index from table. */
dict_add_v_col_info* new_vcol_info;
+
+ /** During ALTER TABLE, columns that a being-added index depends on
+ and whose encoding or collation is being changed to something
+ that is compatible with the clustered index.
+ Allocated from dict_index_t::heap.
+
+ @see rollback_inplace_alter_table()
+ @see ha_innobase_inplace_ctx::col_collations */
+ col_info* change_col_info;
+
UT_LIST_NODE_T(dict_index_t)
indexes;/*!< list of indexes of the table */
#ifdef BTR_CUR_ADAPT
@@ -1148,8 +1157,8 @@ public:
when InnoDB was started up */
zip_pad_info_t zip_pad;/*!< Information about state of
compression failures and successes */
- mutable rw_lock_t lock; /*!< read-write lock protecting the
- upper levels of the index tree */
+ /** lock protecting the non-leaf index pages */
+ mutable index_lock lock;
/** Determine if the index has been committed to the
data dictionary.
@@ -1166,6 +1175,7 @@ public:
{
ut_ad(!to_be_dropped);
ut_ad(committed || !(type & DICT_CLUSTERED));
+ ut_ad(!committed || !change_col_info);
uncommitted = !committed;
}
@@ -1205,6 +1215,16 @@ public:
/** @return whether this is the change buffer */
bool is_ibuf() const { return UNIV_UNLIKELY(type & DICT_IBUF); }
+ /** @return whether this index requires locking */
+ bool has_locking() const { return !is_ibuf(); }
+
+ /** @return whether this is a normal B-tree index
+ (not the change buffer, not SPATIAL or FULLTEXT) */
+ bool is_btree() const {
+ return UNIV_LIKELY(!(type & (DICT_IBUF | DICT_SPATIAL
+ | DICT_FTS | DICT_CORRUPT)));
+ }
+
/** @return whether the index includes virtual columns */
bool has_virtual() const { return type & DICT_VIRTUAL; }
@@ -1321,6 +1341,16 @@ public:
ulint get_new_n_vcol() const
{ return new_vcol_info ? new_vcol_info->n_v_col : 0; }
+ /** Assign the number of collation change fields as a part of the index
+ @param n_cols number of columns whose collation is changing */
+ void init_change_cols(unsigned n_cols)
+ {
+ ut_ad(n_fields > n_cols || type & DICT_FTS);
+ change_col_info= static_cast<col_info*>
+ (mem_heap_zalloc(heap, sizeof(col_info)));
+ change_col_info->n_cols= n_cols;
+ }
+
/** Reconstruct the clustered index fields.
@return whether metadata is incorrect */
inline bool reconstruct_fields();
@@ -1415,6 +1445,26 @@ public:
everything in overflow) size of the longest possible row and index
of a field which made index records too big to fit on a page.*/
inline record_size_info_t record_size_info() const;
+
+ /** Clear the index tree and reinitialize the root page, in the
+ rollback of TRX_UNDO_EMPTY. The BTR_SEG_LEAF is freed and reinitialized.
+ @param thr query thread
+ @return error code */
+ dberr_t clear(que_thr_t *thr);
+
+  /** Check whether the online log is a dummy value, indicating
+  whether the table is undergoing active DDL.
+  @retval true if the online log is a dummy value */
+ bool online_log_is_dummy() const
+ {
+ return online_log == reinterpret_cast<const row_log_t*>(this);
+ }
+
+ /** Assign clustered index online log to dummy value */
+ void online_log_make_dummy()
+ {
+ online_log= reinterpret_cast<row_log_t*>(this);
+ }
};
/** Detach a virtual column from an index.
@@ -1540,24 +1590,6 @@ struct dict_foreign_with_index {
const dict_index_t* m_index;
};
-#ifdef WITH_WSREP
-/** A function object to find a foreign key with the given index as the
-foreign index. Return the foreign key with matching criteria or NULL */
-struct dict_foreign_with_foreign_index {
-
- dict_foreign_with_foreign_index(const dict_index_t* index)
- : m_index(index)
- {}
-
- bool operator()(const dict_foreign_t* foreign) const
- {
- return(foreign->foreign_index == m_index);
- }
-
- const dict_index_t* m_index;
-};
-#endif
-
/* A function object to check if the foreign constraint is between different
tables. Returns true if foreign key constraint is between different tables,
false otherwise. */
@@ -1812,7 +1844,7 @@ typedef enum {
} dict_frm_t;
/** Data structure for a database table. Most fields will be
-initialized to 0, NULL or FALSE in dict_mem_table_create(). */
+zero-initialized in dict_table_t::create(). */
struct dict_table_t {
/** Get reference count.
@@ -1862,7 +1894,7 @@ struct dict_table_t {
which denotes temporary or intermediate tables in MariaDB. */
static bool is_temporary_name(const char* name)
{
- return strstr(name, "/" TEMP_FILE_PREFIX) != NULL;
+ return strstr(name, "/#sql");
}
/** @return whether instant ALTER TABLE is in effect */
@@ -1965,37 +1997,75 @@ struct dict_table_t {
return versioned() && cols[vers_start].mtype == DATA_INT;
}
- void inc_fk_checks()
- {
-#ifdef UNIV_DEBUG
- int32_t fk_checks=
-#endif
- n_foreign_key_checks_running++;
- ut_ad(fk_checks >= 0);
- }
- void dec_fk_checks()
- {
-#ifdef UNIV_DEBUG
- int32_t fk_checks=
-#endif
- n_foreign_key_checks_running--;
- ut_ad(fk_checks > 0);
- }
-
/** For overflow fields returns potential max length stored inline */
inline size_t get_overflow_field_local_len() const;
- /** Parse the table file name into table name and database name.
- @tparam dict_locked whether dict_sys.mutex is being held
- @param[in,out] db_name database name buffer
- @param[in,out] tbl_name table name buffer
- @param[out] db_name_len database name length
- @param[out] tbl_name_len table name length
- @return whether the table name is visible to SQL */
- template<bool dict_locked= false>
- bool parse_name(char (&db_name)[NAME_LEN + 1],
- char (&tbl_name)[NAME_LEN + 1],
- size_t *db_name_len, size_t *tbl_name_len) const;
+ /** Parse the table file name into table name and database name.
+ @tparam dict_frozen whether the caller holds dict_sys.latch
+ @param[in,out] db_name database name buffer
+ @param[in,out] tbl_name table name buffer
+ @param[out] db_name_len database name length
+ @param[out] tbl_name_len table name length
+ @return whether the table name is visible to SQL */
+ template<bool dict_frozen= false>
+ bool parse_name(char (&db_name)[NAME_LEN + 1],
+ char (&tbl_name)[NAME_LEN + 1],
+ size_t *db_name_len, size_t *tbl_name_len) const;
+
+ /** Clear the table when rolling back TRX_UNDO_EMPTY
+ @return error code */
+ dberr_t clear(que_thr_t *thr);
+
+#ifdef UNIV_DEBUG
+ /** @return whether the current thread holds the lock_mutex */
+ bool lock_mutex_is_owner() const
+ { return lock_mutex_owner == pthread_self(); }
+ /** @return whether the current thread holds the stats_mutex (lock_mutex) */
+ bool stats_mutex_is_owner() const
+ { return lock_mutex_owner == pthread_self(); }
+#endif /* UNIV_DEBUG */
+ void lock_mutex_init() { lock_mutex.init(); }
+ void lock_mutex_destroy() { lock_mutex.destroy(); }
+ /** Acquire lock_mutex */
+ void lock_mutex_lock()
+ {
+ ut_ad(!lock_mutex_is_owner());
+ lock_mutex.wr_lock();
+ ut_ad(!lock_mutex_owner.exchange(pthread_self()));
+ }
+ /** Try to acquire lock_mutex */
+ bool lock_mutex_trylock()
+ {
+ ut_ad(!lock_mutex_is_owner());
+ bool acquired= lock_mutex.wr_lock_try();
+ ut_ad(!acquired || !lock_mutex_owner.exchange(pthread_self()));
+ return acquired;
+ }
+ /** Release lock_mutex */
+ void lock_mutex_unlock()
+ {
+ ut_ad(lock_mutex_owner.exchange(0) == pthread_self());
+ lock_mutex.wr_unlock();
+ }
+#ifndef SUX_LOCK_GENERIC
+ /** @return whether the lock mutex is held by some thread */
+ bool lock_mutex_is_locked() const noexcept { return lock_mutex.is_locked(); }
+#endif
+
+  /* The stats mutex currently defaults to lock_mutex, but in the future
+  there could be a use case for a separate statistics mutex. The
+  extra indirection (inline, so no performance hit) should
+  help simplify the code and increase long-term maintainability. */
+ void stats_mutex_init() { lock_mutex_init(); }
+ void stats_mutex_destroy() { lock_mutex_destroy(); }
+ void stats_mutex_lock() { lock_mutex_lock(); }
+ void stats_mutex_unlock() { lock_mutex_unlock(); }
+
+ /** Rename the data file.
+ @param new_name name of the table
+ @param replace whether to replace the file with the new name
+ (as part of rolling back TRUNCATE) */
+ dberr_t rename_tablespace(span<const char> new_name, bool replace) const;
private:
/** Initialize instant->field_map.
@@ -2004,12 +2074,12 @@ private:
public:
/** Id of the table. */
table_id_t id;
- /** Hash chain node. */
- hash_node_t id_hash;
- /** Table name. */
+ /** dict_sys.id_hash chain node */
+ dict_table_t* id_hash;
+ /** Table name in name_hash */
table_name_t name;
- /** Hash chain node. */
- hash_node_t name_hash;
+ /** dict_sys.name_hash chain node */
+ dict_table_t* name_hash;
/** Memory heap */
mem_heap_t* heap;
@@ -2057,12 +2127,6 @@ public:
/** TRUE if the table object has been added to the dictionary cache. */
unsigned cached:1;
- /** TRUE if the table is to be dropped, but not yet actually dropped
- (could in the background drop list). It is turned on at the beginning
- of row_drop_table_for_mysql() and turned off just before we start to
- update system tables for the drop. It is protected by dict_sys.latch. */
- unsigned to_be_dropped:1;
-
/** Number of non-virtual columns defined so far. */
unsigned n_def:10;
@@ -2158,23 +2222,24 @@ public:
/** Maximum recursive level we support when loading tables chained
together with FK constraints. If exceeds this level, we will stop
loading child table into memory along with its parent table. */
- unsigned fk_max_recusive_level:8;
+ byte fk_max_recusive_level;
- /** Count of how many foreign key check operations are currently being
- performed on the table. We cannot drop the table while there are
- foreign key checks running on it. */
- Atomic_counter<int32_t> n_foreign_key_checks_running;
+ /** DDL transaction that last touched the table definition, or 0 if
+ no history is available. This includes possible changes in
+ ha_innobase::prepare_inplace_alter_table() and
+ ha_innobase::commit_inplace_alter_table(). */
+ trx_id_t def_trx_id;
- /** Transactions whose view low limit is greater than this number are
- not allowed to store to the MySQL query cache or retrieve from it.
- When a trx with undo logs commits, it sets this to the value of the
- transaction id. */
- trx_id_t query_cache_inv_trx_id;
+ /** Last transaction that inserted into an empty table.
+ Updated while holding exclusive table lock and an exclusive
+ latch on the clustered index root page (which must also be
+ an empty leaf page), and an ahi_latch (if btr_search_enabled). */
+ Atomic_relaxed<trx_id_t> bulk_trx_id;
- /** Transaction id that last touched the table definition. Either when
- loading the definition or CREATE TABLE, or ALTER TABLE (prepare,
- commit, and rollback phases). */
- trx_id_t def_trx_id;
+  /** Original table name, for MDL acquisition in purge. Normally,
+  this points to the same string as name. When is_temporary_name(name.m_name)
+  holds, this should be a copy of the original table name, allocated from heap. */
+ table_name_t mdl_name;
/*!< set of foreign key constraints in the table; these refer to
columns in other tables */
@@ -2184,7 +2249,7 @@ public:
dict_foreign_set referenced_set;
/** Statistics for query optimization. Mostly protected by
- dict_sys.mutex. @{ */
+ dict_sys.latch and stats_mutex_lock(). @{ */
/** TRUE if statistics have been calculated the first time after
database startup or table creation. */
@@ -2251,24 +2316,6 @@ public:
any latch, because this is only used for heuristics. */
ib_uint64_t stat_modified_counter;
- /** Background stats thread is not working on this table. */
- #define BG_STAT_NONE 0
-
- /** Set in 'stats_bg_flag' when the background stats code is working
- on this table. The DROP TABLE code waits for this to be cleared before
- proceeding. */
- #define BG_STAT_IN_PROGRESS (1 << 0)
-
- /** Set in 'stats_bg_flag' when DROP TABLE starts waiting on
- BG_STAT_IN_PROGRESS to be cleared. The background stats thread will
- detect this and will eventually quit sooner. */
- #define BG_STAT_SHOULD_QUIT (1 << 1)
-
- /** The state of the background stats thread wrt this table.
- See BG_STAT_NONE, BG_STAT_IN_PROGRESS and BG_STAT_SHOULD_QUIT.
- Writes are covered by dict_sys.mutex. Dirty reads are possible. */
- byte stats_bg_flag;
-
bool stats_error_printed;
/*!< Has persistent stats error been
already printed for this table ? */
@@ -2280,7 +2327,7 @@ public:
kept in trx_t. In order to quickly determine whether a transaction has
locked the AUTOINC lock we keep a pointer to the transaction here in
the 'autoinc_trx' member. This is to avoid acquiring the
- lock_sys_t::mutex and scanning the vector in trx_t.
+ lock_sys.latch and scanning the vector in trx_t.
When an AUTOINC lock has to wait, the corresponding lock instance is
created on the trx lock heap rather than use the pre-allocated instance
in autoinc_lock below. */
@@ -2292,26 +2339,41 @@ public:
from a select. */
lock_t* autoinc_lock;
- /** Mutex protecting the autoinc counter and freed_indexes. */
- std::mutex autoinc_mutex;
-
- /** Autoinc counter value to give to the next inserted row. */
- ib_uint64_t autoinc;
-
- /** This counter is used to track the number of granted and pending
- autoinc locks on this table. This value is set after acquiring the
- lock_sys_t::mutex but we peek the contents to determine whether other
- transactions have acquired the AUTOINC lock or not. Of course only one
- transaction can be granted the lock but there can be multiple
- waiters. */
- ulong n_waiting_or_granted_auto_inc_locks;
-
- /** The transaction that currently holds the the AUTOINC lock on this
- table. Protected by lock_sys.mutex. */
- const trx_t* autoinc_trx;
+ /** Mutex protecting autoinc and freed_indexes. */
+ srw_spin_mutex autoinc_mutex;
+private:
+ /** Mutex protecting locks on this table. */
+ srw_spin_mutex lock_mutex;
+#ifdef UNIV_DEBUG
+ /** The owner of lock_mutex (0 if none) */
+ Atomic_relaxed<pthread_t> lock_mutex_owner{0};
+#endif
+public:
+ /** Autoinc counter value to give to the next inserted row. */
+ uint64_t autoinc;
+
+ /** The transaction that currently holds the AUTOINC lock on this table.
+ Protected by lock_mutex.
+ The thread that is executing autoinc_trx may read this field without
+ holding a latch, in row_lock_table_autoinc_for_mysql().
+ Only the autoinc_trx thread may clear this field; it cannot be
+ modified on the behalf of a transaction that is being handled by a
+ different thread. */
+ Atomic_relaxed<const trx_t*> autoinc_trx;
+
+ /** Number of granted or pending autoinc_lock on this table. This
+ value is set after acquiring lock_sys.latch but
+ in innodb_autoinc_lock_mode=1 (the default),
+ ha_innobase::innobase_lock_autoinc() will perform a dirty read
+ to determine whether other transactions have acquired the autoinc_lock. */
+ uint32_t n_waiting_or_granted_auto_inc_locks;
/* @} */
+ /** Number of granted or pending LOCK_S or LOCK_X on the table.
+ Protected by lock_sys.assert_locked(*this). */
+ uint32_t n_lock_x_or_s;
+
/** FTS specific state variables. */
fts_t* fts;
@@ -2320,22 +2382,28 @@ public:
in X mode of this table's indexes. */
ib_quiesce_t quiesce;
- /** Count of the number of record locks on this table. We use this to
- determine whether we can evict the table from the dictionary cache.
- It is protected by lock_sys.mutex. */
- ulint n_rec_locks;
+ /** Count of the number of record locks on this table. We use this to
+ determine whether we can evict the table from the dictionary cache.
+ Modified when lock_sys.is_writer(), or
+ lock_sys.assert_locked(page_id) and trx->mutex_is_owner() hold.
+ @see trx_lock_t::trx_locks */
+ Atomic_counter<uint32_t> n_rec_locks;
private:
- /** Count of how many handles are opened to this table. Dropping of the
- table is NOT allowed until this count gets to zero. MySQL does NOT
- itself check the number of open handles at DROP. */
- Atomic_counter<uint32_t> n_ref_count;
-
+ /** Count of how many handles are opened to this table. Dropping of the
+ table is NOT allowed until this count gets to zero. MySQL does NOT
+ itself check the number of open handles at DROP. */
+ Atomic_counter<uint32_t> n_ref_count;
public:
- /** List of locks on the table. Protected by lock_sys.mutex. */
- table_lock_list_t locks;
+ /** List of locks on the table. Protected by lock_sys.assert_locked(lock). */
+ table_lock_list_t locks;
- /** Timestamp of the last modification of this table. */
- time_t update_time;
+ /** Timestamp of the last modification of this table. */
+ Atomic_relaxed<time_t> update_time;
+ /** Transactions whose view low limit is greater than this number are
+ not allowed to access the MariaDB query cache.
+ @see innobase_query_caching_table_check_low()
+ @see trx_t::commit_tables() */
+ Atomic_relaxed<trx_id_t> query_cache_inv_trx_id;
#ifdef UNIV_DEBUG
/** Value of 'magic_n'. */
@@ -2359,13 +2427,30 @@ public:
return false;
}
- /** Check whether the table name is same as mysql/innodb_stats_table
- or mysql/innodb_index_stats.
- @return true if the table name is same as stats table */
+ /** @return whether a DDL operation is in progress on this table */
+ bool is_active_ddl() const
+ {
+ return UT_LIST_GET_FIRST(indexes)->online_log;
+ }
+
+ /** @return whether the name is
+ mysql.innodb_index_stats or mysql.innodb_table_stats */
bool is_stats_table() const;
/** @return number of unique columns in FTS_DOC_ID index */
unsigned fts_n_uniq() const { return versioned() ? 2 : 1; }
+
+ /** Create metadata.
+ @param name table name
+ @param space tablespace
+ @param n_cols total number of columns (both virtual and non-virtual)
+ @param n_v_cols number of virtual columns
+ @param flags table flags
+ @param flags2 table flags2
+ @return newly allocated table object */
+ static dict_table_t *create(const span<const char> &name, fil_space_t *space,
+ ulint n_cols, ulint n_v_cols, ulint flags,
+ ulint flags2);
};
inline void dict_index_t::set_modified(mtr_t& mtr) const
diff --git a/storage/innobase/include/dict0mem.inl b/storage/innobase/include/dict0mem.inl
index 0a554a54dbd..d60ee5d9bf4 100644
--- a/storage/innobase/include/dict0mem.inl
+++ b/storage/innobase/include/dict0mem.inl
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 1996, 2015, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2017, MariaDB Corporation.
+Copyright (c) 2017, 2021, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -64,10 +64,5 @@ dict_mem_fill_index_struct(
/* The '1 +' above prevents allocation
of an empty mem block */
index->nulls_equal = false;
-#ifdef BTR_CUR_HASH_ADAPT
-#ifdef MYSQL_INDEX_DISABLE_AHI
- index->disable_ahi = false;
-#endif
-#endif /* BTR_CUR_HASH_ADAPT */
ut_d(index->magic_n = DICT_INDEX_MAGIC_N);
}
diff --git a/storage/innobase/include/dict0stats.h b/storage/innobase/include/dict0stats.h
index 34c1bef26c5..0dc1b984577 100644
--- a/storage/innobase/include/dict0stats.h
+++ b/storage/innobase/include/dict0stats.h
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 2009, 2018, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2017, 2020, MariaDB Corporation.
+Copyright (c) 2017, 2021, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -30,9 +30,6 @@ Created Jan 06, 2010 Vasil Dimov
#include "dict0types.h"
#include "trx0types.h"
-#define TABLE_STATS_NAME "mysql/innodb_table_stats"
-#define INDEX_STATS_NAME "mysql/innodb_index_stats"
-
enum dict_stats_upd_option_t {
DICT_STATS_RECALC_PERSISTENT,/* (re) calculate the
statistics using a precise and slow
@@ -140,40 +137,33 @@ dict_stats_update(
the stats or to fetch them from
the persistent storage */
-/** Remove the information for a particular index's stats from the persistent
-storage if it exists and if there is data stored for this index.
-This function creates its own trx and commits it.
-
-We must modify system tables in a separate transaction in order to
-adhere to the InnoDB design constraint that dict_sys.latch prevents
-lock waits on system tables. If we modified system and user tables in
-the same transaction, we should exclusively hold dict_sys.latch until
-the transaction is committed, and effectively block other transactions
-that will attempt to open any InnoDB tables. Because we have no
-guarantee that user transactions will be committed fast, we cannot
-afford to keep the system tables locked in a user transaction.
+/** Execute DELETE FROM mysql.innodb_table_stats
+@param database_name database name
+@param table_name table name
+@param trx transaction
@return DB_SUCCESS or error code */
-dberr_t
-dict_stats_drop_index(
-/*==================*/
- const char* tname, /*!< in: table name */
- const char* iname, /*!< in: index name */
- char* errstr, /*!< out: error message if != DB_SUCCESS
- is returned */
- ulint errstr_sz);/*!< in: size of the errstr buffer */
-
-/*********************************************************************//**
-Removes the statistics for a table and all of its indexes from the
-persistent storage if it exists and if there is data stored for the table.
-This function creates its own transaction and commits it.
+dberr_t dict_stats_delete_from_table_stats(const char *database_name,
+ const char *table_name,
+ trx_t *trx)
+ MY_ATTRIBUTE((nonnull));
+/** Execute DELETE FROM mysql.innodb_index_stats
+@param database_name database name
+@param table_name table name
+@param trx transaction
@return DB_SUCCESS or error code */
-dberr_t
-dict_stats_drop_table(
-/*==================*/
- const char* table_name, /*!< in: table name */
- char* errstr, /*!< out: error message
- if != DB_SUCCESS is returned */
- ulint errstr_sz); /*!< in: size of errstr buffer */
+dberr_t dict_stats_delete_from_index_stats(const char *database_name,
+ const char *table_name,
+ trx_t *trx)
+ MY_ATTRIBUTE((nonnull));
+/** Execute DELETE FROM mysql.innodb_index_stats
+@param database_name database name
+@param table_name table name
+@param index_name name of the index
+@param trx transaction (nullptr=start and commit a new one)
+@return DB_SUCCESS or error code */
+dberr_t dict_stats_delete_from_index_stats(const char *database_name,
+ const char *table_name,
+ const char *index_name, trx_t *trx);
/*********************************************************************//**
Fetches or calculates new estimates for index statistics. */
@@ -183,31 +173,29 @@ dict_stats_update_for_index(
dict_index_t* index) /*!< in/out: index */
MY_ATTRIBUTE((nonnull));
-/*********************************************************************//**
-Renames a table in InnoDB persistent stats storage.
-This function creates its own transaction and commits it.
+/** Rename a table in InnoDB persistent stats storage.
+@param old_name old table name
+@param new_name new table name
+@param trx transaction
@return DB_SUCCESS or error code */
-dberr_t
-dict_stats_rename_table(
-/*====================*/
- const char* old_name, /*!< in: old table name */
- const char* new_name, /*!< in: new table name */
- char* errstr, /*!< out: error string if != DB_SUCCESS
- is returned */
- size_t errstr_sz); /*!< in: errstr size */
-/*********************************************************************//**
-Renames an index in InnoDB persistent stats storage.
-This function creates its own transaction and commits it.
-@return DB_SUCCESS or error code. DB_STATS_DO_NOT_EXIST will be returned
-if the persistent stats do not exist. */
-dberr_t
-dict_stats_rename_index(
-/*====================*/
- const dict_table_t* table, /*!< in: table whose index
- is renamed */
- const char* old_index_name, /*!< in: old index name */
- const char* new_index_name) /*!< in: new index name */
- __attribute__((warn_unused_result));
+dberr_t dict_stats_rename_table(const char *old_name, const char *new_name,
+ trx_t *trx);
+/** Rename an index in InnoDB persistent statistics.
+@param db database name
+@param table table name
+@param old_name old index name
+@param new_name new index name
+@param trx transaction
+@return DB_SUCCESS or error code */
+dberr_t dict_stats_rename_index(const char *db, const char *table,
+ const char *old_name, const char *new_name,
+ trx_t *trx);
+
+/** Delete all persistent statistics for a database.
+@param db database name
+@param trx transaction
+@return DB_SUCCESS or error code */
+dberr_t dict_stats_delete(const char *db, trx_t *trx);
/** Save an individual index's statistic into the persistent statistics
storage.
@@ -217,9 +205,7 @@ storage.
@param[in] stat_value value of the stat
@param[in] sample_size n pages sampled or NULL
@param[in] stat_description description of the stat
-@param[in,out] trx in case of NULL the function will
-allocate and free the trx object. If it is not NULL then it will be
-rolled back only in the case of error, but not freed.
+@param[in,out] trx transaction
@return DB_SUCCESS or error code */
dberr_t
dict_stats_save_index_stat(
@@ -229,7 +215,8 @@ dict_stats_save_index_stat(
ib_uint64_t stat_value,
ib_uint64_t* sample_size,
const char* stat_description,
- trx_t* trx);
+ trx_t* trx)
+ MY_ATTRIBUTE((nonnull(1, 3, 6, 7)));
/** Report an error if updating table statistics failed because
.ibd file is missing, table decryption failed or table is corrupted.
diff --git a/storage/innobase/include/dict0stats.inl b/storage/innobase/include/dict0stats.inl
index 4972efe8961..dd516275156 100644
--- a/storage/innobase/include/dict0stats.inl
+++ b/storage/innobase/include/dict0stats.inl
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 2012, 2015, Oracle and/or its affiliates. All rights reserved.
-Copyright (c) 2017, 2020, MariaDB Corporation.
+Copyright (c) 2017, 2021, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -75,7 +75,7 @@ dict_stats_is_persistent_enabled(const dict_table_t* table)
+ dict_stats_update(DICT_STATS_RECALC_TRANSIENT) on a table that has
just been PS-enabled.
This is acceptable. Avoiding this would mean that we would have to
- protect the stat_persistent with dict_sys.mutex like the
+ hold dict_sys.latch or stats_mutex_lock() like for accessing the
other ::stat_ members which would be too big performance penalty,
especially when this function is called from
dict_stats_update_if_needed(). */
@@ -148,7 +148,7 @@ dict_stats_init(
/*============*/
dict_table_t* table) /*!< in/out: table */
{
- ut_ad(!mutex_own(&dict_sys.mutex));
+ ut_ad(!table->stats_mutex_is_owner());
if (table->stat_initialized) {
return;
@@ -174,17 +174,14 @@ dict_stats_deinit(
/*==============*/
dict_table_t* table) /*!< in/out: table */
{
- ut_ad(mutex_own(&dict_sys.mutex));
-
- ut_a(table->get_ref_count() == 0);
+ ut_ad(table->stats_mutex_is_owner());
+ ut_ad(table->get_ref_count() == 0);
+#ifdef HAVE_valgrind
if (!table->stat_initialized) {
return;
}
- table->stat_initialized = FALSE;
-
-#ifdef HAVE_valgrind
MEM_UNDEFINED(&table->stat_n_rows, sizeof table->stat_n_rows);
MEM_UNDEFINED(&table->stat_clustered_index_size,
sizeof table->stat_clustered_index_size);
@@ -218,4 +215,5 @@ dict_stats_deinit(
sizeof(index->stat_n_leaf_pages));
}
#endif /* HAVE_valgrind */
+ table->stat_initialized = FALSE;
}
diff --git a/storage/innobase/include/dict0stats_bg.h b/storage/innobase/include/dict0stats_bg.h
index c09bf4df8e3..d9a2f6282a1 100644
--- a/storage/innobase/include/dict0stats_bg.h
+++ b/storage/innobase/include/dict0stats_bg.h
@@ -28,66 +28,16 @@ Created Apr 26, 2012 Vasil Dimov
#define dict0stats_bg_h
#include "dict0types.h"
-#include "os0thread.h"
#ifdef HAVE_PSI_INTERFACE
-extern mysql_pfs_key_t dict_stats_recalc_pool_mutex_key;
+extern mysql_pfs_key_t recalc_pool_mutex_key;
#endif /* HAVE_PSI_INTERFACE */
-/*****************************************************************//**
-Delete a given table from the auto recalc pool.
-dict_stats_recalc_pool_del() */
-void
-dict_stats_recalc_pool_del(
-/*=======================*/
- const dict_table_t* table); /*!< in: table to remove */
-
-/** Yield the data dictionary latch when waiting
-for the background thread to stop accessing a table.
-@param trx transaction holding the data dictionary locks */
-#define DICT_BG_YIELD(trx) do { \
- row_mysql_unlock_data_dictionary(trx); \
- os_thread_sleep(250000); \
- row_mysql_lock_data_dictionary(trx); \
-} while (0)
-
-/*****************************************************************//**
-Request the background collection of statistics to stop for a table.
-@retval true when no background process is active
-@retval false when it is not safe to modify the table definition */
-UNIV_INLINE
-bool
-dict_stats_stop_bg(
-/*===============*/
- dict_table_t* table) /*!< in/out: table */
-{
- ut_ad(!srv_read_only_mode);
- ut_ad(mutex_own(&dict_sys.mutex));
-
- if (!(table->stats_bg_flag & BG_STAT_IN_PROGRESS)) {
- return(true);
- }
-
- table->stats_bg_flag |= BG_STAT_SHOULD_QUIT;
- return(false);
-}
+/** Delete a table from the auto recalc pool, and ensure that
+no statistics are being updated on it. */
+void dict_stats_recalc_pool_del(table_id_t id, bool have_mdl_exclusive);
/*****************************************************************//**
-Wait until background stats thread has stopped using the specified table.
-The caller must have locked the data dictionary using
-row_mysql_lock_data_dictionary() and this function may unlock it temporarily
-and restore the lock before it exits.
-The background stats thread is guaranteed not to start using the specified
-table after this function returns and before the caller unlocks the data
-dictionary because it sets the BG_STAT_IN_PROGRESS bit in table->stats_bg_flag
-under dict_sys.mutex. */
-void
-dict_stats_wait_bg_to_stop_using_table(
-/*===================================*/
- dict_table_t* table, /*!< in/out: table */
- trx_t* trx); /*!< in/out: transaction to use for
- unlocking/locking the data dict */
-/*****************************************************************//**
Initialize global variables needed for the operation of dict_stats_thread().
Must be called before dict_stats task is started. */
void dict_stats_init();
diff --git a/storage/innobase/include/dict0types.h b/storage/innobase/include/dict0types.h
index 5c4aaf8c87a..ec50e8cd951 100644
--- a/storage/innobase/include/dict0types.h
+++ b/storage/innobase/include/dict0types.h
@@ -27,9 +27,12 @@ Created 1/8/1996 Heikki Tuuri
#ifndef dict0types_h
#define dict0types_h
-#include <ut0mutex.h>
+#include "univ.i"
+#include "span.h"
#include <rem0types.h>
+using st_::span;
+
struct dict_col_t;
struct dict_field_t;
struct dict_index_t;
@@ -68,18 +71,20 @@ enum dict_err_ignore_t {
DICT_ERR_IGNORE_NONE = 0, /*!< no error to ignore */
DICT_ERR_IGNORE_FK_NOKEY = 1, /*!< ignore error if any foreign
key is missing */
- DICT_ERR_IGNORE_INDEX_ROOT = 2, /*!< ignore error if index root
- page is FIL_NULL or incorrect value */
- DICT_ERR_IGNORE_CORRUPT = 4, /*!< skip corrupted indexes */
- DICT_ERR_IGNORE_RECOVER_LOCK = 8 | DICT_ERR_IGNORE_FK_NOKEY,
+ DICT_ERR_IGNORE_INDEX = 2, /*!< ignore corrupted indexes */
+ DICT_ERR_IGNORE_RECOVER_LOCK = 4 | DICT_ERR_IGNORE_FK_NOKEY,
/*!< Used when recovering table locks
for resurrected transactions.
Silently load a missing
tablespace, and do not load
incomplete index definitions. */
/** ignore all errors above */
- DICT_ERR_IGNORE_ALL = 15,
- /** prepare to drop the table; do not attempt to load tablespace */
+ DICT_ERR_IGNORE_ALL = 7,
+ /** prepare some DDL operation;
+ do not attempt to load tablespace */
+ DICT_ERR_IGNORE_TABLESPACE = 15,
+ /** prepare to drop the table; do not attempt to load tablespace
+ or the metadata */
DICT_ERR_IGNORE_DROP = 31
};
@@ -90,18 +95,9 @@ enum ib_quiesce_t {
QUIESCE_COMPLETE /*!< All done */
};
-#ifndef UNIV_INNOCHECKSUM
-typedef ib_mutex_t DictSysMutex;
-#endif /* !UNIV_INNOCHECKSUM */
-
-/** Prefix for tmp tables, adopted from sql/table.h */
-#define TEMP_FILE_PREFIX "#sql"
-#define TEMP_FILE_PREFIX_LENGTH 4
+/** Prefix for InnoDB internal tables, adopted from sql/table.h */
#define TEMP_FILE_PREFIX_INNODB "#sql-ib"
-#define TEMP_TABLE_PREFIX "#sql"
-#define TEMP_TABLE_PATH_PREFIX "/" TEMP_TABLE_PREFIX
-
/** Table name wrapper for pretty-printing */
struct table_name_t
{
@@ -174,4 +170,7 @@ enum spatial_status_t {
SPATIAL_ONLY = 3
};
+#define TABLE_STATS_NAME "mysql/innodb_table_stats"
+#define INDEX_STATS_NAME "mysql/innodb_index_stats"
+
#endif
diff --git a/storage/innobase/include/dyn0buf.h b/storage/innobase/include/dyn0buf.h
index cb8b998f0ea..208e49c34a7 100644
--- a/storage/innobase/include/dyn0buf.h
+++ b/storage/innobase/include/dyn0buf.h
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 2013, 2016, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2018, 2020, MariaDB Corporation.
+Copyright (c) 2018, 2022, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -331,60 +331,6 @@ public:
}
/**
- Iterate over each block and call the functor.
- @return false if iteration was terminated. */
- template <typename Functor>
- bool for_each_block(const Functor& functor) const
- {
- for (typename list_t::iterator it = m_list.begin(),
- end = m_list.end();
- it != end; ++it) {
-
- if (!functor(&*it)) {
- return false;
- }
- }
-
- return(true);
- }
-
- /**
- Iterate over all the blocks in reverse and call the iterator
- @return false if iteration was terminated. */
- template <typename Functor>
- bool for_each_block_in_reverse(Functor& functor) const
- {
- for (list_t::reverse_iterator it = m_list.rbegin(),
- end = m_list.rend();
- it != end; ++it) {
-
- if (!functor(&*it)) {
- return false;
- }
- }
-
- return(true);
- }
-
- /**
- Iterate over all the blocks in reverse and call the iterator
- @return false if iteration was terminated. */
- template <typename Functor>
- bool for_each_block_in_reverse(const Functor& functor) const
- {
- for (list_t::reverse_iterator it = m_list.rbegin(),
- end = m_list.rend();
- it != end; ++it) {
-
- if (!functor(&*it)) {
- return false;
- }
- }
-
- return(true);
- }
-
- /**
@return the first block */
block_t* front()
MY_ATTRIBUTE((warn_unused_result))
diff --git a/storage/innobase/include/fil0crypt.h b/storage/innobase/include/fil0crypt.h
index 62043003a6c..26272761f43 100644
--- a/storage/innobase/include/fil0crypt.h
+++ b/storage/innobase/include/fil0crypt.h
@@ -1,6 +1,6 @@
/*****************************************************************************
Copyright (C) 2013, 2015, Google Inc. All Rights Reserved.
-Copyright (c) 2015, 2020, MariaDB Corporation.
+Copyright (c) 2015, 2021, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -26,7 +26,6 @@ Created 04/01/2015 Jan Lindström
#ifndef fil0crypt_h
#define fil0crypt_h
-#include "os0event.h"
#include "my_crypt.h"
#include "fil0fil.h"
@@ -41,7 +40,8 @@ static const unsigned char CRYPT_MAGIC[MAGIC_SZ] = {
/* This key will be used if nothing else is given */
#define FIL_DEFAULT_ENCRYPTION_KEY ENCRYPTION_KEY_SYSTEM_DATA
-extern os_event_t fil_crypt_threads_event;
+/** Wake up the encryption threads */
+void fil_crypt_threads_signal(bool broadcast= false);
/**
* CRYPT_SCHEME_UNENCRYPTED
@@ -116,7 +116,7 @@ struct fil_space_crypt_t : st_encryption_scheme
{
key_id = new_key_id;
my_random_bytes(iv, sizeof(iv));
- mutex_create(LATCH_ID_FIL_CRYPT_DATA_MUTEX, &mutex);
+ mysql_mutex_init(0, &mutex, nullptr);
locker = crypt_data_scheme_locker;
type = new_type;
@@ -135,7 +135,7 @@ struct fil_space_crypt_t : st_encryption_scheme
/** Destructor */
~fil_space_crypt_t()
{
- mutex_free(&mutex);
+ mysql_mutex_destroy(&mutex);
}
/** Get latest key version from encryption plugin
@@ -172,12 +172,6 @@ struct fil_space_crypt_t : st_encryption_scheme
return (encryption == FIL_ENCRYPTION_OFF);
}
- /** Fill crypt data information to the give page.
- It should be called during ibd file creation.
- @param[in] flags tablespace flags
- @param[in,out] page first page of the tablespace */
- void fill_page0(ulint flags, byte* page);
-
/** Write encryption metadata to the first page.
@param[in,out] block first page of the tablespace
@param[in,out] mtr mini-transaction */
@@ -186,7 +180,7 @@ struct fil_space_crypt_t : st_encryption_scheme
uint min_key_version; // min key version for this space
fil_encryption_t encryption; // Encryption setup
- ib_mutex_t mutex; // mutex protecting following variables
+ mysql_mutex_t mutex; // mutex protecting following variables
/** Return code from encryption_key_get_latest_version.
If ENCRYPTION_KEY_VERSION_INVALID encryption plugin
@@ -213,25 +207,20 @@ struct fil_space_crypt_status_t {
};
/** Statistics about encryption key rotation */
-struct fil_crypt_stat_t {
- ulint pages_read_from_cache;
- ulint pages_read_from_disk;
- ulint pages_modified;
- ulint pages_flushed;
- ulint estimated_iops;
+struct fil_crypt_stat_t
+{
+ ulint pages_read_from_cache= 0;
+ ulint pages_read_from_disk= 0;
+ ulint pages_modified= 0;
+ ulint pages_flushed= 0;
+ ulint estimated_iops= 0;
};
-/*********************************************************************
-Init space crypt */
-UNIV_INTERN
-void
-fil_space_crypt_init();
+/** Init space crypt */
+void fil_space_crypt_init();
-/*********************************************************************
-Cleanup space crypt */
-UNIV_INTERN
-void
-fil_space_crypt_cleanup();
+/** Cleanup space crypt */
+void fil_space_crypt_cleanup();
/**
Create a fil_space_crypt_t object
@@ -241,23 +230,12 @@ Create a fil_space_crypt_t object
@param[in] key_id Encryption key id
@return crypt object */
-UNIV_INTERN
fil_space_crypt_t*
fil_space_create_crypt_data(
fil_encryption_t encrypt_mode,
uint key_id)
MY_ATTRIBUTE((warn_unused_result));
-/******************************************************************
-Merge fil_space_crypt_t object
-@param[in,out] dst Destination cryp data
-@param[in] src Source crypt data */
-UNIV_INTERN
-void
-fil_space_merge_crypt_data(
- fil_space_crypt_t* dst,
- const fil_space_crypt_t* src);
-
/** Initialize encryption parameters from a tablespace header page.
@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
@param[in] page first page of the tablespace
@@ -269,10 +247,7 @@ fil_space_crypt_t* fil_space_read_crypt_data(ulint zip_size, const byte* page)
/**
Free a crypt data object
@param[in,out] crypt_data crypt data to be freed */
-UNIV_INTERN
-void
-fil_space_destroy_crypt_data(
- fil_space_crypt_t **crypt_data);
+void fil_space_destroy_crypt_data(fil_space_crypt_t **crypt_data);
/** Amend encryption information from redo log.
@param[in] space tablespace
@@ -288,7 +263,6 @@ void fil_crypt_parse(fil_space_t* space, const byte* data);
@param[in,out] dst_frame Output buffer
@param[in] use_full_checksum full crc32 algo is used
@return encrypted buffer or NULL */
-UNIV_INTERN
byte*
fil_encrypt_buf(
fil_space_crypt_t* crypt_data,
@@ -315,7 +289,6 @@ byte* fil_space_encrypt(
byte* dst_frame)
MY_ATTRIBUTE((warn_unused_result));
-
/** Decrypt a page.
@param[in] space_id space id
@param[in] crypt_data crypt_data
@@ -323,8 +296,8 @@ byte* fil_space_encrypt(
@param[in] physical_size page size
@param[in] fsp_flags Tablespace flags
@param[in,out] src_frame Page to decrypt
-@return DB_SUCCESS or error */
-UNIV_INTERN
+@retval DB_SUCCESS on success
+@retval DB_DECRYPTION_FAILED on error */
dberr_t
fil_space_decrypt(
ulint space_id,
@@ -340,8 +313,8 @@ Decrypt a page
@param[in] tmp_frame Temporary buffer used for decrypting
@param[in,out] src_frame Page to decrypt
@return decrypted page, or original not encrypted page if decryption is
-not needed.*/
-UNIV_INTERN
+not needed.
+@retval nullptr on failure */
byte*
fil_space_decrypt(
const fil_space_t* space,
@@ -349,39 +322,20 @@ fil_space_decrypt(
byte* src_frame)
MY_ATTRIBUTE((warn_unused_result));
-/**
-Calculate post encryption checksum
-@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
-@param[in] dst_frame Block where checksum is calculated
-@return page checksum
-not needed. */
-uint32_t
-fil_crypt_calculate_checksum(ulint zip_size, const byte* dst_frame)
- MY_ATTRIBUTE((warn_unused_result));
-
/*********************************************************************
Adjust thread count for key rotation
@param[in] new_cnt Number of threads to be used
-UNIV_INTERN
-void
-fil_crypt_set_thread_cnt(
- uint new_cnt);
+void fil_crypt_set_thread_cnt(const uint new_cnt);
/*********************************************************************
Adjust max key age
@param[in] val New max key age */
-UNIV_INTERN
-void
-fil_crypt_set_rotate_key_age(
- uint val);
+void fil_crypt_set_rotate_key_age(uint val);
/*********************************************************************
Adjust rotation iops
@param[in] val New max rotation iops
-UNIV_INTERN
-void
-fil_crypt_set_rotation_iops(
- uint val);
+void fil_crypt_set_rotation_iops(uint val);
/*********************************************************************
Adjust encrypt tables
@@ -390,30 +344,22 @@ void fil_crypt_set_encrypt_tables(ulong val);
/*********************************************************************
Init threads for key rotation */
-UNIV_INTERN
-void
-fil_crypt_threads_init();
+void fil_crypt_threads_init();
/*********************************************************************
Clean up key rotation threads resources */
-UNIV_INTERN
-void
-fil_crypt_threads_cleanup();
+void fil_crypt_threads_cleanup();
/*********************************************************************
Wait for crypt threads to stop accessing space
@param[in] space Tablespace */
-UNIV_INTERN
-void
-fil_space_crypt_close_tablespace(
- const fil_space_t* space);
+void fil_space_crypt_close_tablespace(const fil_space_t *space);
/*********************************************************************
Get crypt status for a space (used by information_schema)
@param[in] space Tablespace
@param[out] status Crypt status
return 0 if crypt data present */
-UNIV_INTERN
void
fil_space_crypt_get_status(
const fil_space_t* space,
@@ -422,10 +368,7 @@ fil_space_crypt_get_status(
/*********************************************************************
Return crypt statistics
@param[out] stat Crypt statistics */
-UNIV_INTERN
-void
-fil_crypt_total_stat(
- fil_crypt_stat_t *stat);
+void fil_crypt_total_stat(fil_crypt_stat_t *stat);
#include "fil0crypt.inl"
#endif /* !UNIV_INNOCHECKSUM */
diff --git a/storage/innobase/include/fil0fil.h b/storage/innobase/include/fil0fil.h
index 385d547a060..165994eef35 100644
--- a/storage/innobase/include/fil0fil.h
+++ b/storage/innobase/include/fil0fil.h
@@ -24,8 +24,7 @@ The low-level file system
Created 10/25/1995 Heikki Tuuri
*******************************************************/
-#ifndef fil0fil_h
-#define fil0fil_h
+#pragma once
#include "fsp0types.h"
#include "mach0data.h"
@@ -33,6 +32,7 @@ Created 10/25/1995 Heikki Tuuri
#ifndef UNIV_INNOCHECKSUM
+#include "srw_lock.h"
#include "buf0dblwr.h"
#include "hash0hash.h"
#include "log0recv.h"
@@ -43,6 +43,10 @@ Created 10/25/1995 Heikki Tuuri
struct unflushed_spaces_tag_t;
struct rotation_list_tag_t;
+struct space_list_tag_t;
+struct named_spaces_tag_t;
+
+using space_list_t= ilist<fil_space_t, space_list_tag_t>;
// Forward declaration
extern my_bool srv_use_doublewrite_buf;
@@ -60,13 +64,11 @@ enum srv_flush_t
/** do not flush after writing */
SRV_NOSYNC,
/** invoke os_file_set_nocache() on data files. This implies using
- non-buffered IO but still using fsync, the reason for which is that
- some FS do not flush meta-data when unbuffered IO happens */
+ unbuffered I/O but still fdatasync(), because some filesystems might
+ not flush meta-data on write completion */
SRV_O_DIRECT,
- /** do not use fsync() when using direct IO i.e.: it can be set to
- avoid the fsync() call that we make when using SRV_UNIX_O_DIRECT.
- However, in this case user/DBA should be sure about the integrity of
- the meta-data */
+ /** Like O_DIRECT, but skip fdatasync(), assuming that the data is
+ durable on write completion */
SRV_O_DIRECT_NO_FSYNC
#ifdef _WIN32
/** Traditional Windows appoach to open all files without caching,
@@ -332,18 +334,27 @@ enum fil_encryption_t
FIL_ENCRYPTION_OFF
};
-struct fil_space_t final :
- ilist_node<unflushed_spaces_tag_t>, ilist_node<rotation_list_tag_t>
+struct fil_space_t final : ilist_node<unflushed_spaces_tag_t>,
+ ilist_node<rotation_list_tag_t>,
+ ilist_node<space_list_tag_t>,
+ ilist_node<named_spaces_tag_t>
#else
struct fil_space_t final
#endif
{
#ifndef UNIV_INNOCHECKSUM
- ~fil_space_t() { ut_free(name); }
friend fil_node_t;
+ ~fil_space_t()
+ {
+ ut_ad(!latch_owner);
+ ut_ad(!latch_count);
+ latch.destroy();
+ }
+
ulint id; /*!< space id */
- hash_node_t hash; /*!< hash chain node */
- char* name; /*!< Tablespace name */
+
+ /** fil_system.spaces chain node */
+ fil_space_t *hash;
lsn_t max_lsn;
/*!< LSN of the most recent
fil_names_write_if_was_clean().
@@ -373,6 +384,14 @@ struct fil_space_t final
/*!< number of reserved free extents for
ongoing operations like B-tree page split */
private:
+#ifdef UNIV_DEBUG
+ fil_space_t *next_in_space_list();
+ fil_space_t *prev_in_space_list();
+
+ fil_space_t *next_in_unflushed_spaces();
+ fil_space_t *prev_in_unflushed_spaces();
+#endif
+
/** the committed size of the tablespace in pages */
Atomic_relaxed<uint32_t> committed_size;
/** Number of pending operations on the file.
@@ -390,49 +409,44 @@ private:
static constexpr uint32_t NEEDS_FSYNC= 1U << 29;
/** The reference count */
static constexpr uint32_t PENDING= ~(STOPPING | CLOSING | NEEDS_FSYNC);
+ /** latch protecting all page allocation bitmap pages */
+ srw_lock latch;
+ pthread_t latch_owner;
+ ut_d(Atomic_relaxed<uint32_t> latch_count;)
public:
- rw_lock_t latch; /*!< latch protecting the file space storage
- allocation */
- UT_LIST_NODE_T(fil_space_t) named_spaces;
- /*!< list of spaces for which FILE_MODIFY
- records have been issued */
- UT_LIST_NODE_T(fil_space_t) space_list;
- /*!< list of all spaces */
+ /** MariaDB encryption data */
+ fil_space_crypt_t *crypt_data;
- /** MariaDB encryption data */
- fil_space_crypt_t* crypt_data;
+ /** Whether needs_flush(), or this is in fil_system.unflushed_spaces */
+ bool is_in_unflushed_spaces;
- /** Checks that this tablespace in a list of unflushed tablespaces. */
- bool is_in_unflushed_spaces;
+  /** Whether this is in fil_system.default_encrypt_tables (needs key rotation) */
+ bool is_in_default_encrypt;
- /** Checks that this tablespace needs key rotation. */
- bool is_in_default_encrypt;
-
- /** True if the device this filespace is on supports atomic writes */
- bool atomic_write_supported;
+private:
+  /** Whether any corruption of this tablespace has been reported */
+ mutable std::atomic_flag is_corrupted;
- /** True if file system storing this tablespace supports
- punch hole */
- bool punch_hole;
+ /** mutex to protect freed_ranges and last_freed_lsn */
+ std::mutex freed_range_mutex;
- /** mutex to protect freed ranges */
- std::mutex freed_range_mutex;
+ /** Ranges of freed page numbers; protected by freed_range_mutex */
+ range_set freed_ranges;
- /** Variables to store freed ranges. This can be used to write
- zeroes/punch the hole in files. Protected by freed_mutex */
- range_set freed_ranges;
+ /** LSN of freeing last page; protected by freed_range_mutex */
+ lsn_t last_freed_lsn;
- /** Stores last page freed lsn. Protected by freed_mutex */
- lsn_t last_freed_lsn;
+public:
+ /** @return whether doublewrite buffering is needed */
+ inline bool use_doublewrite() const;
- ulint magic_n;/*!< FIL_SPACE_MAGIC_N */
+ /** @return whether a page has been freed */
+ inline bool is_freed(uint32_t page);
- /** @return whether doublewrite buffering is needed */
- bool use_doublewrite() const
- {
- return !atomic_write_supported && srv_use_doublewrite_buf &&
- buf_dblwr.is_initialised();
- }
+ /** Apply freed_ranges to the file.
+ @param writable whether the file is writable
+ @return number of pages written or hole-punched */
+ uint32_t flush_freed(bool writable);
/** Append a file to the chain of files of a space.
@param[in] name file name of a file that is not open
@@ -459,7 +473,6 @@ public:
@return whether the reservation succeeded */
bool reserve_free_extents(uint32_t n_free_now, uint32_t n_to_reserve)
{
- ut_ad(rw_lock_own(&latch, RW_LOCK_X));
if (n_reserved_extents + n_to_reserve > n_free_now) {
return false;
}
@@ -473,26 +486,27 @@ public:
void release_free_extents(uint32_t n_reserved)
{
if (!n_reserved) return;
- ut_ad(rw_lock_own(&latch, RW_LOCK_X));
ut_a(n_reserved_extents >= n_reserved);
n_reserved_extents -= n_reserved;
}
- /** Rename a file.
- @param[in] name table name after renaming
- @param[in] path tablespace file name after renaming
- @param[in] log whether to write redo log
- @param[in] replace whether to ignore the existence of path
- @return error code
- @retval DB_SUCCESS on success */
- dberr_t rename(const char* name, const char* path, bool log,
- bool replace = false);
+ /** Rename a file.
+ @param[in] path tablespace file name after renaming
+ @param[in] log whether to write redo log
+ @param[in] replace whether to ignore the existence of path
+ @return error code
+ @retval DB_SUCCESS on success */
+ dberr_t rename(const char *path, bool log, bool replace= false)
+ MY_ATTRIBUTE((nonnull));
/** Note that the tablespace has been imported.
Initially, purpose=FIL_TYPE_IMPORT so that no redo log is
written while the space ID is being updated in each page. */
inline void set_imported();
+ /** Report the tablespace as corrupted */
+ ATTRIBUTE_COLD void set_corrupted() const;
+
/** @return whether the storage device is rotational (HDD, not SSD) */
inline bool is_rotational() const;
@@ -503,8 +517,20 @@ public:
/** Close each file. Only invoked on fil_system.temp_space. */
void close();
- /** Note that operations on the tablespace must stop or can resume */
- inline void set_stopping(bool stopping);
+ /** Note that operations on the tablespace must stop.
+ @return whether the operations were already stopped */
+ inline bool set_stopping_check();
+ /** Note that operations on the tablespace must stop. */
+ inline void set_stopping();
+
+ /** Note that operations on the tablespace can resume after truncation */
+ inline void clear_stopping();
+
+ /** Look up the tablespace and wait for pending operations to cease
+ @param id tablespace identifier
+ @return tablespace
+ @retval nullptr if no tablespace was found */
+ static fil_space_t *check_pending_operations(ulint id);
private:
MY_ATTRIBUTE((warn_unused_result))
@@ -549,9 +575,35 @@ public:
/** Clear the NEEDS_FSYNC flag */
void clear_flush()
- { n_pending.fetch_and(~NEEDS_FSYNC, std::memory_order_release); }
+ {
+#if defined __GNUC__ && (defined __i386__ || defined __x86_64__)
+ static_assert(NEEDS_FSYNC == 1U << 29, "compatibility");
+ __asm__ __volatile__("lock btrl $29, %0" : "+m" (n_pending));
+#elif defined _MSC_VER && (defined _M_IX86 || defined _M_X64)
+ static_assert(NEEDS_FSYNC == 1U << 29, "compatibility");
+ _interlockedbittestandreset(reinterpret_cast<volatile long*>
+ (&n_pending), 29);
+#else
+ n_pending.fetch_and(~NEEDS_FSYNC, std::memory_order_release);
+#endif
+ }
private:
+ /** Clear the CLOSING flag */
+ void clear_closing()
+ {
+#if defined __GNUC__ && (defined __i386__ || defined __x86_64__)
+ static_assert(CLOSING == 1U << 30, "compatibility");
+ __asm__ __volatile__("lock btrl $30, %0" : "+m" (n_pending));
+#elif defined _MSC_VER && (defined _M_IX86 || defined _M_X64)
+ static_assert(CLOSING == 1U << 30, "compatibility");
+ _interlockedbittestandreset(reinterpret_cast<volatile long*>
+ (&n_pending), 30);
+#else
+ n_pending.fetch_and(~CLOSING, std::memory_order_relaxed);
+#endif
+ }
+
/** @return pending operations (and flags) */
uint32_t pending()const { return n_pending.load(std::memory_order_acquire); }
public:
@@ -576,8 +628,7 @@ private:
@return number of pending operations, possibly with NEEDS_FSYNC flag */
uint32_t set_closing()
{
- return n_pending.fetch_or(CLOSING, std::memory_order_acquire) &
- (PENDING | NEEDS_FSYNC);
+ return n_pending.fetch_or(CLOSING, std::memory_order_acquire);
}
public:
@@ -901,7 +952,6 @@ public:
#ifndef UNIV_INNOCHECKSUM
MY_ATTRIBUTE((warn_unused_result))
/** Create a tablespace in fil_system.
- @param name tablespace name
@param id tablespace identifier
@param flags tablespace flags
@param purpose tablespace purpose
@@ -910,7 +960,7 @@ public:
@param opened true if space files are opened
@return pointer to created tablespace, to be filled in with add()
@retval nullptr on failure (such as when the same tablespace exists) */
- static fil_space_t *create(const char *name, ulint id, ulint flags,
+ static fil_space_t *create(ulint id, ulint flags,
fil_type_t purpose, fil_space_crypt_t *crypt_data,
fil_encryption_t mode= FIL_ENCRYPTION_DEFAULT,
bool opened= false);
@@ -959,11 +1009,7 @@ public:
}
/** Update committed_size in mtr_t::commit() */
- void set_committed_size()
- {
- ut_ad(rw_lock_own(&latch, RW_LOCK_X));
- committed_size= size;
- }
+ void set_committed_size() { committed_size= size; }
/** @return the last persisted page number */
uint32_t last_page_number() const { return committed_size - 1; }
@@ -996,8 +1042,49 @@ public:
@param encrypt expected state of innodb_encrypt_tables
@return the next tablespace
@retval nullptr upon reaching the end of the iteration */
- static inline fil_space_t *next(fil_space_t *space, bool recheck,
- bool encrypt);
+ static space_list_t::iterator next(space_list_t::iterator space,
+ bool recheck, bool encrypt);
+
+#ifdef UNIV_DEBUG
+ bool is_latched() const { return latch_count != 0; }
+#endif
+ bool is_owner() const { return latch_owner == pthread_self(); }
+ /** Acquire the allocation latch in exclusive mode */
+ void x_lock()
+ {
+ latch.wr_lock(SRW_LOCK_CALL);
+ ut_ad(!latch_owner);
+ latch_owner= pthread_self();
+ ut_ad(!latch_count.fetch_add(1));
+ }
+ /** Release the allocation latch from exclusive mode */
+ void x_unlock()
+ {
+ ut_ad(latch_count.fetch_sub(1) == 1);
+ ut_ad(latch_owner == pthread_self());
+ latch_owner= 0;
+ latch.wr_unlock();
+ }
+ /** Acquire the allocation latch in shared mode */
+ void s_lock()
+ {
+ ut_ad(!is_owner());
+ latch.rd_lock(SRW_LOCK_CALL);
+ ut_ad(!latch_owner);
+ ut_d(latch_count.fetch_add(1));
+ }
+ /** Release the allocation latch from shared mode */
+ void s_unlock()
+ {
+ ut_ad(latch_count.fetch_sub(1));
+ ut_ad(!latch_owner);
+ latch.rd_unlock();
+ }
+
+ typedef span<const char> name_type;
+
+ /** @return the tablespace name (databasename/tablename) */
+ name_type name() const;
private:
/** @return whether the file is usable for io() */
@@ -1008,62 +1095,59 @@ private:
};
#ifndef UNIV_INNOCHECKSUM
-/** Value of fil_space_t::magic_n */
-#define FIL_SPACE_MAGIC_N 89472
-
/** File node of a tablespace or the log data space */
struct fil_node_t final
{
- /** tablespace containing this file */
- fil_space_t* space;
- /** file name; protected by fil_system.mutex and log_sys.mutex. */
- char* name;
- /** file handle (valid if is_open) */
- pfs_os_file_t handle;
- /** whether the file actually is a raw device or disk partition */
- bool is_raw_disk;
- /** whether the file is on non-rotational media (SSD) */
- bool on_ssd;
- /** size of the file in database pages (0 if not known yet);
- the possible last incomplete megabyte may be ignored
- if space->id == 0 */
- uint32_t size;
- /** initial size of the file in database pages;
- FIL_IBD_FILE_INITIAL_SIZE by default */
- uint32_t init_size;
- /** maximum size of the file in database pages (0 if unlimited) */
- uint32_t max_size;
- /** whether the file is currently being extended */
- Atomic_relaxed<bool> being_extended;
- /** link to other files in this tablespace */
- UT_LIST_NODE_T(fil_node_t) chain;
-
- /** whether this file could use atomic write (data file) */
- bool atomic_write;
-
- /** Filesystem block size */
- ulint block_size;
-
- /** FIL_NODE_MAGIC_N */
- ulint magic_n;
-
- /** @return whether this file is open */
- bool is_open() const
- {
- return(handle != OS_FILE_CLOSED);
- }
+ /** tablespace containing this file */
+ fil_space_t *space;
+ /** file name; protected by fil_system.mutex and log_sys.mutex */
+ char *name;
+ /** file handle */
+ pfs_os_file_t handle;
+ /** whether the file is on non-rotational media (SSD) */
+ unsigned on_ssd:1;
+ /** how to write page_compressed tables
+ (0=do not punch holes but write minimal amount of data, 1=punch holes,
+ 2=always write the same amount; thinly provisioned storage will compress) */
+ unsigned punch_hole:2;
+ /** whether this file could use atomic write */
+ unsigned atomic_write:1;
+ /** whether the file actually is a raw device or disk partition */
+ unsigned is_raw_disk:1;
+ /** whether the tablespace discovery is being deferred during crash
+ recovery due to incompletely written page 0 */
+ unsigned deferred:1;
+
+ /** size of the file in database pages (0 if not known yet);
+ the possible last incomplete megabyte may be ignored if space->id == 0 */
+ uint32_t size;
+ /** initial size of the file in database pages;
+ FIL_IBD_FILE_INITIAL_SIZE by default */
+ uint32_t init_size;
+ /** maximum size of the file in database pages (0 if unlimited) */
+ uint32_t max_size;
+ /** whether the file is currently being extended */
+ Atomic_relaxed<bool> being_extended;
+ /** link to other files in this tablespace */
+ UT_LIST_NODE_T(fil_node_t) chain;
+
+ /** Filesystem block size */
+ ulint block_size;
+
+ /** @return whether this file is open */
+ bool is_open() const { return handle != OS_FILE_CLOSED; }
- /** Read the first page of a data file.
- @return whether the page was found valid */
- bool read_page0();
+ /** Read the first page of a data file.
+ @return whether the page was found valid */
+ bool read_page0();
- /** Determine some file metadata when creating or reading the file.
- @param file the file that is being created, or OS_FILE_CLOSED */
- void find_metadata(os_file_t file = OS_FILE_CLOSED
+ /** Determine some file metadata when creating or reading the file.
+ @param file the file that is being created, or OS_FILE_CLOSED */
+ void find_metadata(os_file_t file= OS_FILE_CLOSED
#ifndef _WIN32
- , struct stat* statbuf = NULL
+ , bool create= false, struct stat *statbuf= nullptr
#endif
- );
+ );
/** Close the file handle. */
void close();
@@ -1082,8 +1166,11 @@ private:
void prepare_to_close_or_detach();
};
-/** Value of fil_node_t::magic_n */
-#define FIL_NODE_MAGIC_N 89389
+inline bool fil_space_t::use_doublewrite() const
+{
+ return !UT_LIST_GET_FIRST(chain)->atomic_write && srv_use_doublewrite_buf &&
+ buf_dblwr.is_created();
+}
inline void fil_space_t::set_imported()
{
@@ -1113,8 +1200,8 @@ extern const char* dot_ext[];
#define DOT_ISL dot_ext[ISL]
#define DOT_CFG dot_ext[CFG]
-/** When mysqld is run, the default directory "." is the mysqld datadir,
-but in the MySQL Embedded Server Library and mysqlbackup it is not the default
+/** When mariadbd is run, the default directory "." is the mysqld datadir,
+but in the MariaDB Embedded Server Library and mysqlbackup it is not the default
directory, and we must set the base file path explicitly */
extern const char* fil_path_to_mysql_datadir;
#else
@@ -1183,8 +1270,9 @@ struct fil_addr_t {
/** For the first page in a system tablespace data file(ibdata*, not *.ibd):
the file has been flushed to disk at least up to this lsn
-For other pages: 32-bit key version used to encrypt the page + 32-bit checksum
-or 64 bites of zero if no encryption */
+For other pages of tablespaces not in innodb_checksum_algorithm=full_crc32
+format: 32-bit key version used to encrypt the page + 32-bit checksum
+or 64 bits of zero if no encryption */
#define FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION 26U
/** This overloads FIL_PAGE_FILE_FLUSH_LSN for RTREE Split Sequence Number */
@@ -1344,11 +1432,7 @@ struct fil_system_t {
Some members may require late initialisation, thus we just mark object as
uninitialised. Real initialisation happens in create().
*/
- fil_system_t(): m_initialised(false)
- {
- UT_LIST_INIT(space_list, &fil_space_t::space_list);
- UT_LIST_INIT(named_spaces, &fil_space_t::named_spaces);
- }
+ fil_system_t() : m_initialised(false) {}
bool is_initialised() const { return m_initialised; }
@@ -1388,12 +1472,13 @@ public:
public:
/** Detach a tablespace from the cache and close the files.
@param space tablespace
- @param detach_handle whether to detach or close handles
- @return detached handles or empty vector */
- std::vector<pfs_os_file_t> detach(fil_space_t *space,
- bool detach_handle= false);
+ @param detach_handle whether to detach the handle, instead of closing
+ @return detached handle
+ @retval OS_FILE_CLOSED if no handle was detached */
+ pfs_os_file_t detach(fil_space_t *space, bool detach_handle= false);
- ib_mutex_t mutex; /*!< The mutex protecting the cache */
+ /** the mutex protecting most data fields, and some fields of fil_space_t */
+ mysql_mutex_t mutex;
fil_space_t* sys_space; /*!< The innodb_system tablespace */
fil_space_t* temp_space; /*!< The innodb_temporary tablespace */
/** Map of fil_space_t::id to fil_space_t* */
@@ -1413,12 +1498,11 @@ public:
/** nonzero if fil_node_open_file_low() should avoid moving the tablespace
to the end of space_list, for FIFO policy of try_to_close() */
ulint freeze_space_list;
-
/** List of all file spaces, opened spaces should be at the top of the list
to optimize try_to_close() execution. Protected with fil_system.mutex. */
- UT_LIST_BASE_NODE_T(fil_space_t) space_list;
+ ilist<fil_space_t, space_list_tag_t> space_list;
- UT_LIST_BASE_NODE_T(fil_space_t) named_spaces;
+ ilist<fil_space_t, named_spaces_tag_t> named_spaces;
/*!< list of all file spaces
for which a FILE_MODIFY
record has been written since
@@ -1443,7 +1527,7 @@ public:
fil_system.space_list, so that fil_space_t::try_to_close() should close
it as a last resort.
@param space space to move */
- void move_opened_last_to_space_list(fil_space_t *space)
+ inline void move_opened_last_to_space_list(fil_space_t *space)
{
/* In the case when several files of the same space are added in a
row, there is no need to remove and add a space to the same position
@@ -1451,8 +1535,7 @@ public:
if (freeze_space_list || space_list_last_opened == space)
return;
- UT_LIST_REMOVE(space_list, space);
-
+ space_list.erase(space_list_t::iterator(space));
add_opened_last_to_space_list(space);
}
@@ -1465,10 +1548,17 @@ public:
if (UNIV_UNLIKELY(freeze_space_list))
return;
+ space_list_t::iterator s= space_list_t::iterator(space);
+
if (space_list_last_opened == space)
- space_list_last_opened= UT_LIST_GET_PREV(space_list, space);
- UT_LIST_REMOVE(space_list, space);
- UT_LIST_ADD_LAST(space_list, space);
+ {
+ ut_ad(s != space_list.begin());
+ space_list_t::iterator prev= s;
+ space_list_last_opened= &*--prev;
+ }
+
+ space_list.erase(s);
+ space_list.push_back(*space);
}
/** Return the next tablespace from default_encrypt_tables list.
@@ -1484,6 +1574,11 @@ public:
/** Extend all open data files to the recovered size */
ATTRIBUTE_COLD void extend_to_recv_size();
+
+ /** Determine if a tablespace associated with a file name exists.
+ @param path tablespace file name to look for
+ @return a matching tablespace */
+ inline fil_space_t *find(const char *path) const;
};
/** The tablespace memory cache. */
@@ -1492,23 +1587,65 @@ extern fil_system_t fil_system;
inline void fil_space_t::reacquire()
{
ut_d(uint32_t n=) n_pending.fetch_add(1, std::memory_order_relaxed);
- ut_d(if (mutex_own(&fil_system.mutex)) return);
+#ifdef SAFE_MUTEX
+ if (mysql_mutex_is_owner(&fil_system.mutex)) return;
ut_ad(n & PENDING);
ut_ad(UT_LIST_GET_FIRST(chain)->is_open());
+#endif /* SAFE_MUTEX */
+}
+
+/** Note that operations on the tablespace must stop.
+@return whether the operations were already stopped */
+inline bool fil_space_t::set_stopping_check()
+{
+ mysql_mutex_assert_owner(&fil_system.mutex);
+#if (defined __clang_major__ && __clang_major__ < 10) || defined __APPLE_CC__
+ /* Only clang-10 introduced support for asm goto */
+ return n_pending.fetch_or(STOPPING, std::memory_order_relaxed) & STOPPING;
+#elif defined __GNUC__ && (defined __i386__ || defined __x86_64__)
+ static_assert(STOPPING == 1U << 31, "compatibility");
+ __asm__ goto("lock btsl $31, %0\t\njnc %l1" : : "m" (n_pending)
+ : "cc", "memory" : not_stopped);
+ return true;
+not_stopped:
+ return false;
+#elif defined _MSC_VER && (defined _M_IX86 || defined _M_X64)
+ static_assert(STOPPING == 1U << 31, "compatibility");
+ return _interlockedbittestandset(reinterpret_cast<volatile long*>
+ (&n_pending), 31);
+#else
+ return n_pending.fetch_or(STOPPING, std::memory_order_relaxed) & STOPPING;
+#endif
}
-/** Note that operations on the tablespace must stop or can resume */
-inline void fil_space_t::set_stopping(bool stopping)
+/** Note that operations on the tablespace must stop.
+This is the non-reporting variant of set_stopping_check(). */
+inline void fil_space_t::set_stopping()
{
- ut_ad(mutex_own(&fil_system.mutex));
- ut_d(auto n=) n_pending.fetch_xor(STOPPING, std::memory_order_relaxed);
- ut_ad(!(n & STOPPING) == stopping);
+ mysql_mutex_assert_owner(&fil_system.mutex);
+#if defined __GNUC__ && (defined __i386__ || defined __x86_64__)
+ static_assert(STOPPING == 1U << 31, "compatibility");
+ __asm__ __volatile__("lock btsl $31, %0" : "+m" (n_pending));
+#elif defined _MSC_VER && (defined _M_IX86 || defined _M_X64)
+ static_assert(STOPPING == 1U << 31, "compatibility");
+ _interlockedbittestandset(reinterpret_cast<volatile long*>(&n_pending), 31);
+#else
+ n_pending.fetch_or(STOPPING, std::memory_order_relaxed);
+#endif
+}
+
+inline void fil_space_t::clear_stopping()
+{
+ mysql_mutex_assert_owner(&fil_system.mutex);
+ static_assert(STOPPING == 1U << 31, "compatibility");
+ ut_d(auto n=) n_pending.fetch_sub(STOPPING, std::memory_order_relaxed);
+ ut_ad(n & STOPPING);
}
/** Flush pending writes from the file system cache to the file. */
template<bool have_reference> inline void fil_space_t::flush()
{
- ut_ad(!mutex_own(&fil_system.mutex));
+ mysql_mutex_assert_not_owner(&fil_system.mutex);
ut_ad(!have_reference || (pending() & PENDING));
ut_ad(purpose == FIL_TYPE_TABLESPACE || purpose == FIL_TYPE_IMPORT);
if (srv_file_flush_method == SRV_O_DIRECT_NO_FSYNC)
@@ -1531,9 +1668,9 @@ inline uint32_t fil_space_t::get_size()
{
if (!size)
{
- mutex_enter(&fil_system.mutex);
+ mysql_mutex_lock(&fil_system.mutex);
read_page0();
- mutex_exit(&fil_system.mutex);
+ mysql_mutex_unlock(&fil_system.mutex);
}
return size;
}
@@ -1585,20 +1722,12 @@ fil_write_flushed_lsn(
lsn_t lsn)
MY_ATTRIBUTE((warn_unused_result));
+MY_ATTRIBUTE((warn_unused_result))
/** Delete a tablespace and associated .ibd file.
-@param[in] id tablespace identifier
-@param[in] if_exists whether to ignore missing tablespace
-@param[out] leaked_handles return detached handles here
-@return DB_SUCCESS or error */
-dberr_t
-fil_delete_tablespace(ulint id, bool if_exists= false,
- std::vector<pfs_os_file_t> *detached_handles= nullptr);
-
-/** Prepare to truncate an undo tablespace.
-@param[in] space_id undo tablespace id
-@return the tablespace
-@retval NULL if the tablespace does not exist */
-fil_space_t* fil_truncate_prepare(ulint space_id);
+@param id tablespace identifier
+@return detached file handle (to be closed by the caller)
+@return OS_FILE_CLOSED if no file existed */
+pfs_os_file_t fil_delete_tablespace(ulint id);
/** Close a single-table tablespace on failed IMPORT TABLESPACE.
The tablespace must be cached in the memory cache.
@@ -1609,15 +1738,15 @@ void fil_close_tablespace(ulint id);
Allocates and builds a file name from a path, a table or tablespace name
and a suffix. The string must be freed by caller with ut_free().
@param[in] path NULL or the directory path or the full path and filename.
-@param[in] name NULL if path is full, or Table/Tablespace name
-@param[in] suffix NULL or the file extention to use.
+@param[in] name {} if path is full, or Table/Tablespace name
+@param[in] ext the file extension to use
+@param[in] trim_name true if the last name on the path should be trimmed.
@return own: file name */
-char*
-fil_make_filepath(
- const char* path,
- const char* name,
- ib_extention suffix,
- bool strip_name);
+char* fil_make_filepath(const char *path, const fil_space_t::name_type &name,
+ ib_extention ext, bool trim_name);
+
+char *fil_make_filepath(const char* path, const table_name_t name,
+ ib_extention suffix, bool strip_name);
/** Create a tablespace file.
@param[in] space_id Tablespace ID
@@ -1634,14 +1763,14 @@ must be >= FIL_IBD_FILE_INITIAL_SIZE
fil_space_t*
fil_ibd_create(
ulint space_id,
- const char* name,
+ const table_name_t name,
const char* path,
ulint flags,
uint32_t size,
fil_encryption_t mode,
uint32_t key_id,
dberr_t* err)
- MY_ATTRIBUTE((nonnull(2,8), warn_unused_result));
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
/** Try to adjust FSP_SPACE_FLAGS if they differ from the expectations.
(Typically when upgrading from MariaDB 10.1.0..10.1.20.)
@@ -1655,7 +1784,7 @@ right in it. If does not succeed, prints an error message to the .err log. This
function is used to open a tablespace when we start up mysqld, and also in
IMPORT TABLESPACE.
NOTE that we assume this operation is used either at the database startup
-or under the protection of the dictionary mutex, so that two users cannot
+or under the protection of dict_sys.latch, so that two users cannot
race here. This operation does not leave the file associated with the
tablespace open, but closes it after we have looked at the space id in it.
@@ -1666,15 +1795,11 @@ file inode probably is much faster (the OS caches them) than accessing
the first page of the file. This boolean may be initially false, but if
a remote tablespace is found it will be changed to true.
-If the fix_dict boolean is set, then it is safe to use an internal SQL
-statement to update the dictionary tables if they are incorrect.
-
-@param[in] validate true if we should validate the tablespace
-@param[in] fix_dict true if the dictionary is available to be fixed
+@param[in] validate 0=maybe missing, 1=do not validate, 2=validate
@param[in] purpose FIL_TYPE_TABLESPACE or FIL_TYPE_TEMPORARY
@param[in] id tablespace ID
@param[in] flags expected FSP_SPACE_FLAGS
-@param[in] tablename table name
+@param[in] name table name
If file-per-table, it is the table name in the databasename/tablename format
@param[in] path_in expected filepath, usually read from dictionary
@param[out] err DB_SUCCESS or error code
@@ -1682,12 +1807,11 @@ If file-per-table, it is the table name in the databasename/tablename format
@retval NULL if the tablespace could not be opened */
fil_space_t*
fil_ibd_open(
- bool validate,
- bool fix_dict,
+ unsigned validate,
fil_type_t purpose,
ulint id,
ulint flags,
- const table_name_t& tablename,
+ fil_space_t::name_type name,
const char* path_in,
dberr_t* err = NULL)
MY_ATTRIBUTE((warn_unused_result));
@@ -1700,7 +1824,9 @@ enum fil_load_status {
/** The file(s) were not found */
FIL_LOAD_NOT_FOUND,
/** The file(s) were not valid */
- FIL_LOAD_INVALID
+ FIL_LOAD_INVALID,
+ /** The tablespace file was deferred to open */
+ FIL_LOAD_DEFER
};
/** Open a single-file tablespace and add it to the InnoDB data structures.
@@ -1720,15 +1846,10 @@ fil_ibd_load(
memory cache. Note that if we have not done a crash recovery at the database
startup, there may be many tablespaces which are not yet in the memory cache.
@param[in] id Tablespace ID
-@param[in] name Tablespace name used in fil_space_t::create().
@param[in] table_flags table flags
@return the tablespace
@retval NULL if no matching tablespace exists in the memory cache */
-fil_space_t*
-fil_space_for_table_exists_in_mem(
- ulint id,
- const char* name,
- ulint table_flags);
+fil_space_t *fil_space_for_table_exists_in_mem(ulint id, ulint table_flags);
/** Try to extend a tablespace if it is smaller than the specified size.
@param[in,out] space tablespace
@@ -1759,31 +1880,6 @@ fil_delete_file(
/*============*/
const char* path); /*!< in: filepath of the ibd tablespace */
-/********************************************************************//**
-Looks for a pre-existing fil_space_t with the given tablespace ID
-and, if found, returns the name and filepath in newly allocated buffers that the caller must free.
-@param[in] space_id The tablespace ID to search for.
-@param[out] name Name of the tablespace found.
-@param[out] fileapth The filepath of the first datafile for thtablespace found.
-@return true if tablespace is found, false if not. */
-bool
-fil_space_read_name_and_filepath(
- ulint space_id,
- char** name,
- char** filepath);
-
-/** Convert a file name to a tablespace name.
-@param[in] filename directory/databasename/tablename.ibd
-@return database/tablename string, to be freed with ut_free() */
-char*
-fil_path_to_space_name(
- const char* filename);
-
-/** Acquire the fil_system mutex. */
-#define fil_system_enter() mutex_enter(&fil_system.mutex)
-/** Release the fil_system mutex. */
-#define fil_system_exit() mutex_exit(&fil_system.mutex)
-
/*******************************************************************//**
Returns the table space by a given id, NULL if not found. */
fil_space_t*
@@ -1848,11 +1944,6 @@ void test_make_filepath();
@param[in] space tablespace
@param[in] offset page number
@return block size */
-UNIV_INTERN
-ulint
-fil_space_get_block_size(const fil_space_t* space, unsigned offset);
+ulint fil_space_get_block_size(const fil_space_t* space, unsigned offset);
-#include "fil0fil.inl"
#endif /* UNIV_INNOCHECKSUM */
-
-#endif /* fil0fil_h */
diff --git a/storage/innobase/include/fsp0file.h b/storage/innobase/include/fsp0file.h
index 7db85e87ed0..8c11d61c5aa 100644
--- a/storage/innobase/include/fsp0file.h
+++ b/storage/innobase/include/fsp0file.h
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 2013, 2016, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2018, 2020, MariaDB Corporation.
+Copyright (c) 2018, 2021, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -49,7 +49,6 @@ public:
Datafile()
:
- m_name(),
m_filepath(),
m_filename(),
m_handle(),
@@ -68,9 +67,8 @@ public:
/* No op */
}
- Datafile(const char* name, ulint flags, uint32_t size, ulint order)
+ Datafile(ulint flags, uint32_t size, ulint order)
:
- m_name(mem_strdup(name)),
m_filepath(),
m_filename(),
m_handle(),
@@ -86,8 +84,6 @@ public:
m_last_os_error(),
m_file_info()
{
- ut_ad(m_name != NULL);
- /* No op */
}
Datafile(const Datafile& file)
@@ -105,9 +101,6 @@ public:
m_last_os_error(),
m_file_info()
{
- m_name = mem_strdup(file.m_name);
- ut_ad(m_name != NULL);
-
if (file.m_filepath != NULL) {
m_filepath = mem_strdup(file.m_filepath);
ut_a(m_filepath != NULL);
@@ -127,10 +120,6 @@ public:
{
ut_a(this != &file);
- ut_ad(m_name == NULL);
- m_name = mem_strdup(file.m_name);
- ut_a(m_name != NULL);
-
m_size = file.m_size;
m_order = file.m_order;
m_type = file.m_type;
@@ -164,10 +153,8 @@ public:
return(*this);
}
- /** Initialize the name and flags of this datafile.
- @param[in] name tablespace name, will be copied
- @param[in] flags tablespace flags */
- void init(const char* name, ulint flags);
+ /** Initialize the tablespace flags */
+ void init(ulint flags) { m_flags= flags; }
/** Release the resources. */
virtual void shutdown();
@@ -176,14 +163,12 @@ public:
so that it can be validated.
@param[in] strict whether to issue error messages
@return DB_SUCCESS or error code */
- virtual dberr_t open_read_only(bool strict);
+ dberr_t open_read_only(bool strict);
/** Open a data file in read-write mode during start-up so that
doublewrite pages can be restored and then it can be validated.
- @param[in] read_only_mode if true, then readonly mode checks
- are enforced.
@return DB_SUCCESS or error code */
- virtual dberr_t open_read_write(bool read_only_mode)
+ inline dberr_t open_read_write()
MY_ATTRIBUTE((warn_unused_result));
/** Initialize OS specific file info. */
@@ -197,24 +182,15 @@ public:
Prepend the dirpath to filename using the extension given.
If dirpath is NULL, prepend the default datadir to filepath.
Store the result in m_filepath.
- @param[in] dirpath directory path
- @param[in] filename filename or filepath
- @param[in] ext filename extension */
- void make_filepath(
- const char* dirpath,
- const char* filename,
- ib_extention ext);
+ @param dirpath directory path
+ @param name tablespace (table) name
+ @param ext filename extension */
+ void make_filepath(const char* dirpath, fil_space_t::name_type name,
+ ib_extention ext);
/** Set the filepath by duplicating the filepath sent in */
void set_filepath(const char* filepath);
- /** Allocate and set the datafile or tablespace name in m_name.
- If a name is provided, use it; else extract a file-per-table
- tablespace name from m_filepath. The value of m_name
- will be freed in the destructor.
- @param[in] name Tablespace Name if known, NULL if not */
- void set_name(const char* name);
-
/** Validates the datafile and checks that it conforms with
the expected space ID and flags. The file should exist and be
successfully opened in order for this function to validate it.
@@ -247,13 +223,6 @@ public:
dberr_t validate_first_page(lsn_t* flush_lsn)
MY_ATTRIBUTE((warn_unused_result));
- /** Get Datafile::m_name.
- @return m_name */
- const char* name() const
- {
- return(m_name);
- }
-
/** Get Datafile::m_filepath.
@return m_filepath */
const char* filepath() const
@@ -355,6 +324,9 @@ public:
@return the first data page */
const byte* get_first_page() const { return(m_first_page); }
+ void set_space_id(ulint space_id) { m_space_id= space_id; }
+
+ void set_flags(ulint flags) { m_flags = flags; }
private:
/** Free the filepath buffer. */
void free_filepath();
@@ -363,13 +335,22 @@ private:
in the filepath. */
void set_filename()
{
- if (m_filepath == NULL) {
+ if (!m_filepath) {
return;
}
- char* last_slash = strrchr(m_filepath, OS_PATH_SEPARATOR);
-
- m_filename = last_slash ? last_slash + 1 : m_filepath;
+ if (char *last_slash = strrchr(m_filepath, '/')) {
+#if _WIN32
+ if (char *last = strrchr(m_filepath, '\\')) {
+ if (last > last_slash) {
+ last_slash = last;
+ }
+ }
+#endif
+ m_filename = last_slash + 1;
+ } else {
+ m_filename = m_filepath;
+ }
}
/** Create/open a data file.
@@ -406,12 +387,6 @@ private:
/* DATA MEMBERS */
- /** Datafile name at the tablespace location.
- This is either the basename of the file if an absolute path
- was entered, or it is the relative path to the datadir or
- Tablespace::m_path. */
- char* m_name;
-
protected:
/** Physical file path with base name and extension */
char* m_filepath;
@@ -471,6 +446,8 @@ protected:
ulint m_last_os_error;
public:
+ /** true if table is deferred during recovery */
+ bool m_defer=false;
/** Use the following to determine the uniqueness of this datafile. */
#ifdef _WIN32
/* Use fields dwVolumeSerialNumber, nFileIndexLow, nFileIndexHigh. */
@@ -520,57 +497,28 @@ public:
return(m_link_filepath);
}
- /** Create a link filename based on the contents of m_name,
- open that file, and read the contents into m_filepath.
- @retval DB_SUCCESS if remote linked tablespace file is opened and read.
- @retval DB_CANNOT_OPEN_FILE if the link file does not exist. */
- dberr_t open_link_file();
+ /** Attempt to read the contents of an .isl file into m_filepath.
+ @param name table name
+ @return filepath()
+ @retval nullptr if the .isl file does not exist or cannot be read */
+ const char* open_link_file(const fil_space_t::name_type name);
/** Delete an InnoDB Symbolic Link (ISL) file. */
void delete_link_file(void);
- /** Open a handle to the file linked to in an InnoDB Symbolic Link file
- in read-only mode so that it can be validated.
- @param[in] strict whether to issue error messages
- @return DB_SUCCESS or error code */
- dberr_t open_read_only(bool strict) override;
-
- /** Opens a handle to the file linked to in an InnoDB Symbolic Link
- file in read-write mode so that it can be restored from doublewrite
- and validated.
- @param[in] read_only_mode If true, then readonly mode checks
- are enforced.
- @return DB_SUCCESS or error code */
- dberr_t open_read_write(bool read_only_mode) override
- MY_ATTRIBUTE((warn_unused_result));
-
/******************************************************************
Global Static Functions; Cannot refer to data members.
******************************************************************/
- /** Creates a new InnoDB Symbolic Link (ISL) file. It is always
- created under the 'datadir' of MySQL. The datadir is the directory
- of a running mysqld program. We can refer to it by simply using
- the path ".".
- @param[in] name tablespace name
- @param[in] filepath remote filepath of tablespace datafile
+ /** Create InnoDB Symbolic Link (ISL) file.
+ @param name tablespace name
+ @param filepath full file name
@return DB_SUCCESS or error code */
- static dberr_t create_link_file(
- const char* name,
- const char* filepath);
+ static dberr_t create_link_file(fil_space_t::name_type name,
+ const char *filepath);
/** Delete an InnoDB Symbolic Link (ISL) file by name.
- @param[in] name tablespace name */
- static void delete_link_file(const char* name);
-
- /** Read an InnoDB Symbolic Link (ISL) file by name.
- It is always created under the datadir of MySQL.
- For file-per-table tablespaces, the isl file is expected to be
- in a 'database' directory and called 'tablename.isl'.
- The caller must free the memory returned if it is not null.
- @param[in] link_filepath filepath of the ISL file
- @return Filepath of the IBD file read from the ISL file */
- static char* read_link_file(
- const char* link_filepath);
+ @param name tablespace name */
+ static void delete_link_file(fil_space_t::name_type name);
};
#endif /* fsp0file_h */
diff --git a/storage/innobase/include/fsp0fsp.h b/storage/innobase/include/fsp0fsp.h
index 1be45915239..b73fc2b54eb 100644
--- a/storage/innobase/include/fsp0fsp.h
+++ b/storage/innobase/include/fsp0fsp.h
@@ -207,16 +207,17 @@ typedef byte fseg_inode_t;
(16 + 3 * FLST_BASE_NODE_SIZE \
+ FSEG_FRAG_ARR_N_SLOTS * FSEG_FRAG_SLOT_SIZE)
-static constexpr uint32_t FSEG_MAGIC_N_VALUE= 97937874;
+static constexpr byte FSEG_MAGIC_N_BYTES[4]={0x05,0xd6,0x69,0xd2};
-#define FSEG_FILLFACTOR 8 /* If this value is x, then if
- the number of unused but reserved
+#define FSEG_FILLFACTOR 8 /* If the number of unused but reserved
pages in a segment is less than
- reserved pages * 1/x, and there are
+ reserved pages / FSEG_FILLFACTOR,
+ and there are
at least FSEG_FRAG_LIMIT used pages,
then we allow a new empty extent to
be added to the segment in
- fseg_alloc_free_page. Otherwise, we
+ fseg_alloc_free_page_general().
+ Otherwise, we
use unused pages of the segment. */
#define FSEG_FRAG_LIMIT FSEG_FRAG_ARR_N_SLOTS
@@ -342,36 +343,28 @@ fsp_header_check_encryption_key(
ulint fsp_flags,
page_t* page);
-/**********************************************************************//**
-Writes the space id and flags to a tablespace header. The flags contain
-row type, physical/compressed page size, and logical/uncompressed page
-size of the tablespace. */
-void
-fsp_header_init_fields(
-/*===================*/
- page_t* page, /*!< in/out: first page in the space */
- ulint space_id, /*!< in: space id */
- ulint flags); /*!< in: tablespace flags (FSP_SPACE_FLAGS):
- 0, or table->flags if newer than COMPACT */
/** Initialize a tablespace header.
@param[in,out] space tablespace
@param[in] size current size in blocks
-@param[in,out] mtr mini-transaction */
-void fsp_header_init(fil_space_t* space, uint32_t size, mtr_t* mtr)
- MY_ATTRIBUTE((nonnull));
+@param[in,out] mtr mini-transaction
+@return error code */
+dberr_t fsp_header_init(fil_space_t *space, uint32_t size, mtr_t *mtr)
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
/** Create a new segment.
@param space tablespace
@param byte_offset byte offset of the created segment header
@param mtr mini-transaction
+@param err error code
@param has_done_reservation whether fsp_reserve_free_extents() was invoked
@param block block where segment header is placed,
or NULL to allocate an additional page for that
@return the block where the segment header is placed, x-latched
-@retval NULL if could not create segment because of lack of space */
+@retval nullptr if could not create segment */
buf_block_t*
-fseg_create(fil_space_t *space, ulint byte_offset, mtr_t *mtr,
- bool has_done_reservation= false, buf_block_t *block= NULL);
+fseg_create(fil_space_t *space, ulint byte_offset, mtr_t *mtr, dberr_t *err,
+ bool has_done_reservation= false, buf_block_t *block= nullptr)
+ MY_ATTRIBUTE((nonnull(1,3,4), warn_unused_result));
/** Calculate the number of pages reserved by a segment,
and how many pages are currently used.
@@ -386,22 +379,6 @@ ulint fseg_n_reserved_pages(const buf_block_t &block,
MY_ATTRIBUTE((nonnull));
/**********************************************************************//**
Allocates a single free page from a segment. This function implements
-the intelligent allocation strategy which tries to minimize
-file space fragmentation.
-@param[in,out] seg_header segment header
-@param[in] hint hint of which page would be desirable
-@param[in] direction if the new page is needed because
- of an index page split, and records are
- inserted there in order, into which
- direction they go alphabetically: FSP_DOWN,
- FSP_UP, FSP_NO_DIR
-@param[in,out] mtr mini-transaction
-@return X-latched block, or NULL if no page could be allocated */
-#define fseg_alloc_free_page(seg_header, hint, direction, mtr) \
- fseg_alloc_free_page_general(seg_header, hint, direction, \
- false, mtr, mtr)
-/**********************************************************************//**
-Allocates a single free page from a segment. This function implements
the intelligent allocation strategy which tries to minimize file space
fragmentation.
@retval NULL if no page could be allocated */
@@ -422,8 +399,9 @@ fseg_alloc_free_page_general(
is no need to do the check for this individual
page */
mtr_t* mtr, /*!< in/out: mini-transaction */
- mtr_t* init_mtr)/*!< in/out: mtr or another mini-transaction
+ mtr_t* init_mtr,/*!< in/out: mtr or another mini-transaction
in which the page should be initialized. */
+ dberr_t* err) /*!< out: error code */
MY_ATTRIBUTE((warn_unused_result, nonnull));
/** Reserves free pages from a tablespace. All mini-transactions which may
@@ -452,19 +430,21 @@ if the table only occupies < FSP_EXTENT_SIZE pages. That is why we apply
different rules in that special case, just ensuring that there are n_pages
free pages available.
-@param[out] n_reserved number of extents actually reserved; if we
- return true and the tablespace size is <
- FSP_EXTENT_SIZE pages, then this can be 0,
- otherwise it is n_ext
-@param[in,out] space tablespace
-@param[in] n_ext number of extents to reserve
-@param[in] alloc_type page reservation type (FSP_BLOB, etc)
-@param[in,out] mtr the mini transaction
-@param[in] n_pages for small tablespaces (tablespace size is
- less than FSP_EXTENT_SIZE), number of free
- pages to reserve.
-@return true if we were able to make the reservation */
-bool
+@param[out] n_reserved number of extents actually reserved; if we
+ return true and the tablespace size is <
+ FSP_EXTENT_SIZE pages, then this can be 0,
+ otherwise it is n_ext
+@param[in,out] space tablespace
+@param[in] n_ext number of extents to reserve
+@param[in] alloc_type page reservation type (FSP_BLOB, etc)
+@param[in,out] mtr the mini transaction
+@param[out] err error code
+@param[in] n_pages for small tablespaces (tablespace size is
+ less than FSP_EXTENT_SIZE), number of free
+ pages to reserve.
+@return error code
+@retval DB_SUCCESS if we were able to make the reservation */
+dberr_t
fsp_reserve_free_extents(
uint32_t* n_reserved,
fil_space_t* space,
@@ -477,43 +457,62 @@ fsp_reserve_free_extents(
@param[in,out] seg_header file segment header
@param[in,out] space tablespace
@param[in] offset page number
-@param[in,out] mtr mini-transaction */
-void
+@param[in,out] mtr mini-transaction
+@param[in] have_latch whether space->x_lock() was already called
+@return error code */
+dberr_t
fseg_free_page(
fseg_header_t* seg_header,
fil_space_t* space,
uint32_t offset,
- mtr_t* mtr);
-/** Determine whether a page is free.
-@param[in,out] space tablespace
-@param[in] page page number
-@return whether the page is marked as free */
-bool
-fseg_page_is_free(fil_space_t* space, unsigned page)
+ mtr_t* mtr,
+ bool have_latch = false)
MY_ATTRIBUTE((nonnull, warn_unused_result));
-/**********************************************************************//**
-Frees part of a segment. This function can be used to free a segment
-by repeatedly calling this function in different mini-transactions.
-Doing the freeing in a single mini-transaction might result in
-too big a mini-transaction.
+
+/** Determine whether a page is allocated.
+@param space tablespace
+@param page page number
+@return error code
+@retval DB_SUCCESS if the page is marked as free
+@retval DB_SUCCESS_LOCKED_REC if the page is marked as allocated */
+dberr_t fseg_page_is_allocated(fil_space_t *space, unsigned page)
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/** Frees part of a segment. This function can be used to free
+a segment by repeatedly calling this function in different
+mini-transactions. Doing the freeing in a single mini-transaction
+might result in too big a mini-transaction.
+@param header segment header; NOTE: if the header resides on first
+ page of the frag list of the segment, this pointer
+ becomes obsolete after the last freeing step
+@param mtr mini-transaction
+@param ahi Drop the adaptive hash index
@return whether the freeing was completed */
bool
fseg_free_step(
- fseg_header_t* header, /*!< in, own: segment header; NOTE: if the header
- resides on the first page of the frag list
- of the segment, this pointer becomes obsolete
- after the last freeing step */
- mtr_t* mtr) /*!< in/out: mini-transaction */
+ fseg_header_t* header,
+ mtr_t* mtr
+#ifdef BTR_CUR_HASH_ADAPT
+ ,bool ahi=false
+#endif /* BTR_CUR_HASH_ADAPT */
+ )
MY_ATTRIBUTE((warn_unused_result));
-/**********************************************************************//**
-Frees part of a segment. Differs from fseg_free_step because this function
-leaves the header page unfreed.
+
+/** Frees part of a segment. Differs from fseg_free_step because
+this function leaves the header page unfreed.
+@param header segment header which must reside on the first
+ fragment page of the segment
+@param mtr mini-transaction
+@param ahi drop the adaptive hash index
@return whether the freeing was completed, except for the header page */
bool
fseg_free_step_not_header(
- fseg_header_t* header, /*!< in: segment header which must reside on
- the first fragment page of the segment */
- mtr_t* mtr) /*!< in/out: mini-transaction */
+ fseg_header_t* header,
+ mtr_t* mtr
+#ifdef BTR_CUR_HASH_ADAPT
+ ,bool ahi=false
+#endif /* BTR_CUR_HASH_ADAPT */
+ )
MY_ATTRIBUTE((warn_unused_result));
/** Reset the page type.
@@ -541,9 +540,8 @@ fil_block_check_type(
ulint type,
mtr_t* mtr)
{
- if (UNIV_UNLIKELY(type != fil_page_get_type(block.frame))) {
- fil_block_reset_type(block, type, mtr);
- }
+ if (UNIV_UNLIKELY(type != fil_page_get_type(block.page.frame)))
+ fil_block_reset_type(block, type, mtr);
}
/** Checks if a page address is an extent descriptor page address.
diff --git a/storage/innobase/include/fsp0space.h b/storage/innobase/include/fsp0space.h
index c00c8d689bf..ed65af52bc8 100644
--- a/storage/innobase/include/fsp0space.h
+++ b/storage/innobase/include/fsp0space.h
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 2013, 2016, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2017, 2020, MariaDB Corporation.
+Copyright (c) 2017, 2021, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -50,7 +50,6 @@ public:
Tablespace()
:
m_files(),
- m_name(),
m_space_id(ULINT_UNDEFINED),
m_path(),
m_flags(),
@@ -79,9 +78,6 @@ public:
/** Data file iterator */
iterator end() { return m_files.end(); }
- void set_name(const char* name) { m_name = name; }
- const char* name() const { return m_name; }
-
/** Set tablespace path and filename members.
@param[in] path where tablespace file(s) resides
@param[in] len length of the file path */
@@ -90,8 +86,6 @@ public:
ut_ad(m_path == NULL);
m_path = mem_strdupl(path, len);
ut_ad(m_path != NULL);
-
- os_normalize_path(m_path);
}
/** Set tablespace path and filename members.
@@ -218,9 +212,6 @@ private:
/* DATA MEMBERS */
- /** Name of the tablespace. */
- const char* m_name;
-
/** Tablespace ID */
ulint m_space_id;
diff --git a/storage/innobase/include/fsp0types.h b/storage/innobase/include/fsp0types.h
index f8e4c06baae..1912c31b744 100644
--- a/storage/innobase/include/fsp0types.h
+++ b/storage/innobase/include/fsp0types.h
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2014, 2020, MariaDB Corporation.
+Copyright (c) 2014, 2022, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -49,7 +49,7 @@ static constexpr size_t SRV_SPACE_ID_UPPER_BOUND= 0xFFFFFFF0;
If records are inserted in order, there are the following
flags to tell this (their type is made byte for the compiler
to warn if direction and hint parameters are switched in
-fseg_alloc_free_page) */
+fseg_alloc_free_page_general) */
/* @{ */
#define FSP_UP ((byte)111) /*!< alphabetically upwards */
#define FSP_DOWN ((byte)112) /*!< alphabetically downwards */
diff --git a/storage/innobase/include/fts0fts.h b/storage/innobase/include/fts0fts.h
index 9c2153b7ca3..720fe7f25b9 100644
--- a/storage/innobase/include/fts0fts.h
+++ b/storage/innobase/include/fts0fts.h
@@ -314,7 +314,7 @@ public:
/** Whether the ADDED table record sync-ed after crash recovery */
unsigned added_synced:1;
- /** Whether the table holds dict_sys.mutex */
+ /** Whether the table holds dict_sys.latch */
unsigned dict_locked:1;
/** Work queue for scheduling jobs for the FTS 'Add' thread, or NULL
@@ -373,13 +373,6 @@ extern ulong fts_min_token_size;
need a sync to free some memory */
extern bool fts_need_sync;
-#define fts_que_graph_free(graph) \
-do { \
- mutex_enter(&dict_sys.mutex); \
- que_graph_free(graph); \
- mutex_exit(&dict_sys.mutex); \
-} while (0)
-
/******************************************************************//**
Create a FTS cache. */
fts_cache_t*
@@ -439,8 +432,7 @@ fts_trx_free(
fts_trx_t* fts_trx); /*!< in, own: FTS trx */
/** Creates the common auxiliary tables needed for supporting an FTS index
-on the given table. row_mysql_lock_data_dictionary must have been called
-before this.
+on the given table.
The following tables are created.
CREATE TABLE $FTS_PREFIX_DELETED
(doc_id BIGINT UNSIGNED, UNIQUE CLUSTERED INDEX on doc_id)
@@ -463,8 +455,7 @@ fts_create_common_tables(
bool skip_doc_id_index)
MY_ATTRIBUTE((nonnull, warn_unused_result));
/** Creates the column specific ancillary tables needed for supporting an
-FTS index on the given table. row_mysql_lock_data_dictionary must have
-been called before this.
+FTS index on the given table.
All FTS AUX Index tables have the following schema.
CREAT TABLE $FTS_PREFIX_INDEX_[1-6](
@@ -489,17 +480,29 @@ fts_add_doc_id_column(
dict_table_t* table, /*!< in/out: Table with FTS index */
mem_heap_t* heap); /*!< in: temporary memory heap, or NULL */
-/*********************************************************************//**
-Drops the ancillary tables needed for supporting an FTS index on the
-given table. row_mysql_lock_data_dictionary must have been called before
-this.
+/** Lock the internal FTS_ tables for an index, before fts_drop_index_tables().
+@param trx transaction
+@param index fulltext index */
+dberr_t fts_lock_index_tables(trx_t *trx, const dict_index_t &index);
+
+/** Lock the internal common FTS_ tables, before fts_drop_common_tables().
+@param trx transaction
+@param table table containing FULLTEXT INDEX
@return DB_SUCCESS or error code */
-dberr_t
-fts_drop_tables(
-/*============*/
- trx_t* trx, /*!< in: transaction */
- dict_table_t* table); /*!< in: table has the FTS
- index */
+dberr_t fts_lock_common_tables(trx_t *trx, const dict_table_t &table);
+
+/** Lock the internal FTS_ tables for table, before fts_drop_tables().
+@param trx transaction
+@param table table containing FULLTEXT INDEX
+@return DB_SUCCESS or error code */
+dberr_t fts_lock_tables(trx_t *trx, const dict_table_t &table);
+
+/** Drop the internal FTS_ tables for table.
+@param trx transaction
+@param table table containing FULLTEXT INDEX
+@return DB_SUCCESS or error code */
+dberr_t fts_drop_tables(trx_t *trx, const dict_table_t &table);
+
/******************************************************************//**
The given transaction is about to be committed; do whatever is necessary
from the FTS system's POV.
@@ -624,11 +627,7 @@ fts_optimize_init(void);
/****************************************************************//**
Drops index ancillary tables for a FTS index
@return DB_SUCCESS or error code */
-dberr_t
-fts_drop_index_tables(
-/*==================*/
- trx_t* trx, /*!< in: transaction */
- dict_index_t* index) /*!< in: Index to drop */
+dberr_t fts_drop_index_tables(trx_t *trx, const dict_index_t &index)
MY_ATTRIBUTE((warn_unused_result));
/** Add the table to add to the OPTIMIZER's list.
@@ -649,12 +648,6 @@ fts_optimize_remove_table(
void
fts_optimize_shutdown();
-/** Send sync fts cache for the table.
-@param[in] table table to sync */
-void
-fts_optimize_request_sync_table(
- dict_table_t* table);
-
/**********************************************************************//**
Take a FTS savepoint. */
void
@@ -706,26 +699,11 @@ fts_savepoint_rollback_last_stmt(
/*=============================*/
trx_t* trx); /*!< in: transaction */
-/** Drop all orphaned FTS auxiliary tables, those that don't have a parent
-table or FTS index defined on them. */
-void fts_drop_orphaned_tables();
-
/** Run SYNC on the table, i.e., write out data from the cache to the
FTS auxiliary INDEX table and clear the cache at the end.
@param[in,out] table fts table
-@param[in] wait whether to wait for existing sync to finish
@return DB_SUCCESS on success, error code on failure. */
-dberr_t fts_sync_table(dict_table_t* table, bool wait = true);
-
-/****************************************************************//**
-Free the query graph but check whether dict_sys.mutex is already
-held */
-void
-fts_que_graph_free_check_lock(
-/*==========================*/
- fts_table_t* fts_table, /*!< in: FTS table */
- const fts_index_cache_t*index_cache, /*!< in: FTS index cache */
- que_t* graph); /*!< in: query graph */
+dberr_t fts_sync_table(dict_table_t* table);
/****************************************************************//**
Create an FTS index cache. */
@@ -863,13 +841,12 @@ fts_table_fetch_doc_ids(
This function brings FTS index in sync when FTS index is first
used. There are documents that have not yet sync-ed to auxiliary
tables from last server abnormally shutdown, we will need to bring
-such document into FTS cache before any further operations
-@return TRUE if all OK */
-ibool
+such document into FTS cache before any further operations */
+void
fts_init_index(
/*===========*/
dict_table_t* table, /*!< in: Table with FTS */
- ibool has_cache_lock); /*!< in: Whether we already
+ bool has_cache_lock); /*!< in: Whether we already
have cache lock */
/*******************************************************************//**
Add a newly create index in FTS cache */
@@ -933,9 +910,8 @@ fts_trx_create(
/** Clear all fts resources when there is no internal DOC_ID
and there are no new fts index to add.
-@param[in,out] table table where fts is to be freed
-@param[in] trx transaction to drop all fts tables */
-void fts_clear_all(dict_table_t *table, trx_t *trx);
+@param[in,out] table table where fts is to be freed */
+void fts_clear_all(dict_table_t *table);
/** Check whether the given name is fts auxiliary table
and fetch the parent table id and index id
diff --git a/storage/innobase/include/fts0priv.h b/storage/innobase/include/fts0priv.h
index 660f7459249..ae0bb036e37 100644
--- a/storage/innobase/include/fts0priv.h
+++ b/storage/innobase/include/fts0priv.h
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 2011, 2018, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2017, 2020, MariaDB Corporation.
+Copyright (c) 2017, 2021, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -135,7 +135,7 @@ fts_eval_sql(
/** Construct the name of an internal FTS table for the given table.
@param[in] fts_table metadata on fulltext-indexed table
@param[out] table_name a name up to MAX_FULL_NAME_LEN
-@param[in] dict_locked whether dict_sys.mutex is being held */
+@param[in] dict_locked whether dict_sys.latch is being held */
void fts_get_table_name(const fts_table_t* fts_table, char* table_name,
bool dict_locked = false)
MY_ATTRIBUTE((nonnull));
@@ -295,16 +295,6 @@ fts_trx_table_id_cmp(
#define fts_sql_commit(trx) trx_commit_for_mysql(trx)
#define fts_sql_rollback(trx) (trx)->rollback()
/******************************************************************//**
-Parse an SQL string. %s is replaced with the table's id. Don't acquire
-the dict mutex
-@return query graph */
-que_t*
-fts_parse_sql_no_dict_lock(
-/*=======================*/
- pars_info_t* info, /*!< in: parser info */
- const char* sql) /*!< in: SQL string to evaluate */
- MY_ATTRIBUTE((nonnull(2), malloc, warn_unused_result));
-/******************************************************************//**
Get value from config table. The caller must ensure that enough
space is allocated for value to hold the column contents
@return DB_SUCCESS or error code */
@@ -424,8 +414,7 @@ Append deleted doc ids to vector and sort the vector. */
void
fts_cache_append_deleted_doc_ids(
/*=============================*/
- const fts_cache_t*
- cache, /*!< in: cache to use */
+ fts_cache_t* cache, /*!< in: cache to use */
ib_vector_t* vector); /*!< in: append to this vector */
/******************************************************************//**
Search the index specific cache for a particular FTS index.
@@ -470,12 +459,6 @@ fts_get_table_id(
FTS_AUX_MIN_TABLE_ID_LENGTH bytes
long */
MY_ATTRIBUTE((nonnull, warn_unused_result));
-/** Construct the name of an internal FTS table for the given table.
-@param[in] fts_table metadata on fulltext-indexed table
-@param[in] dict_locked whether dict_sys.mutex is being held
-@return the prefix, must be freed with ut_free() */
-char* fts_get_table_name_prefix(const fts_table_t* fts_table)
- MY_ATTRIBUTE((nonnull, malloc, warn_unused_result));
/******************************************************************//**
Add node positions. */
void
diff --git a/storage/innobase/include/fts0types.h b/storage/innobase/include/fts0types.h
index 2cddf152d04..04e99d595c5 100644
--- a/storage/innobase/include/fts0types.h
+++ b/storage/innobase/include/fts0types.h
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 2007, 2016, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2017, 2021, MariaDB Corporation.
+Copyright (c) 2017, 2022, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -28,7 +28,6 @@ Created 2007-03-27 Sunny Bains
#define INNOBASE_FTS0TYPES_H
#include "fts0fts.h"
-#include "fut0fut.h"
#include "pars0pars.h"
#include "que0types.h"
#include "ut0byte.h"
@@ -76,7 +75,6 @@ struct fts_index_cache_t {
que_t** ins_graph; /*!< Insert query graphs */
- que_t** sel_graph; /*!< Select query graphs */
CHARSET_INFO* charset; /*!< charset */
};
@@ -88,52 +86,23 @@ struct fts_stopword_t {
CHARSET_INFO* charset; /*!< charset for stopword */
};
-/** The SYNC state of the cache. There is one instance of this struct
-associated with each ADD thread. */
-struct fts_sync_t {
- trx_t* trx; /*!< The transaction used for SYNCing
- the cache to disk */
- dict_table_t* table; /*!< Table with FTS index(es) */
- ulint max_cache_size; /*!< Max size in bytes of the cache */
- ibool cache_full; /*!< flag, when true it indicates that
- we need to sync the cache to disk */
- ulint lower_index; /*!< the start index of the doc id
- vector from where to start adding
- documents to the FTS cache */
- ulint upper_index; /*!< max index of the doc id vector to
- add to the FTS cache */
- ibool interrupted; /*!< TRUE if SYNC was interrupted */
- doc_id_t min_doc_id; /*!< The smallest doc id added to the
- cache. It should equal to
- doc_ids[lower_index] */
- doc_id_t max_doc_id; /*!< The doc id at which the cache was
- noted as being full, we use this to
- set the upper_limit field */
- time_t start_time; /*!< SYNC start time; only used if
- fts_enable_diag_print */
- bool in_progress; /*!< flag whether sync is in progress.*/
- bool unlock_cache; /*!< flag whether unlock cache when
- write fts node */
- os_event_t event; /*!< sync finish event;
- only os_event_set() and os_event_wait()
- are used */
-};
+struct fts_sync_t;
/** The cache for the FTS system. It is a memory-based inverted index
that new entries are added to, until it grows over the configured maximum
size, at which time its contents are written to the INDEX table. */
-struct fts_cache_t {
- rw_lock_t lock; /*!< lock protecting all access to the
- memory buffer. FIXME: this needs to
- be our new upgrade-capable rw-lock */
-
- rw_lock_t init_lock; /*!< lock used for the cache
- intialization, it has different
- SYNC level as above cache lock */
+struct fts_cache_t
+{
+ /** lock protecting all access to the memory buffer */
+ mysql_mutex_t lock;
+ /** cache initialization */
+ mysql_mutex_t init_lock;
- ib_mutex_t deleted_lock; /*!< Lock covering deleted_doc_ids */
+ /** protection for deleted_doc_ids */
+ mysql_mutex_t deleted_lock;
- ib_mutex_t doc_id_lock; /*!< Lock covering Doc ID */
+ /** protection for DOC_ID */
+ mysql_mutex_t doc_id_lock;
ib_vector_t* deleted_doc_ids;/*!< Array of deleted doc ids, each
element is of type fts_update_t */
@@ -206,7 +175,6 @@ struct fts_node_t {
ulint ilist_size_alloc;
/*!< Allocated size of ilist in
bytes */
- bool synced; /*!< flag whether the node is synced */
};
/** A tokenizer word. Contains information about one word. */
diff --git a/storage/innobase/include/fut0lst.h b/storage/innobase/include/fut0lst.h
index 1ade24cd069..746dab80400 100644
--- a/storage/innobase/include/fut0lst.h
+++ b/storage/innobase/include/fut0lst.h
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 1995, 2014, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2018, 2020, MariaDB Corporation.
+Copyright (c) 2018, 2022, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -24,30 +24,21 @@ File-based list utilities
Created 11/28/1995 Heikki Tuuri
***********************************************************************/
-#ifndef fut0lst_h
-#define fut0lst_h
+#pragma once
+
+/* The physical size of a list base node in bytes */
+#define FLST_BASE_NODE_SIZE (4 + 2 * FIL_ADDR_SIZE)
+/* The physical size of a list node in bytes */
+#define FLST_NODE_SIZE (2 * FIL_ADDR_SIZE)
#ifdef UNIV_INNOCHECKSUM
# include "fil0fil.h"
#else
-#include "fut0fut.h"
-#include "mtr0log.h"
-
-/* The C 'types' of base node and list node: these should be used to
-write self-documenting code. Of course, the sizeof macro cannot be
-applied to these types! */
+# include "mtr0log.h"
typedef byte flst_base_node_t;
typedef byte flst_node_t;
-#endif /* !UNIV_INNOCHECKSUM */
-
-/* The physical size of a list base node in bytes */
-#define FLST_BASE_NODE_SIZE (4 + 2 * FIL_ADDR_SIZE)
-/* The physical size of a list node in bytes */
-#define FLST_NODE_SIZE (2 * FIL_ADDR_SIZE)
-
-#ifndef UNIV_INNOCHECKSUM
/* We define the field offsets of a node for the list */
#define FLST_PREV 0 /* 6-byte address of the previous list element;
the page part of address is FIL_NULL, if no
@@ -70,9 +61,10 @@ typedef byte flst_node_t;
@param[in,out] mtr mini-transaction */
inline void flst_init(const buf_block_t* block, uint16_t ofs, mtr_t* mtr)
{
- ut_ad(!mach_read_from_2(FLST_LEN + ofs + block->frame));
- ut_ad(!mach_read_from_2(FLST_FIRST + FIL_ADDR_BYTE + ofs + block->frame));
- ut_ad(!mach_read_from_2(FLST_LAST + FIL_ADDR_BYTE + ofs + block->frame));
+ ut_d(const page_t *page= block->page.frame);
+ ut_ad(!mach_read_from_2(FLST_LEN + ofs + page));
+ ut_ad(!mach_read_from_2(FLST_FIRST + FIL_ADDR_BYTE + ofs + page));
+ ut_ad(!mach_read_from_2(FLST_LAST + FIL_ADDR_BYTE + ofs + page));
compile_time_assert(FIL_NULL == 0xffU * 0x1010101U);
mtr->memset(block, FLST_FIRST + FIL_ADDR_PAGE + ofs, 4, 0xff);
mtr->memset(block, FLST_LAST + FIL_ADDR_PAGE + ofs, 4, 0xff);
@@ -82,7 +74,7 @@ inline void flst_init(const buf_block_t* block, uint16_t ofs, mtr_t* mtr)
@param[in] block file page
@param[in,out] base base node
@param[in,out] mtr mini-transaction */
-void flst_init(const buf_block_t& block, byte *base, mtr_t *mtr)
+void flst_init(const buf_block_t &block, byte *base, mtr_t *mtr)
MY_ATTRIBUTE((nonnull));
/** Append a file list node to a list.
@@ -90,28 +82,31 @@ void flst_init(const buf_block_t& block, byte *base, mtr_t *mtr)
@param[in] boffset byte offset of the base node
@param[in,out] add block to be added
@param[in] aoffset byte offset of the node to be added
-@param[in,outr] mtr mini-transaction */
-void flst_add_last(buf_block_t *base, uint16_t boffset,
- buf_block_t *add, uint16_t aoffset, mtr_t *mtr)
- MY_ATTRIBUTE((nonnull));
+@param[in,out] mtr mini-transaction
+@return error code */
+dberr_t flst_add_last(buf_block_t *base, uint16_t boffset,
+ buf_block_t *add, uint16_t aoffset, mtr_t *mtr)
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
/** Prepend a file list node to a list.
@param[in,out] base base node block
@param[in] boffset byte offset of the base node
@param[in,out] add block to be added
@param[in] aoffset byte offset of the node to be added
-@param[in,outr] mtr mini-transaction */
-void flst_add_first(buf_block_t *base, uint16_t boffset,
+@param[in,out] mtr mini-transaction
+@return error code */
+dberr_t flst_add_first(buf_block_t *base, uint16_t boffset,
buf_block_t *add, uint16_t aoffset, mtr_t *mtr)
- MY_ATTRIBUTE((nonnull));
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
/** Remove a file list node.
@param[in,out] base base node block
@param[in] boffset byte offset of the base node
@param[in,out] cur block to be removed
@param[in] coffset byte offset of the current record to be removed
-@param[in,outr] mtr mini-transaction */
-void flst_remove(buf_block_t *base, uint16_t boffset,
- buf_block_t *cur, uint16_t coffset, mtr_t *mtr)
- MY_ATTRIBUTE((nonnull));
+@param[in,out] mtr mini-transaction
+@return error code */
+dberr_t flst_remove(buf_block_t *base, uint16_t boffset,
+ buf_block_t *cur, uint16_t coffset, mtr_t *mtr)
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
/** @return the length of a list */
inline uint32_t flst_get_len(const flst_base_node_t *base)
@@ -153,11 +148,9 @@ inline fil_addr_t flst_get_prev_addr(const flst_node_t *node)
return flst_read_addr(node + FLST_PREV);
}
-#ifdef UNIV_DEBUG
+# ifdef UNIV_DEBUG
/** Validate a file-based list. */
void flst_validate(const buf_block_t *base, uint16_t boffset, mtr_t *mtr);
-#endif
+# endif
#endif /* !UNIV_INNOCHECKSUM */
-
-#endif
diff --git a/storage/innobase/include/gis0rtree.h b/storage/innobase/include/gis0rtree.h
index 9a350325bca..b07261ce042 100644
--- a/storage/innobase/include/gis0rtree.h
+++ b/storage/innobase/include/gis0rtree.h
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 2014, 2016, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2017, 2020, MariaDB Corporation.
+Copyright (c) 2017, 2023, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -59,6 +59,44 @@ Created 2013/03/27 Jimmy Yang and Allen Lai
/* Geometry data header */
#define GEO_DATA_HEADER_SIZE 4
+
+/** Search for a spatial index leaf page record.
+@param cur cursor
+@param tuple search tuple
+@param latch_mode latching mode
+@param mtr mini-transaction
+@param mode search mode */
+dberr_t rtr_search_leaf(btr_cur_t *cur, const dtuple_t *tuple,
+ btr_latch_mode latch_mode, mtr_t *mtr,
+ page_cur_mode_t mode= PAGE_CUR_RTREE_LOCATE)
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/** Search for inserting a spatial index leaf page record.
+@param cur cursor
+@param tuple search tuple
+@param latch_mode latching mode
+@param mtr mini-transaction */
+inline dberr_t rtr_insert_leaf(btr_cur_t *cur, const dtuple_t *tuple,
+ btr_latch_mode latch_mode, mtr_t *mtr)
+{
+ return rtr_search_leaf(cur, tuple, latch_mode, mtr, PAGE_CUR_RTREE_INSERT);
+}
+
+/** Search for a spatial index leaf page record.
+@param pcur cursor
+@param tuple search tuple
+@param mode search mode
+@param mtr mini-transaction */
+dberr_t rtr_search_leaf(btr_pcur_t *pcur, const dtuple_t *tuple,
+ page_cur_mode_t mode, mtr_t *mtr)
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+dberr_t rtr_search_to_nth_level(ulint level, const dtuple_t *tuple,
+ page_cur_mode_t mode,
+ btr_latch_mode latch_mode,
+ btr_cur_t *cur, mtr_t *mtr)
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+
/**********************************************************************//**
Builds a Rtree node pointer out of a physical record and a page number.
@return own: node pointer */
@@ -93,7 +131,8 @@ rtr_page_split_and_insert(
mem_heap_t** heap, /*!< in/out: pointer to memory heap, or NULL */
const dtuple_t* tuple, /*!< in: tuple to insert */
ulint n_ext, /*!< in: number of externally stored columns */
- mtr_t* mtr); /*!< in: mtr */
+ mtr_t* mtr, /*!< in: mtr */
+ dberr_t* err); /*!< out: error code */
/**************************************************************//**
Sets the child node mbr in a node pointer. */
@@ -123,7 +162,8 @@ rtr_pcur_move_to_next(
function may release the page latch */
ulint cur_level,
/*!< in: current level */
- mtr_t* mtr); /*!< in: mtr */
+ mtr_t* mtr) /*!< in: mtr */
+ MY_ATTRIBUTE((warn_unused_result));
/****************************************************************//**
Searches the right position in rtree for a page cursor. */
@@ -254,21 +294,14 @@ rtr_get_mbr_from_tuple(
rtr_mbr* mbr); /*!< out: mbr to fill */
/* Get the rtree page father.
-@param[in] offsets work area for the return value
-@param[in] index rtree index
-@param[in] block child page in the index
-@param[in] mtr mtr
+@param[in,out] mtr mtr
@param[in] sea_cur search cursor, contains information
about parent nodes in search
-@param[in] cursor cursor on node pointer record,
- its page x-latched */
-void
-rtr_page_get_father(
- dict_index_t* index,
- buf_block_t* block,
- mtr_t* mtr,
- btr_cur_t* sea_cur,
- btr_cur_t* cursor);
+@param[in,out] cursor cursor on node pointer record,
+ its page x-latched
+@return whether the cursor was successfully positioned */
+bool rtr_page_get_father(mtr_t *mtr, btr_cur_t *sea_cur, btr_cur_t *cursor)
+ MY_ATTRIBUTE((nonnull(1,3), warn_unused_result));
/************************************************************//**
Returns the father block to a page. It is assumed that mtr holds
@@ -279,8 +312,6 @@ rtr_page_get_father_block(
/*======================*/
rec_offs* offsets,/*!< in: work area for the return value */
mem_heap_t* heap, /*!< in: memory heap to use */
- dict_index_t* index, /*!< in: b-tree index */
- buf_block_t* block, /*!< in: child page in the index */
mtr_t* mtr, /*!< in: mtr */
btr_cur_t* sea_cur,/*!< in: search cursor, contains information
about parent nodes in search */
@@ -294,7 +325,7 @@ rtr_store_parent_path(
/*==================*/
const buf_block_t* block, /*!< in: block of the page */
btr_cur_t* btr_cur,/*!< in/out: persistent cursor */
- ulint latch_mode,
+ btr_latch_mode latch_mode,
/*!< in: latch_mode */
ulint level, /*!< in: index level */
mtr_t* mtr); /*!< in: mtr */
@@ -302,28 +333,12 @@ rtr_store_parent_path(
/**************************************************************//**
Initializes and opens a persistent cursor to an index tree. It should be
closed with btr_pcur_close. */
-void
-rtr_pcur_open_low(
-/*==============*/
- dict_index_t* index, /*!< in: index */
- ulint level, /*!< in: level in the btree */
+bool rtr_search(
const dtuple_t* tuple, /*!< in: tuple on which search done */
- page_cur_mode_t mode, /*!< in: PAGE_CUR_L, ...;
- NOTE that if the search is made using a unique
- prefix of a record, mode should be
- PAGE_CUR_LE, not PAGE_CUR_GE, as the latter
- may end up on the previous page from the
- record! */
- ulint latch_mode,/*!< in: BTR_SEARCH_LEAF, ... */
+ btr_latch_mode latch_mode,/*!< in: BTR_MODIFY_LEAF, ... */
btr_pcur_t* cursor, /*!< in: memory buffer for persistent cursor */
- const char* file, /*!< in: file name */
- unsigned line, /*!< in: line where called */
- mtr_t* mtr); /*!< in: mtr */
-
-#define rtr_pcur_open(i,t,md,l,c,m) \
- rtr_pcur_open_low(i,0,t,md,l,c,__FILE__,__LINE__,m)
-
-struct btr_cur_t;
+ mtr_t* mtr) /*!< in: mtr */
+ MY_ATTRIBUTE((warn_unused_result));
/*********************************************************//**
Returns the R-Tree node stored in the parent search path
@@ -347,9 +362,12 @@ rtr_get_parent_cursor(
ulint level, /*!< in: index level of buffer page */
ulint is_insert); /*!< in: whether insert operation */
+MY_ATTRIBUTE((warn_unused_result))
/*************************************************************//**
-Copy recs from a page to new_block of rtree. */
-void
+Copy recs from a page to new_block of rtree.
+
+@return error code */
+dberr_t
rtr_page_copy_rec_list_end_no_locks(
/*================================*/
buf_block_t* new_block, /*!< in: index page to copy to */
@@ -362,9 +380,12 @@ rtr_page_copy_rec_list_end_no_locks(
ulint* num_moved, /*!< out: num of rec to move */
mtr_t* mtr); /*!< in: mtr */
+MY_ATTRIBUTE((warn_unused_result))
/*************************************************************//**
-Copy recs till a specified rec from a page to new_block of rtree. */
-void
+Copy recs till a specified rec from a page to new_block of rtree.
+
+@return error code */
+dberr_t
rtr_page_copy_rec_list_start_no_locks(
/*==================================*/
buf_block_t* new_block, /*!< in: index page to copy to */
@@ -436,7 +457,6 @@ rtr_check_same_block(
btr_cur_t* cur, /*!< in/out: position at the parent entry
pointing to the child if successful */
buf_block_t* parentb,/*!< in: parent page to check */
- buf_block_t* childb, /*!< in: child Page */
mem_heap_t* heap); /*!< in: memory heap */
/*********************************************************************//**
diff --git a/storage/innobase/include/gis0rtree.inl b/storage/innobase/include/gis0rtree.inl
index 1b53caa306b..5101eeb6f7a 100644
--- a/storage/innobase/include/gis0rtree.inl
+++ b/storage/innobase/include/gis0rtree.inl
@@ -57,6 +57,9 @@ rtr_page_cal_mbr(
page = buf_block_get_frame(block);
rec = page_rec_get_next(page_get_infimum_rec(page));
+ if (UNIV_UNLIKELY(!rec)) {
+ return;
+ }
offsets = rec_get_offsets(rec, index, offsets, page_is_leaf(page)
? index->n_fields : 0,
ULINT_UNDEFINED, &heap);
@@ -176,12 +179,12 @@ rtr_get_parent_node(
return(NULL);
}
- mutex_enter(&btr_cur->rtr_info->rtr_path_mutex);
+ mysql_mutex_lock(&btr_cur->rtr_info->rtr_path_mutex);
num = btr_cur->rtr_info->parent_path->size();
if (!num) {
- mutex_exit(&btr_cur->rtr_info->rtr_path_mutex);
+ mysql_mutex_unlock(&btr_cur->rtr_info->rtr_path_mutex);
return(NULL);
}
@@ -204,7 +207,7 @@ rtr_get_parent_node(
}
}
- mutex_exit(&btr_cur->rtr_info->rtr_path_mutex);
+ mysql_mutex_unlock(&btr_cur->rtr_info->rtr_path_mutex);
return(found_node);
}
diff --git a/storage/innobase/include/gis0type.h b/storage/innobase/include/gis0type.h
index 55944bfcce3..d6a4ef67a38 100644
--- a/storage/innobase/include/gis0type.h
+++ b/storage/innobase/include/gis0type.h
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 2014, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2018, 2020, MariaDB Corporation.
+Copyright (c) 2018, 2023, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -72,7 +72,7 @@ typedef struct matched_rec {
buf_block_t block; /*!< the shadow buffer block */
ulint used; /*!< memory used */
rtr_rec_vector* matched_recs; /*!< vector holding the matching rec */
- ib_mutex_t rtr_match_mutex;/*!< mutex protect the match_recs
+ mysql_mutex_t rtr_match_mutex;/*!< mutex protect the match_recs
vector */
bool valid; /*!< whether result in matched_recs
or this search is valid (page not
@@ -103,14 +103,8 @@ typedef struct rtr_info{
/*!< vector holding parent pages during
search */
matched_rec_t* matches;/*!< struct holding matching leaf records */
- ib_mutex_t rtr_path_mutex;
+ mysql_mutex_t rtr_path_mutex;
/*!< mutex protect the "path" vector */
- buf_block_t* tree_blocks[RTR_MAX_LEVELS + RTR_LEAF_LATCH_NUM];
- /*!< tracking pages that would be locked
- at leaf level, for future free */
- ulint tree_savepoints[RTR_MAX_LEVELS + RTR_LEAF_LATCH_NUM];
- /*!< savepoint used to release latches/blocks
- on each level and leaf level */
rtr_mbr_t mbr; /*!< the search MBR */
que_thr_t* thr; /*!< the search thread */
mem_heap_t* heap; /*!< memory heap */
@@ -137,7 +131,7 @@ typedef struct rtr_info{
struct rtr_info_track_t {
/** Active search info */
std::forward_list<rtr_info_t*, ut_allocator<rtr_info_t*> > rtr_active;
- ib_mutex_t rtr_active_mutex;
+ mysql_mutex_t rtr_active_mutex;
/*!< mutex to protect
rtr_active */
};
diff --git a/storage/innobase/include/ha_prototypes.h b/storage/innobase/include/ha_prototypes.h
index 04e1ec96b73..d5239ec3f9a 100644
--- a/storage/innobase/include/ha_prototypes.h
+++ b/storage/innobase/include/ha_prototypes.h
@@ -109,15 +109,6 @@ innobase_convert_name(
THD* thd); /*!< in: MySQL connection thread, or NULL */
/******************************************************************//**
-Returns true if the thread is the replication thread on the slave
-server.
-@return true if thd is the replication thread */
-ibool
-thd_is_replication_slave_thread(
-/*============================*/
- THD* thd); /*!< in: thread handle */
-
-/******************************************************************//**
Returns true if the transaction this thread is processing has edited
non-transactional tables. Used by the deadlock detector when deciding
which transaction to rollback in case of a deadlock - we try to avoid
@@ -128,13 +119,6 @@ thd_has_edited_nontrans_tables(
/*===========================*/
THD* thd); /*!< in: thread handle */
-/**
-Get high resolution timestamp for the current query start time.
-
-@retval timestamp in microseconds precision
-*/
-unsigned long long thd_query_start_micro(const MYSQL_THD thd);
-
/*************************************************************//**
Prints info of a THD object (== user session thread) to the given file. */
void
@@ -156,15 +140,6 @@ uint8_t
get_innobase_type_from_mysql_type(unsigned *unsigned_flag, const Field *field);
/******************************************************************//**
-Get the variable length bounds of the given character set. */
-void
-innobase_get_cset_width(
-/*====================*/
- ulint cset, /*!< in: MySQL charset-collation code */
- unsigned*mbminlen, /*!< out: minimum length of a char (in bytes) */
- unsigned*mbmaxlen); /*!< out: maximum length of a char (in bytes) */
-
-/******************************************************************//**
Compares NUL-terminated UTF-8 strings case insensitively.
@return 0 if a=b, <0 if a<b, >1 if a>b */
int
@@ -208,11 +183,6 @@ innobase_casedn_str(
char* a); /*!< in/out: string to put in lower case */
#ifdef WITH_WSREP
-void
-wsrep_innobase_kill_one_trx(
- THD* bf_thd,
- trx_t *victim_trx,
- my_bool signal);
ulint wsrep_innobase_mysql_sort(int mysql_type, uint charset_number,
unsigned char* str, ulint str_length,
ulint buf_length);
@@ -220,6 +190,15 @@ ulint wsrep_innobase_mysql_sort(int mysql_type, uint charset_number,
extern "C" struct charset_info_st *thd_charset(THD *thd);
+/** Get high resolution timestamp for the current query start time.
+The timestamp is not anchored to any specific point in time,
+but can be used for comparison.
+@param thd user thread
+@retval timestamp in microseconds precision
+*/
+extern "C" unsigned long long thd_start_utime(const MYSQL_THD thd);
+
+
/** Determines the current SQL statement.
Thread unsafe, can only be called from the thread owning the THD.
@param[in] thd MySQL thread handle
@@ -250,45 +229,16 @@ innobase_get_at_most_n_mbchars(
@param[in] thd thread handle, or NULL to query
the global innodb_tmpdir.
@retval NULL if innodb_tmpdir="" */
-UNIV_INTERN
-const char*
-thd_innodb_tmpdir(
- THD* thd);
+const char *thd_innodb_tmpdir(THD *thd);
/******************************************************************//**
Returns the lock wait timeout for the current connection.
@return the lock wait timeout, in seconds */
-ulong
+uint&
thd_lock_wait_timeout(
/*==================*/
THD* thd); /*!< in: thread handle, or NULL to query
the global innodb_lock_wait_timeout */
-/** Get status of innodb_tmpdir.
-@param[in] thd thread handle, or NULL to query
- the global innodb_tmpdir.
-@retval NULL if innodb_tmpdir="" */
-const char*
-thd_innodb_tmpdir(
- THD* thd);
-
-/**********************************************************************//**
-Get the current setting of the table_cache_size global parameter. We do
-a dirty read because for one there is no synchronization object and
-secondly there is little harm in doing so even if we get a torn read.
-@return SQL statement string */
-ulint
-innobase_get_table_cache_size(void);
-/*===============================*/
-
-/**********************************************************************//**
-Get the current setting of the lower_case_table_names global parameter from
-mysqld.cc. We do a dirty read because for one there is no synchronization
-object and secondly there is little harm in doing so even if we get a torn
-read.
-@return value of lower_case_table_names */
-ulint
-innobase_get_lower_case_table_names(void);
-/*=====================================*/
/******************************************************************//**
compare two character string case insensitively according to their charset. */
@@ -458,7 +408,6 @@ innobase_convert_to_filename_charset(
/********************************************************************//**
Helper function to push warnings from InnoDB internals to SQL-layer. */
-UNIV_INTERN
void
ib_push_warning(
trx_t* trx, /*!< in: trx */
@@ -468,7 +417,6 @@ ib_push_warning(
/********************************************************************//**
Helper function to push warnings from InnoDB internals to SQL-layer. */
-UNIV_INTERN
void
ib_push_warning(
void* ithd, /*!< in: thd */
@@ -478,7 +426,6 @@ ib_push_warning(
/********************************************************************//**
Helper function to push warnings from InnoDB internals to SQL-layer. */
-UNIV_INTERN
void
ib_foreign_warn(
trx_t* trx, /*!< in: trx */
@@ -498,19 +445,13 @@ normalize_table_name_c_low(
char* norm_name, /*!< out: normalized name as a
null-terminated string */
const char* name, /*!< in: table name string */
- ibool set_lower_case); /*!< in: TRUE if we want to set
+ bool set_lower_case); /*!< in: true if we want to set
name to lower case */
-/** Update the system variable with the given value of the InnoDB
-buffer pool size.
-@param[in] buf_pool_size given value of buffer pool size.*/
-void
-innodb_set_buf_pool_size(ulonglong buf_pool_size);
/** Create a MYSQL_THD for a background thread and mark it as such.
@param name thread info for SHOW PROCESSLIST
@return new MYSQL_THD */
-MYSQL_THD
-innobase_create_background_thd(const char* name);
+MYSQL_THD innobase_create_background_thd(const char* name);
/** Destroy a THD object associated with a background task.
@param[in] thd MYSQL_THD to destroy */
@@ -521,5 +462,15 @@ void destroy_background_thd(MYSQL_THD thd);
void
innobase_reset_background_thd(MYSQL_THD);
+#ifdef WITH_WSREP
+/** Append table-level exclusive key.
+@param thd MySQL thread handle
+@param table table
+@retval false on success
+@retval true on failure */
+struct dict_table_t;
+bool wsrep_append_table_key(MYSQL_THD thd, const dict_table_t &table);
+#endif /* WITH_WSREP */
+
#endif /* !UNIV_INNOCHECKSUM */
#endif /* HA_INNODB_PROTOTYPES_H */
diff --git a/storage/innobase/include/hash0hash.h b/storage/innobase/include/hash0hash.h
index 981ff5a0814..6eb5bb3f183 100644
--- a/storage/innobase/include/hash0hash.h
+++ b/storage/innobase/include/hash0hash.h
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 1997, 2016, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2018, 2020, MariaDB Corporation.
+Copyright (c) 2018, 2022, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -26,12 +26,47 @@ Created 5/20/1997 Heikki Tuuri
#pragma once
#include "ut0rnd.h"
+#include "ut0new.h"
struct hash_table_t;
-struct hash_cell_t{
- void* node; /*!< hash chain node, NULL if none */
+struct hash_cell_t
+{
+ /** singly-linked, nullptr terminated list of hash buckets */
+ void *node;
+
+ /** Append an element.
+ @tparam T type of the element
+ @param insert the being-inserted element
+ @param next the next-element pointer in T */
+ template<typename T>
+ void append(T &insert, T *T::*next)
+ {
+ void **after;
+ for (after= &node; *after;
+ after= reinterpret_cast<void**>(&(static_cast<T*>(*after)->*next)));
+ insert.*next= nullptr;
+ *after= &insert;
+ }
+
+ /** Insert an element after another.
+ @tparam T type of the element
+ @param after the element after which to insert
+ @param insert the being-inserted element
+ @param next the next-element pointer in T */
+ template<typename T>
+ void insert_after(T &after, T &insert, T *T::*next)
+ {
+#ifdef UNIV_DEBUG
+ for (const T *c= static_cast<const T*>(node); c; c= c->*next)
+ if (c == &after)
+ goto found;
+ ut_error;
+ found:
+#endif
+ insert.*next= after.*next;
+ after.*next= &insert;
+ }
};
-typedef void* hash_node_t;
/*******************************************************************//**
Inserts a struct to a hash table. */
@@ -59,29 +94,6 @@ do {\
}\
} while (0)
-/*******************************************************************//**
-Inserts a struct to the head of hash table. */
-
-#define HASH_PREPEND(TYPE, NAME, TABLE, FOLD, DATA) \
-do { \
- hash_cell_t* cell3333; \
- TYPE* struct3333; \
- \
- (DATA)->NAME = NULL; \
- \
- cell3333 = &(TABLE)->array[(TABLE)->calc_hash(FOLD)]; \
- \
- if (cell3333->node == NULL) { \
- cell3333->node = DATA; \
- DATA->NAME = NULL; \
- } else { \
- struct3333 = (TYPE*) cell3333->node; \
- \
- DATA->NAME = struct3333; \
- \
- cell3333->node = DATA; \
- } \
-} while (0)
#ifdef UNIV_HASH_DEBUG
# define HASH_ASSERT_VALID(DATA) ut_a((void*) (DATA) != (void*) -1)
# define HASH_INVALIDATE(DATA, NAME) *(void**) (&DATA->NAME) = (void*) -1
@@ -117,18 +129,6 @@ do {\
HASH_INVALIDATE(DATA, NAME);\
} while (0)
-#define HASH_REPLACE(TYPE, NAME, TABLE, FOLD, DATA_OLD, DATA_NEW) \
- do { \
- (DATA_NEW)->NAME = (DATA_OLD)->NAME; \
- \
- hash_cell_t& cell3333 \
- = (TABLE)->array[(TABLE)->calc_hash(FOLD)]; \
- TYPE** struct3333 = (TYPE**)&cell3333.node; \
- while (*struct3333 != DATA_OLD) { \
- struct3333 = &((*struct3333)->NAME); \
- } \
- *struct3333 = DATA_NEW; \
- } while (0)
/*******************************************************************//**
Gets the first struct in a hash chain, NULL if none. */
@@ -183,33 +183,6 @@ do { \
} \
} while (0)
-/****************************************************************//**
-Move all hash table entries from OLD_TABLE to NEW_TABLE. */
-
-#define HASH_MIGRATE(OLD_TABLE, NEW_TABLE, NODE_TYPE, PTR_NAME, FOLD_FUNC) \
-do {\
- ulint i2222;\
- ulint cell_count2222;\
-\
- cell_count2222 = (OLD_TABLE)->n_cells; \
-\
- for (i2222 = 0; i2222 < cell_count2222; i2222++) {\
- NODE_TYPE* node2222 = static_cast<NODE_TYPE*>(\
- HASH_GET_FIRST((OLD_TABLE), i2222));\
-\
- while (node2222) {\
- NODE_TYPE* next2222 = static_cast<NODE_TYPE*>(\
- node2222->PTR_NAME);\
- ulint fold2222 = FOLD_FUNC(node2222);\
-\
- HASH_INSERT(NODE_TYPE, PTR_NAME, (NEW_TABLE),\
- fold2222, node2222);\
-\
- node2222 = next2222;\
- }\
- }\
-} while (0)
-
/** Hash table with singly-linked overflow lists */
struct hash_table_t
{
diff --git a/storage/innobase/include/ibuf0ibuf.h b/storage/innobase/include/ibuf0ibuf.h
index 73be4b0a8e8..e38515f0402 100644
--- a/storage/innobase/include/ibuf0ibuf.h
+++ b/storage/innobase/include/ibuf0ibuf.h
@@ -279,8 +279,6 @@ Must not be called when recv_no_ibuf_operations==true.
@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
@param[in] x_latch FALSE if relaxed check (avoid latching the
bitmap page)
-@param[in] file file name
-@param[in] line line where called
@param[in,out] mtr mtr which will contain an x-latch to the
bitmap page if the page is not one of the fixed address ibuf pages, or NULL,
in which case a new transaction is created.
@@ -292,8 +290,6 @@ ibuf_page_low(
#ifdef UNIV_DEBUG
bool x_latch,
#endif /* UNIV_DEBUG */
- const char* file,
- unsigned line,
mtr_t* mtr)
MY_ATTRIBUTE((warn_unused_result));
@@ -305,7 +301,7 @@ Must not be called when recv_no_ibuf_operations==true.
@param[in,out] mtr mini-transaction or NULL
@return TRUE if level 2 or level 3 page */
# define ibuf_page(page_id, zip_size, mtr) \
- ibuf_page_low(page_id, zip_size, true, __FILE__, __LINE__, mtr)
+ ibuf_page_low(page_id, zip_size, true, mtr)
#else /* UNIV_DEBUG */
@@ -316,7 +312,7 @@ Must not be called when recv_no_ibuf_operations==true.
@param[in,out] mtr mini-transaction or NULL
@return TRUE if level 2 or level 3 page */
# define ibuf_page(page_id, zip_size, mtr) \
- ibuf_page_low(page_id, zip_size, __FILE__, __LINE__, mtr)
+ ibuf_page_low(page_id, zip_size, mtr)
#endif /* UNIV_DEBUG */
/***********************************************************************//**
@@ -360,9 +356,11 @@ exist entries for such a page if the page belonged to an index which
subsequently was dropped.
@param block X-latched page to try to apply changes to, or NULL to discard
@param page_id page identifier
-@param zip_size ROW_FORMAT=COMPRESSED page size, or 0 */
-void ibuf_merge_or_delete_for_page(buf_block_t *block, const page_id_t page_id,
- ulint zip_size);
+@param zip_size ROW_FORMAT=COMPRESSED page size, or 0
+@return error code */
+dberr_t ibuf_merge_or_delete_for_page(buf_block_t *block,
+ const page_id_t page_id,
+ ulint zip_size);
/** Delete all change buffer entries for a tablespace,
in DISCARD TABLESPACE, IMPORT TABLESPACE, or read-ahead.
diff --git a/storage/innobase/include/ibuf0ibuf.inl b/storage/innobase/include/ibuf0ibuf.inl
index 9f4e937f31d..1e21f74ff2b 100644
--- a/storage/innobase/include/ibuf0ibuf.inl
+++ b/storage/innobase/include/ibuf0ibuf.inl
@@ -100,9 +100,8 @@ ibuf_should_try(
decide */
{
return(innodb_change_buffering
+ && !(index->type & (DICT_CLUSTERED | DICT_IBUF))
&& ibuf.max_size != 0
- && !dict_index_is_clust(index)
- && !dict_index_is_spatial(index)
&& index->table->quiesce == QUIESCE_NONE
&& (ignore_sec_unique || !dict_index_is_unique(index)));
}
diff --git a/storage/innobase/include/lock0lock.h b/storage/innobase/include/lock0lock.h
index 40bb557a5b2..16acd031177 100644
--- a/storage/innobase/include/lock0lock.h
+++ b/storage/innobase/include/lock0lock.h
@@ -28,40 +28,30 @@ Created 5/7/1996 Heikki Tuuri
#define lock0lock_h
#include "buf0types.h"
-#include "trx0types.h"
+#include "trx0trx.h"
#include "mtr0types.h"
#include "rem0types.h"
-#include "que0types.h"
-#include "lock0types.h"
#include "hash0hash.h"
#include "srv0srv.h"
#include "ut0vec.h"
#include "gis0rtree.h"
#include "lock0prdt.h"
-
-/** Alternatives for innodb_lock_schedule_algorithm, which can be changed by
- setting innodb_lock_schedule_algorithm. */
-enum innodb_lock_schedule_algorithm_t {
- /*!< First Come First Served */
- INNODB_LOCK_SCHEDULE_ALGORITHM_FCFS,
- /*!< Variance-Aware-Transaction-Scheduling */
- INNODB_LOCK_SCHEDULE_ALGORITHM_VATS
-};
-
-extern ulong innodb_lock_schedule_algorithm;
+#include "transactional_lock_guard.h"
// Forward declaration
class ReadView;
/** The value of innodb_deadlock_detect */
-extern my_bool innobase_deadlock_detect;
+extern my_bool innodb_deadlock_detect;
+/** The value of innodb_deadlock_report */
+extern ulong innodb_deadlock_report;
+
+namespace Deadlock
+{
+ /** The allowed values of innodb_deadlock_report */
+ enum report { REPORT_OFF, REPORT_BASIC, REPORT_FULL };
+}
-/*********************************************************************//**
-Gets the size of a lock struct.
-@return size in bytes */
-ulint
-lock_get_size(void);
-/*===============*/
/*********************************************************************//**
Gets the heap_no of the smallest user record on a page.
@return heap_no of smallest user record, or PAGE_HEAP_NO_SUPREMUM */
@@ -70,6 +60,12 @@ ulint
lock_get_min_heap_no(
/*=================*/
const buf_block_t* block); /*!< in: buffer block */
+
+/** Discard locks for an index when purging DELETE FROM SYS_INDEXES
+after an aborted CREATE INDEX operation.
+@param index a stale index on which ADD INDEX operation was aborted */
+ATTRIBUTE_COLD void lock_discard_for_index(const dict_index_t &index);
+
/*************************************************************//**
Updates the lock table when we have reorganized a page. NOTE: we copy
also the locks set on the infimum of the page; the infimum may carry
@@ -129,28 +125,18 @@ lock_update_merge_right(
const buf_block_t* left_block); /*!< in: merged index
page which will be
discarded */
-/*************************************************************//**
-Updates the lock table when the root page is copied to another in
-btr_root_raise_and_insert. Note that we leave lock structs on the
+/** Update locks when the root page is copied to another in
+btr_root_raise_and_insert(). Note that we leave lock structs on the
root page, even though they do not make sense on other than leaf
pages: the reason is that in a pessimistic update the infimum record
of the root page will act as a dummy carrier of the locks of the record
to be updated. */
-void
-lock_update_root_raise(
-/*===================*/
- const buf_block_t* block, /*!< in: index page to which copied */
- const buf_block_t* root); /*!< in: root page */
-/*************************************************************//**
-Updates the lock table when a page is copied to another and the original page
-is removed from the chain of leaf pages, except if page is the root! */
-void
-lock_update_copy_and_discard(
-/*=========================*/
- const buf_block_t* new_block, /*!< in: index page to
- which copied */
- const buf_block_t* block); /*!< in: index page;
- NOT the root! */
+void lock_update_root_raise(const buf_block_t &block, const page_id_t root);
+/** Update the lock table when a page is copied to another.
+@param new_block the target page
+@param old old page (not index root page) */
+void lock_update_copy_and_discard(const buf_block_t &new_block, page_id_t old);
+
/** Update gap locks between the last record of the left_block and the
first record of the right_block when a record is about to be inserted
at the start of the right_block, even though it should "naturally" be
@@ -192,24 +178,16 @@ lock_update_split_left(
/*===================*/
const buf_block_t* right_block, /*!< in: right page */
const buf_block_t* left_block); /*!< in: left page */
-/*************************************************************//**
-Updates the lock table when a page is merged to the left. */
-void
-lock_update_merge_left(
-/*===================*/
- const buf_block_t* left_block, /*!< in: left page to
- which merged */
- const rec_t* orig_pred, /*!< in: original predecessor
- of supremum on the left page
- before merge */
- const buf_block_t* right_block); /*!< in: merged index page
- which will be discarded */
-/*************************************************************//**
-Updates the lock table when a page is split and merged to
-two pages. */
-UNIV_INTERN
-void
-lock_update_split_and_merge(
+/** Update the lock table when a page is merged to the left.
+@param left left page
+@param orig_pred original predecessor of supremum on the left page before merge
+@param right merged, to-be-discarded right page */
+void lock_update_merge_left(const buf_block_t& left, const rec_t *orig_pred,
+ const page_id_t right);
+
+/** Update the locks when a page is split and merged to two pages,
+in defragmentation. */
+void lock_update_split_and_merge(
const buf_block_t* left_block, /*!< in: left page to which merged */
const rec_t* orig_pred, /*!< in: original predecessor of
supremum on the left page before merge*/
@@ -220,9 +198,9 @@ inherited from rec. */
void
lock_rec_reset_and_inherit_gap_locks(
/*=================================*/
- const buf_block_t* heir_block, /*!< in: block containing the
+ const buf_block_t& heir_block, /*!< in: block containing the
record which inherits */
- const buf_block_t* block, /*!< in: block containing the
+ const page_id_t donor, /*!< in: page containing the
record from which inherited;
does NOT reset the locks on
this record */
@@ -271,20 +249,25 @@ lock_rec_store_on_page_infimum(
record of the same page; lock
bits are reset on the
record */
-/*********************************************************************//**
-Restores the state of explicit lock requests on a single record, where the
-state was stored on the infimum of the page. */
-void
-lock_rec_restore_from_page_infimum(
-/*===============================*/
- const buf_block_t* block, /*!< in: buffer block containing rec */
- const rec_t* rec, /*!< in: record whose lock state
- is restored */
- const buf_block_t* donator);/*!< in: page (rec is not
- necessarily on this page)
- whose infimum stored the lock
- state; lock bits are reset on
- the infimum */
+/** Restore the explicit lock requests on a single record, where the
+state was stored on the infimum of a page.
+@param block buffer block containing rec
+@param rec record whose lock state is restored
+@param donator page (rec is not necessarily on this page)
+whose infimum stored the lock state; lock bits are reset on the infimum */
+void lock_rec_restore_from_page_infimum(const buf_block_t &block,
+ const rec_t *rec, page_id_t donator);
+
+/**
+Create a table lock, without checking for deadlocks or lock compatibility.
+@param table table on which the lock is created
+@param type_mode lock type and mode
+@param trx transaction
+@param c_lock conflicting lock
+@return the created lock object */
+lock_t *lock_table_create(dict_table_t *table, unsigned type_mode, trx_t *trx,
+ lock_t *c_lock= nullptr);
+
/*********************************************************************//**
Checks if locks of other transactions prevent an immediate insert of
a record. If they do, first tests if the query thread should anyway
@@ -295,8 +278,6 @@ for a gap x-lock to the lock queue.
dberr_t
lock_rec_insert_check_and_lock(
/*===========================*/
- ulint flags, /*!< in: if BTR_NO_LOCKING_FLAG bit is
- set, does nothing */
const rec_t* rec, /*!< in: record after which to insert */
buf_block_t* block, /*!< in/out: buffer block of rec */
dict_index_t* index, /*!< in: index */
@@ -319,8 +300,6 @@ lock queue.
dberr_t
lock_clust_rec_modify_check_and_lock(
/*=================================*/
- ulint flags, /*!< in: if BTR_NO_LOCKING_FLAG
- bit is set, does nothing */
const buf_block_t* block, /*!< in: buffer block of rec */
const rec_t* rec, /*!< in: record which should be
modified */
@@ -430,71 +409,41 @@ lock_clust_rec_read_check_and_lock_alt(
LOCK_REC_NOT_GAP */
que_thr_t* thr) /*!< in: query thread */
MY_ATTRIBUTE((warn_unused_result));
-/*********************************************************************//**
-Checks that a record is seen in a consistent read.
-@return true if sees, or false if an earlier version of the record
-should be retrieved */
-bool
-lock_clust_rec_cons_read_sees(
-/*==========================*/
- const rec_t* rec, /*!< in: user record which should be read or
- passed over by a read cursor */
- dict_index_t* index, /*!< in: clustered index */
- const rec_offs* offsets,/*!< in: rec_get_offsets(rec, index) */
- ReadView* view); /*!< in: consistent read view */
-/*********************************************************************//**
-Checks that a non-clustered index record is seen in a consistent read.
-NOTE that a non-clustered index page contains so little information on
-its modifications that also in the case false, the present version of
-rec may be the right, but we must check this from the clustered index
-record.
-
-@return true if certainly sees, or false if an earlier version of the
-clustered index record might be needed */
-bool
-lock_sec_rec_cons_read_sees(
-/*========================*/
- const rec_t* rec, /*!< in: user record which
- should be read or passed over
- by a read cursor */
- const dict_index_t* index, /*!< in: index */
- const ReadView* view) /*!< in: consistent read view */
- MY_ATTRIBUTE((warn_unused_result));
-/*********************************************************************//**
-Locks the specified database table in the mode given. If the lock cannot
-be granted immediately, the query thread is put to wait.
-@return DB_SUCCESS, DB_LOCK_WAIT, or DB_DEADLOCK */
-dberr_t
-lock_table(
-/*=======*/
- unsigned flags, /*!< in: if BTR_NO_LOCKING_FLAG bit is set,
- does nothing */
- dict_table_t* table, /*!< in/out: database table
- in dictionary cache */
- lock_mode mode, /*!< in: lock mode */
- que_thr_t* thr) /*!< in: query thread */
- MY_ATTRIBUTE((warn_unused_result));
-/*********************************************************************//**
-Creates a table IX lock object for a resurrected transaction. */
-void
-lock_table_ix_resurrect(
-/*====================*/
- dict_table_t* table, /*!< in/out: table */
- trx_t* trx); /*!< in/out: transaction */
+/** Acquire a table lock.
+@param table table to be locked
+@param fktable pointer to table, in case of a FOREIGN key check
+@param mode lock mode
+@param thr SQL execution thread
+@retval DB_SUCCESS if the lock was acquired
+@retval DB_DEADLOCK if a deadlock occurred, or fktable && *fktable != table
+@retval DB_LOCK_WAIT if lock_wait() must be invoked */
+dberr_t lock_table(dict_table_t *table, dict_table_t *const*fktable,
+ lock_mode mode, que_thr_t *thr)
+ MY_ATTRIBUTE((warn_unused_result));
+
+/** Create a table lock object for a resurrected transaction.
+@param table table to be X-locked
+@param trx transaction
+@param mode LOCK_X or LOCK_IX */
+void lock_table_resurrect(dict_table_t *table, trx_t *trx, lock_mode mode);
/** Sets a lock on a table based on the given mode.
-@param[in] table table to lock
-@param[in,out] trx transaction
-@param[in] mode LOCK_X or LOCK_S
-@return error code or DB_SUCCESS. */
-dberr_t
-lock_table_for_trx(
- dict_table_t* table,
- trx_t* trx,
- enum lock_mode mode)
+@param table table to lock
+@param trx transaction
+@param mode LOCK_X or LOCK_S
+@param no_wait whether to skip handling DB_LOCK_WAIT
+@return error code */
+dberr_t lock_table_for_trx(dict_table_t *table, trx_t *trx, lock_mode mode,
+ bool no_wait= false)
MY_ATTRIBUTE((nonnull, warn_unused_result));
+/** Exclusively lock the data dictionary tables.
+@param trx dictionary transaction
+@return error code
+@retval DB_SUCCESS on success */
+dberr_t lock_sys_tables(trx_t *trx);
+
/*************************************************************//**
Removes a granted record lock of a transaction from the queue and grants
locks to other transactions waiting in the queue if they now are entitled
@@ -504,7 +453,7 @@ lock_rec_unlock(
/*============*/
trx_t* trx, /*!< in/out: transaction that has
set a record lock */
- const buf_block_t* block, /*!< in: buffer block containing rec */
+ const page_id_t id, /*!< in: page containing rec */
const rec_t* rec, /*!< in: record */
lock_mode lock_mode);/*!< in: LOCK_S or LOCK_X */
@@ -512,17 +461,17 @@ lock_rec_unlock(
and release possible other transactions waiting because of these locks. */
void lock_release(trx_t* trx);
+/** Release the explicit locks of a committing transaction while
+dict_sys.latch is exclusively locked,
+and release possible other transactions waiting because of these locks. */
+void lock_release_on_drop(trx_t *trx);
+
/** Release non-exclusive locks on XA PREPARE,
and release possible other transactions waiting because of these locks. */
void lock_release_on_prepare(trx_t *trx);
-/*************************************************************//**
-Get the lock hash table */
-UNIV_INLINE
-hash_table_t*
-lock_hash_get(
-/*==========*/
- ulint mode); /*!< in: lock mode */
+/** Release locks on a table whose creation is being rolled back */
+ATTRIBUTE_COLD void lock_release_on_rollback(trx_t *trx, dict_table_t *table);
/**********************************************************************//**
Looks for a set bit in a record lock bitmap. Returns ULINT_UNDEFINED,
@@ -559,124 +508,48 @@ lock_report_trx_id_insanity(
trx_id_t max_trx_id); /*!< in: trx_sys.get_max_trx_id() */
/*********************************************************************//**
Prints info of locks for all transactions.
-@return FALSE if not able to obtain lock mutex and exits without
-printing info */
+@return FALSE if not able to acquire lock_sys.latch (and display info) */
ibool
lock_print_info_summary(
/*====================*/
FILE* file, /*!< in: file where to print */
- ibool nowait) /*!< in: whether to wait for the lock mutex */
+ ibool nowait) /*!< in: whether to wait for lock_sys.latch */
MY_ATTRIBUTE((warn_unused_result));
/** Prints transaction lock wait and MVCC state.
@param[in,out] file file where to print
@param[in] trx transaction
-@param[in] now current time */
-void
-lock_trx_print_wait_and_mvcc_state(FILE* file, const trx_t* trx, time_t now);
+@param[in] now current my_hrtime_coarse() */
+void lock_trx_print_wait_and_mvcc_state(FILE *file, const trx_t *trx,
+ my_hrtime_t now);
/*********************************************************************//**
-Prints info of locks for each transaction. This function assumes that the
-caller holds the lock mutex and more importantly it will release the lock
-mutex on behalf of the caller. (This should be fixed in the future). */
+Prints info of locks for each transaction. This function will release
+lock_sys.latch, which the caller must be holding in exclusive mode. */
void
lock_print_info_all_transactions(
/*=============================*/
FILE* file); /*!< in: file where to print */
-/*********************************************************************//**
-Return approximate number or record locks (bits set in the bitmap) for
-this transaction. Since delete-marked records may be removed, the
-record count will not be precise.
-The caller must be holding lock_sys.mutex. */
-ulint
-lock_number_of_rows_locked(
-/*=======================*/
- const trx_lock_t* trx_lock) /*!< in: transaction locks */
- MY_ATTRIBUTE((warn_unused_result));
/*********************************************************************//**
Return the number of table locks for a transaction.
-The caller must be holding lock_sys.mutex. */
+The caller must be holding lock_sys.latch. */
ulint
lock_number_of_tables_locked(
/*=========================*/
const trx_lock_t* trx_lock) /*!< in: transaction locks */
MY_ATTRIBUTE((warn_unused_result));
-/*******************************************************************//**
-Gets the type of a lock. Non-inline version for using outside of the
-lock module.
-@return LOCK_TABLE or LOCK_REC */
-ulint
-lock_get_type(
-/*==========*/
- const lock_t* lock); /*!< in: lock */
-
-/*******************************************************************//**
-Gets the id of the table on which the lock is.
-@return id of the table */
-table_id_t
-lock_get_table_id(
-/*==============*/
- const lock_t* lock); /*!< in: lock */
-
-/** Determine which table a lock is associated with.
-@param[in] lock the lock
-@return name of the table */
-const table_name_t&
-lock_get_table_name(
- const lock_t* lock);
-
-/*******************************************************************//**
-For a record lock, gets the index on which the lock is.
-@return index */
-const dict_index_t*
-lock_rec_get_index(
-/*===============*/
- const lock_t* lock); /*!< in: lock */
-
-/*******************************************************************//**
-For a record lock, gets the name of the index on which the lock is.
-The string should not be free()'d or modified.
-@return name of the index */
-const char*
-lock_rec_get_index_name(
-/*====================*/
- const lock_t* lock); /*!< in: lock */
-
-/*******************************************************************//**
-Check if there are any locks (table or rec) against table.
-@return TRUE if locks exist */
-bool
-lock_table_has_locks(
-/*=================*/
- const dict_table_t* table); /*!< in: check if there are any locks
- held on records in this table or on the
- table itself */
+/** Check if there are any locks on a table.
+@return true if table has either table or record locks. */
+bool lock_table_has_locks(dict_table_t *table);
-/** A task which wakes up threads whose lock wait may have lasted too long */
-void lock_wait_timeout_task(void*);
-
-/********************************************************************//**
-Releases a user OS thread waiting for a lock to be released, if the
-thread is already suspended. */
-void
-lock_wait_release_thread_if_suspended(
-/*==================================*/
- que_thr_t* thr); /*!< in: query thread associated with the
- user OS thread */
-
-/***************************************************************//**
-Puts a user OS thread to wait for a lock to be released. If an error
-occurs during the wait trx->error_state associated with thr is
-!= DB_SUCCESS when we return. DB_LOCK_WAIT_TIMEOUT and DB_DEADLOCK
-are possible errors. DB_DEADLOCK is returned if selective deadlock
-resolution chose this transaction as a victim. */
-void
-lock_wait_suspend_thread(
-/*=====================*/
- que_thr_t* thr); /*!< in: query thread associated with the
- user OS thread */
+/** Wait for a lock to be released.
+@retval DB_DEADLOCK if this transaction was chosen as the deadlock victim
+@retval DB_INTERRUPTED if the execution was interrupted by the user
+@retval DB_LOCK_WAIT_TIMEOUT if the lock wait timed out
+@retval DB_SUCCESS if the lock was granted */
+dberr_t lock_wait(que_thr_t *thr);
/*********************************************************************//**
Unlocks AUTO_INC type locks that were possibly reserved by a trx. This
function should be called at the the end of an SQL statement, by the
@@ -685,28 +558,15 @@ void
lock_unlock_table_autoinc(
/*======================*/
trx_t* trx); /*!< in/out: transaction */
-/*********************************************************************//**
-Check whether the transaction has already been rolled back because it
-was selected as a deadlock victim, or if it has to wait then cancel
-the wait lock.
-@return DB_DEADLOCK, DB_LOCK_WAIT or DB_SUCCESS */
-dberr_t
-lock_trx_handle_wait(
-/*=================*/
- trx_t* trx); /*!< in/out: trx lock state */
-/*********************************************************************//**
-Get the number of locks on a table.
-@return number of locks */
-ulint
-lock_table_get_n_locks(
-/*===================*/
- const dict_table_t* table); /*!< in: table */
-/*******************************************************************//**
-Initialise the trx lock list. */
-void
-lock_trx_lock_list_init(
-/*====================*/
- trx_lock_list_t* lock_list); /*!< List to initialise */
+
+/** Handle a pending lock wait (DB_LOCK_WAIT) in a semi-consistent read
+while holding a clustered index leaf page latch.
+@param trx transaction that is or was waiting for a lock
+@retval DB_SUCCESS if the lock was granted
+@retval DB_DEADLOCK if the transaction must be aborted due to a deadlock
+@retval DB_LOCK_WAIT if a lock wait would be necessary; the pending
+ lock request was released */
+dberr_t lock_trx_handle_wait(trx_t *trx);
/*********************************************************************//**
Checks that a transaction id is sensible, i.e., not in the future.
@@ -732,16 +592,11 @@ lock_trx_has_sys_table_locks(
/** Check if the transaction holds an explicit exclusive lock on a record.
@param[in] trx transaction
@param[in] table table
-@param[in] block leaf page
+@param[in] id leaf page identifier
@param[in] heap_no heap number identifying the record
@return whether an explicit X-lock is held */
-bool
-lock_trx_has_expl_x_lock(
- const trx_t* trx, /*!< in: transaction to check */
- const dict_table_t* table, /*!< in: table to check */
- const buf_block_t* block, /*!< in: buffer block of the record */
- ulint heap_no)/*!< in: record heap number */
- MY_ATTRIBUTE((nonnull, warn_unused_result));
+bool lock_trx_has_expl_x_lock(const trx_t &trx, const dict_table_t &table,
+ page_id_t id, ulint heap_no);
#endif /* UNIV_DEBUG */
/** Lock operation struct */
@@ -750,47 +605,164 @@ struct lock_op_t{
lock_mode mode; /*!< lock mode */
};
-typedef ib_mutex_t LockMutex;
-
/** The lock system struct */
class lock_sys_t
{
+ friend struct LockGuard;
+ friend struct LockMultiGuard;
+ friend struct TMLockGuard;
+ friend struct TMLockMutexGuard;
+ friend struct TMLockTrxGuard;
+
+ /** Hash table latch */
+ struct hash_latch
+#ifdef SUX_LOCK_GENERIC
+ : private rw_lock
+ {
+ /** Wait for an exclusive lock */
+ void wait();
+ /** Try to acquire a lock */
+ bool try_acquire() { return write_trylock(); }
+ /** Acquire a lock */
+ void acquire() { if (!try_acquire()) wait(); }
+ /** Release a lock */
+ void release();
+ /** @return whether any lock is being held or waited for by any thread */
+ bool is_locked_or_waiting() const
+ { return rw_lock::is_locked_or_waiting(); }
+ /** @return whether this latch is possibly held by any thread */
+ bool is_locked() const { return rw_lock::is_locked(); }
+#else
+ {
+ private:
+ srw_spin_lock_low lock;
+ public:
+ /** Try to acquire a lock */
+ bool try_acquire() { return lock.wr_lock_try(); }
+ /** Acquire a lock */
+ void acquire() { lock.wr_lock(); }
+ /** Release a lock */
+ void release() { lock.wr_unlock(); }
+ /** @return whether any lock may be held by any thread */
+ bool is_locked_or_waiting() const noexcept
+ { return lock.is_locked_or_waiting(); }
+ /** @return whether this latch is possibly held by any thread */
+ bool is_locked() const noexcept { return lock.is_locked(); }
+#endif
+ };
+
+public:
+ struct hash_table
+ {
+ /** Number of consecutive array[] elements occupied by a hash_latch */
+ static constexpr size_t LATCH= sizeof(void*) >= sizeof(hash_latch) ? 1 : 2;
+ static_assert(sizeof(hash_latch) <= LATCH * sizeof(void*), "allocation");
+
+ /** Number of array[] elements per hash_latch.
+ Must be LATCH less than a power of 2. */
+ static constexpr size_t ELEMENTS_PER_LATCH= (64 / sizeof(void*)) - LATCH;
+ static constexpr size_t EMPTY_SLOTS_PER_LATCH=
+ ((CPU_LEVEL1_DCACHE_LINESIZE / 64) - 1) * (64 / sizeof(void*));
+
+ /** number of payload elements in array[]. Protected by lock_sys.latch. */
+ ulint n_cells;
+ /** the hash table, with pad(n_cells) elements, aligned to L1 cache size;
+ in any hash chain, lock_t::is_waiting() entries must not precede
+ granted locks */
+ hash_cell_t *array;
+
+ /** Create the hash table.
+ @param n the lower bound of n_cells */
+ void create(ulint n);
+
+ /** Resize the hash table.
+ @param n the lower bound of n_cells */
+ void resize(ulint n);
+
+ /** Free the hash table. */
+ void free() { aligned_free(array); array= nullptr; }
+
+ /** @return the index of an array element */
+ inline ulint calc_hash(ulint fold) const;
+
+ /** @return raw array index converted to padded index */
+ static ulint pad(ulint h)
+ {
+ ulint latches= LATCH * (h / ELEMENTS_PER_LATCH);
+ ulint empty_slots= (h / ELEMENTS_PER_LATCH) * EMPTY_SLOTS_PER_LATCH;
+ return LATCH + latches + empty_slots + h;
+ }
+
+ /** Get a latch. */
+ static hash_latch *latch(hash_cell_t *cell)
+ {
+ void *l= ut_align_down(cell, sizeof *cell *
+ (ELEMENTS_PER_LATCH + LATCH));
+ return static_cast<hash_latch*>(l);
+ }
+ /** Get a hash table cell. */
+ inline hash_cell_t *cell_get(ulint fold) const;
+
+#ifdef UNIV_DEBUG
+ void assert_locked(const page_id_t id) const;
+#else
+ void assert_locked(const page_id_t) const {}
+#endif
+
+ private:
+ /** @return the hash value before any ELEMENTS_PER_LATCH padding */
+ static ulint hash(ulint fold, ulint n) { return ut_hash_ulint(fold, n); }
+
+ /** @return the index of an array element */
+ static ulint calc_hash(ulint fold, ulint n_cells)
+ {
+ return pad(hash(fold, n_cells));
+ }
+ };
+
+private:
bool m_initialised;
+ /** mutex protecting the locks */
+ alignas(CPU_LEVEL1_DCACHE_LINESIZE) srw_spin_lock latch;
+#ifdef UNIV_DEBUG
+ /** The owner of exclusive latch (0 if none); protected by latch */
+ std::atomic<pthread_t> writer{0};
+ /** Number of shared latches */
+ std::atomic<ulint> readers{0};
+#endif
+#ifdef SUX_LOCK_GENERIC
+protected:
+ /** mutex for hash_latch::wait() */
+ pthread_mutex_t hash_mutex;
+ /** condition variable for hash_latch::wait() */
+ pthread_cond_t hash_cond;
+#endif
public:
- MY_ALIGNED(CACHE_LINE_SIZE)
- LockMutex mutex; /*!< Mutex protecting the
- locks */
/** record locks */
- hash_table_t rec_hash;
+ hash_table rec_hash;
/** predicate locks for SPATIAL INDEX */
- hash_table_t prdt_hash;
+ hash_table prdt_hash;
/** page locks for SPATIAL INDEX */
- hash_table_t prdt_page_hash;
-
- MY_ALIGNED(CACHE_LINE_SIZE)
- LockMutex wait_mutex; /*!< Mutex protecting the
- next two fields */
- srv_slot_t* waiting_threads; /*!< Array of user threads
- suspended while waiting for
- locks within InnoDB, protected
- by the lock_sys.wait_mutex;
- os_event_set() and
- os_event_reset() on
- waiting_threads[]->event
- are protected by
- trx_t::mutex */
- srv_slot_t* last_slot; /*!< highest slot ever used
- in the waiting_threads array,
- protected by
- lock_sys.wait_mutex */
-
- ulint n_lock_max_wait_time; /*!< Max wait time */
-
- std::unique_ptr<tpool::timer> timeout_timer; /*!< Thread pool timer task */
- bool timeout_timer_active;
-
-
+ hash_table prdt_page_hash;
+
+ /** mutex covering lock waits; @see trx_lock_t::wait_lock */
+ alignas(CPU_LEVEL1_DCACHE_LINESIZE) mysql_mutex_t wait_mutex;
+private:
+ /** The increment of wait_count for a wait. Anything smaller is a
+ pending wait count. */
+ static constexpr uint64_t WAIT_COUNT_STEP= 1U << 19;
+ /** waits and total number of lock waits; protected by wait_mutex */
+ uint64_t wait_count;
+ /** Cumulative wait time; protected by wait_mutex */
+ uint32_t wait_time;
+ /** Longest wait time; protected by wait_mutex */
+ uint32_t wait_time_max;
+public:
+ /** number of deadlocks detected; protected by wait_mutex */
+ ulint deadlocks;
+ /** number of lock wait timeouts; protected by wait_mutex */
+ ulint timeouts;
/**
Constructor.
@@ -800,8 +772,99 @@ public:
lock_sys_t(): m_initialised(false) {}
- bool is_initialised() { return m_initialised; }
+ bool is_initialised() const { return m_initialised; }
+
+#ifdef UNIV_PFS_RWLOCK
+ /** Acquire exclusive lock_sys.latch */
+ ATTRIBUTE_NOINLINE
+ void wr_lock(const char *file, unsigned line);
+ /** Release exclusive lock_sys.latch */
+ ATTRIBUTE_NOINLINE void wr_unlock();
+ /** Acquire shared lock_sys.latch */
+ ATTRIBUTE_NOINLINE void rd_lock(const char *file, unsigned line);
+ /** Release shared lock_sys.latch */
+ ATTRIBUTE_NOINLINE void rd_unlock();
+#else
+ /** Acquire exclusive lock_sys.latch */
+ void wr_lock()
+ {
+ mysql_mutex_assert_not_owner(&wait_mutex);
+ ut_ad(!is_writer());
+ latch.wr_lock();
+ ut_ad(!writer.exchange(pthread_self(),
+ std::memory_order_relaxed));
+ }
+ /** Release exclusive lock_sys.latch */
+ void wr_unlock()
+ {
+ ut_ad(writer.exchange(0, std::memory_order_relaxed) ==
+ pthread_self());
+ latch.wr_unlock();
+ }
+ /** Acquire shared lock_sys.latch */
+ void rd_lock()
+ {
+ mysql_mutex_assert_not_owner(&wait_mutex);
+ ut_ad(!is_writer());
+ latch.rd_lock();
+ ut_ad(!writer.load(std::memory_order_relaxed));
+ ut_d(readers.fetch_add(1, std::memory_order_relaxed));
+ }
+ /** Release shared lock_sys.latch */
+ void rd_unlock()
+ {
+ ut_ad(!is_writer());
+ ut_ad(readers.fetch_sub(1, std::memory_order_relaxed));
+ latch.rd_unlock();
+ }
+#endif
+ /** Try to acquire exclusive lock_sys.latch
+ @return whether the latch was acquired */
+ bool wr_lock_try()
+ {
+ ut_ad(!is_writer());
+ if (!latch.wr_lock_try()) return false;
+ ut_ad(!writer.exchange(pthread_self(),
+ std::memory_order_relaxed));
+ return true;
+ }
+ /** Try to acquire shared lock_sys.latch
+ @return whether the latch was acquired */
+ bool rd_lock_try()
+ {
+ ut_ad(!is_writer());
+ if (!latch.rd_lock_try()) return false;
+ ut_ad(!writer.load(std::memory_order_relaxed));
+ ut_d(readers.fetch_add(1, std::memory_order_relaxed));
+ return true;
+ }
+ /** Assert that wr_lock() has been invoked by this thread */
+ void assert_locked() const { ut_ad(is_writer()); }
+ /** Assert that wr_lock() has not been invoked by this thread */
+ void assert_unlocked() const { ut_ad(!is_writer()); }
+#ifdef UNIV_DEBUG
+ /** @return whether the current thread is the lock_sys.latch writer */
+ bool is_writer() const
+ {
+# ifdef SUX_LOCK_GENERIC
+ return writer.load(std::memory_order_relaxed) == pthread_self();
+# else
+ return writer.load(std::memory_order_relaxed) == pthread_self() ||
+ (xtest() && !latch.is_locked_or_waiting());
+# endif
+ }
+ /** Assert that a lock shard is exclusively latched (by some thread) */
+ void assert_locked(const lock_t &lock) const;
+ /** Assert that a table lock shard is exclusively latched by this thread */
+ void assert_locked(const dict_table_t &table) const;
+ /** Assert that a hash table cell is exclusively latched (by some thread) */
+ void assert_locked(const hash_cell_t &cell) const;
+#else
+ void assert_locked(const lock_t &) const {}
+ void assert_locked(const dict_table_t &) const {}
+ void assert_locked(const hash_cell_t &) const {}
+#endif
/**
Creates the lock system at database start.
@@ -822,45 +885,296 @@ public:
/** Closes the lock system at database shutdown. */
void close();
- /** @return the hash value for a page address */
- ulint hash(const page_id_t id) const
- { ut_ad(mutex_own(&mutex)); return rec_hash.calc_hash(id.fold()); }
- /** Get the first lock on a page.
- @param lock_hash hash table to look at
- @param id page number
- @return first lock
- @retval nullptr if none exists */
- lock_t *get_first(const hash_table_t &lock_hash, const page_id_t id) const
+ /** Check for deadlocks while holding only lock_sys.wait_mutex. */
+ void deadlock_check();
+
+ /** Cancel a waiting lock request.
+ @tparam check_victim whether to check for DB_DEADLOCK
+ @param trx active transaction
+ @param lock waiting lock request
+ @retval DB_SUCCESS if no lock existed
+ @retval DB_DEADLOCK if trx->lock.was_chosen_as_deadlock_victim was set
+ @retval DB_LOCK_WAIT if the lock was canceled */
+ template<bool check_victim>
+ static dberr_t cancel(trx_t *trx, lock_t *lock);
+ /** Cancel a waiting lock request (if any) when killing a transaction */
+ static void cancel(trx_t *trx);
+
+ /** Note that a record lock wait started */
+ inline void wait_start();
+
+ /** Note that a record lock wait resumed */
+ inline void wait_resume(THD *thd, my_hrtime_t start, my_hrtime_t now);
+
+ /** @return pending number of lock waits */
+ ulint get_wait_pending() const
+ {
+ return static_cast<ulint>(wait_count & (WAIT_COUNT_STEP - 1));
+ }
+ /** @return cumulative number of lock waits */
+ ulint get_wait_cumulative() const
+ { return static_cast<ulint>(wait_count / WAIT_COUNT_STEP); }
+ /** Cumulative wait time; protected by wait_mutex */
+ ulint get_wait_time_cumulative() const { return wait_time; }
+ /** Longest wait time; protected by wait_mutex */
+ ulint get_wait_time_max() const { return wait_time_max; }
+
+ /** Get the lock hash table for a mode */
+ hash_table &hash_get(ulint mode)
{
- ut_ad(&lock_hash == &rec_hash || &lock_hash == &prdt_hash ||
- &lock_hash == &prdt_page_hash);
- for (lock_t *lock= static_cast<lock_t*>
- (HASH_GET_FIRST(&lock_hash, hash(id)));
- lock; lock= static_cast<lock_t*>(HASH_GET_NEXT(hash, lock)))
- if (lock->un_member.rec_lock.page_id == id)
- return lock;
- return nullptr;
+ if (UNIV_LIKELY(!(mode & (LOCK_PREDICATE | LOCK_PRDT_PAGE))))
+ return rec_hash;
+ return (mode & LOCK_PREDICATE) ? prdt_hash : prdt_page_hash;
}
- /** Get the first record lock on a page.
- @param id page number
- @return first lock
- @retval nullptr if none exists */
- lock_t *get_first(const page_id_t id) const
- { return get_first(rec_hash, id); }
- /** Get the first predicate lock on a SPATIAL INDEX page.
+ /** Get the lock hash table for a predicate mode */
+ hash_table &prdt_hash_get(bool page)
+ { return page ? prdt_page_hash : prdt_hash; }
+
+ /** Get the first lock on a page.
+ @param cell hash table cell
@param id page number
@return first lock
@retval nullptr if none exists */
- lock_t *get_first_prdt(const page_id_t id) const
- { return get_first(prdt_hash, id); }
- /** Get the first predicate lock on a SPATIAL INDEX page.
- @param id page number
+ static inline lock_t *get_first(const hash_cell_t &cell, page_id_t id);
+
+ /** Get the first explicit lock request on a record.
+ @param cell first lock hash table cell
+ @param id page identifier
+ @param heap_no record identifier in page
@return first lock
@retval nullptr if none exists */
- lock_t *get_first_prdt_page(const page_id_t id) const
- { return get_first(prdt_page_hash, id); }
+ static inline lock_t *get_first(const hash_cell_t &cell, page_id_t id,
+ ulint heap_no);
+
+ /** Remove locks on a discarded SPATIAL INDEX page.
+ @param id page to be discarded
+ @param all whether to discard also from lock_sys.prdt_hash */
+ void prdt_page_free_from_discard(const page_id_t id, bool all= false);
+
+ /** Cancel possible lock waiting for a transaction */
+ static void cancel_lock_wait_for_trx(trx_t *trx);
+};
+
+/** The lock system */
+extern lock_sys_t lock_sys;
+
+/** @return the index of an array element */
+inline ulint lock_sys_t::hash_table::calc_hash(ulint fold) const
+{
+ ut_ad(lock_sys.is_writer() || lock_sys.readers);
+ return calc_hash(fold, n_cells);
+}
+
+/** Get a hash table cell. */
+inline hash_cell_t *lock_sys_t::hash_table::cell_get(ulint fold) const
+{
+ ut_ad(lock_sys.is_writer() || lock_sys.readers);
+ return &array[calc_hash(fold)];
+}
+
+/** Get the first lock on a page.
+@param cell hash table cell
+@param id page number
+@return first lock
+@retval nullptr if none exists */
+inline lock_t *lock_sys_t::get_first(const hash_cell_t &cell, page_id_t id)
+{
+ lock_sys.assert_locked(cell);
+ for (auto lock= static_cast<lock_t*>(cell.node); lock; lock= lock->hash)
+ {
+ ut_ad(!lock->is_table());
+ if (lock->un_member.rec_lock.page_id == id)
+ return lock;
+ }
+ return nullptr;
+}
+
+/** lock_sys.latch exclusive guard */
+struct LockMutexGuard
+{
+ LockMutexGuard(SRW_LOCK_ARGS(const char *file, unsigned line))
+ { lock_sys.wr_lock(SRW_LOCK_ARGS(file, line)); }
+ ~LockMutexGuard() { lock_sys.wr_unlock(); }
+};
+
+/** lock_sys latch guard for 1 page_id_t */
+struct LockGuard
+{
+ LockGuard(lock_sys_t::hash_table &hash, const page_id_t id);
+ ~LockGuard()
+ {
+ lock_sys_t::hash_table::latch(cell_)->release();
+ /* Must be last, to avoid a race with lock_sys_t::hash_table::resize() */
+ lock_sys.rd_unlock();
+ }
+ /** @return the hash array cell */
+ hash_cell_t &cell() const { return *cell_; }
+private:
+ /** The hash array cell */
+ hash_cell_t *cell_;
+};
+
+/** lock_sys latch guard for 2 page_id_t */
+struct LockMultiGuard
+{
+ LockMultiGuard(lock_sys_t::hash_table &hash,
+ const page_id_t id1, const page_id_t id2);
+ ~LockMultiGuard();
+
+ /** @return the first hash array cell */
+ hash_cell_t &cell1() const { return *cell1_; }
+ /** @return the second hash array cell */
+ hash_cell_t &cell2() const { return *cell2_; }
+private:
+ /** The first hash array cell */
+ hash_cell_t *cell1_;
+ /** The second hash array cell */
+ hash_cell_t *cell2_;
+};
+
+/** lock_sys.latch exclusive guard using transactional memory */
+struct TMLockMutexGuard
+{
+ TRANSACTIONAL_INLINE
+ TMLockMutexGuard(SRW_LOCK_ARGS(const char *file, unsigned line))
+ {
+#if !defined NO_ELISION && !defined SUX_LOCK_GENERIC
+ if (xbegin())
+ {
+ if (was_elided())
+ return;
+ xabort();
+ }
+#endif
+ lock_sys.wr_lock(SRW_LOCK_ARGS(file, line));
+ }
+ TRANSACTIONAL_INLINE
+ ~TMLockMutexGuard()
+ {
+#if !defined NO_ELISION && !defined SUX_LOCK_GENERIC
+ if (was_elided()) xend(); else
+#endif
+ lock_sys.wr_unlock();
+ }
+
+#if !defined NO_ELISION && !defined SUX_LOCK_GENERIC
+ bool was_elided() const noexcept
+ { return !lock_sys.latch.is_locked_or_waiting(); }
+#else
+ bool was_elided() const noexcept { return false; }
+#endif
+};
+
+/** lock_sys latch guard for 1 page_id_t, using transactional memory */
+struct TMLockGuard
+{
+ TRANSACTIONAL_TARGET
+ TMLockGuard(lock_sys_t::hash_table &hash, const page_id_t id);
+ TRANSACTIONAL_INLINE ~TMLockGuard()
+ {
+#if !defined NO_ELISION && !defined SUX_LOCK_GENERIC
+ if (elided)
+ {
+ xend();
+ return;
+ }
+#endif
+ lock_sys_t::hash_table::latch(cell_)->release();
+ /* Must be last, to avoid a race with lock_sys_t::hash_table::resize() */
+ lock_sys.rd_unlock();
+ }
+ /** @return the hash array cell */
+ hash_cell_t &cell() const { return *cell_; }
+private:
+ /** The hash array cell */
+ hash_cell_t *cell_;
+#if !defined NO_ELISION && !defined SUX_LOCK_GENERIC
+ /** whether the latches were elided */
+ bool elided;
+#endif
+};
+
+/** guard for shared lock_sys.latch and trx_t::mutex using
+transactional memory */
+struct TMLockTrxGuard
+{
+ trx_t &trx;
+
+ TRANSACTIONAL_INLINE
+#ifndef UNIV_PFS_RWLOCK
+ TMLockTrxGuard(trx_t &trx) : trx(trx)
+# define TMLockTrxArgs(trx) trx
+#else
+ TMLockTrxGuard(const char *file, unsigned line, trx_t &trx) : trx(trx)
+# define TMLockTrxArgs(trx) SRW_LOCK_CALL, trx
+#endif
+ {
+#if !defined NO_ELISION && !defined SUX_LOCK_GENERIC
+ if (xbegin())
+ {
+ if (!lock_sys.latch.is_write_locked() && was_elided())
+ return;
+ xabort();
+ }
+#endif
+ lock_sys.rd_lock(SRW_LOCK_ARGS(file, line));
+ trx.mutex_lock();
+ }
+ TRANSACTIONAL_INLINE
+ ~TMLockTrxGuard()
+ {
+#if !defined NO_ELISION && !defined SUX_LOCK_GENERIC
+ if (was_elided())
+ {
+ xend();
+ return;
+ }
+#endif
+ lock_sys.rd_unlock();
+ trx.mutex_unlock();
+ }
+#if !defined NO_ELISION && !defined SUX_LOCK_GENERIC
+ bool was_elided() const noexcept { return !trx.mutex_is_locked(); }
+#else
+ bool was_elided() const noexcept { return false; }
+#endif
+};
+
+/** guard for trx_t::mutex using transactional memory */
+struct TMTrxGuard
+{
+ trx_t &trx;
+
+ TRANSACTIONAL_INLINE TMTrxGuard(trx_t &trx) : trx(trx)
+ {
+#if !defined NO_ELISION && !defined SUX_LOCK_GENERIC
+ if (xbegin())
+ {
+ if (was_elided())
+ return;
+ xabort();
+ }
+#endif
+ trx.mutex_lock();
+ }
+ TRANSACTIONAL_INLINE ~TMTrxGuard()
+ {
+#if !defined NO_ELISION && !defined SUX_LOCK_GENERIC
+ if (was_elided())
+ {
+ xend();
+ return;
+ }
+#endif
+ trx.mutex_unlock();
+ }
+#if !defined NO_ELISION && !defined SUX_LOCK_GENERIC
+ bool was_elided() const noexcept { return !trx.mutex_is_locked(); }
+#else
+ bool was_elided() const noexcept { return false; }
+#endif
};
/*********************************************************************//**
@@ -871,13 +1185,8 @@ UNIV_INLINE
lock_t*
lock_rec_create(
/*============*/
-#ifdef WITH_WSREP
lock_t* c_lock, /*!< conflicting lock */
- que_thr_t* thr, /*!< thread owning trx */
-#endif
- unsigned type_mode,/*!< in: lock mode and wait
- flag, type is ignored and
- replaced by LOCK_REC */
+ unsigned type_mode,/*!< in: lock mode and wait flag */
const buf_block_t* block, /*!< in: buffer block containing
the record */
ulint heap_no,/*!< in: heap number of the record */
@@ -887,19 +1196,15 @@ lock_rec_create(
/*!< in: true if caller owns
trx mutex */
-/*************************************************************//**
-Removes a record lock request, waiting or granted, from the queue. */
-void
-lock_rec_discard(
-/*=============*/
- lock_t* in_lock); /*!< in: record lock object: all
- record locks which are contained
- in this lock object are removed */
+/** Remove a record lock request, waiting or granted, on a discarded page
+@param lock_hash hash table
+@param in_lock lock object */
+void lock_rec_discard(lock_sys_t::hash_table &lock_hash, lock_t *in_lock);
/** Create a new record lock and inserts it to the lock queue,
without checking for deadlocks or conflicts.
-@param[in] type_mode lock mode and wait flag; type will be replaced
- with LOCK_REC
+@param[in] c_lock conflicting lock, or NULL
+@param[in] type_mode lock mode and wait flag
@param[in] page_id index page number
@param[in] page R-tree index page, or NULL
@param[in] heap_no record heap number in the index page
@@ -909,10 +1214,7 @@ without checking for deadlocks or conflicts.
@return created lock */
lock_t*
lock_rec_create_low(
-#ifdef WITH_WSREP
- lock_t* c_lock, /*!< conflicting lock */
- que_thr_t* thr, /*!< thread owning trx */
-#endif
+ lock_t* c_lock,
unsigned type_mode,
const page_id_t page_id,
const page_t* page,
@@ -920,8 +1222,10 @@ lock_rec_create_low(
dict_index_t* index,
trx_t* trx,
bool holds_trx_mutex);
+
/** Enqueue a waiting request for a lock which cannot be granted immediately.
Check for deadlocks.
+@param[in] c_lock conflicting lock
@param[in] type_mode the requested lock mode (LOCK_S or LOCK_X)
possibly ORed with LOCK_GAP or
LOCK_REC_NOT_GAP, ORed with
@@ -929,22 +1233,20 @@ Check for deadlocks.
waiting lock request is set
when performing an insert of
an index record
-@param[in] block leaf page in the index
+@param[in] id page identifier
+@param[in] page leaf page in the index
@param[in] heap_no record heap number in the block
@param[in] index index tree
@param[in,out] thr query thread
@param[in] prdt minimum bounding box (spatial index)
@retval DB_LOCK_WAIT if the waiting lock was enqueued
-@retval DB_DEADLOCK if this transaction was chosen as the victim
-@retval DB_SUCCESS_LOCKED_REC if the other transaction was chosen as a victim
- (or it happened to commit) */
+@retval DB_DEADLOCK if this transaction was chosen as the victim */
dberr_t
lock_rec_enqueue_waiting(
-#ifdef WITH_WSREP
- lock_t* c_lock, /*!< conflicting lock */
-#endif
+ lock_t* c_lock,
unsigned type_mode,
- const buf_block_t* block,
+ const page_id_t id,
+ const page_t* page,
ulint heap_no,
dict_index_t* index,
que_thr_t* thr,
@@ -962,67 +1264,6 @@ lock_rtr_move_rec_list(
moved */
ulint num_move); /*!< in: num of rec to move */
-/*************************************************************//**
-Removes record lock objects set on an index page which is discarded. This
-function does not move locks, or check for waiting locks, therefore the
-lock bitmaps must already be reset when this function is called. */
-void
-lock_rec_free_all_from_discard_page(
-/*================================*/
- const buf_block_t* block); /*!< in: page to be discarded */
-
-/** The lock system */
-extern lock_sys_t lock_sys;
-
-/** Test if lock_sys.mutex can be acquired without waiting. */
-#define lock_mutex_enter_nowait() \
- (lock_sys.mutex.trylock(__FILE__, __LINE__))
-
-/** Test if lock_sys.mutex is owned. */
-#define lock_mutex_own() (lock_sys.mutex.is_owned())
-
-/** Acquire the lock_sys.mutex. */
-#define lock_mutex_enter() do { \
- mutex_enter(&lock_sys.mutex); \
-} while (0)
-
-/** Release the lock_sys.mutex. */
-#define lock_mutex_exit() do { \
- lock_sys.mutex.exit(); \
-} while (0)
-
-/** Test if lock_sys.wait_mutex is owned. */
-#define lock_wait_mutex_own() (lock_sys.wait_mutex.is_owned())
-
-/** Acquire the lock_sys.wait_mutex. */
-#define lock_wait_mutex_enter() do { \
- mutex_enter(&lock_sys.wait_mutex); \
-} while (0)
-
-/** Release the lock_sys.wait_mutex. */
-#define lock_wait_mutex_exit() do { \
- lock_sys.wait_mutex.exit(); \
-} while (0)
-
-#ifdef WITH_WSREP
-/*********************************************************************//**
-Cancels a waiting lock request and releases possible other transactions
-waiting behind it. */
-UNIV_INTERN
-void
-lock_cancel_waiting_and_release(
-/*============================*/
- lock_t* lock); /*!< in/out: waiting lock request */
-
-/*******************************************************************//**
-Get lock mode and table/index name
-@return string containing lock info */
-std::string
-lock_get_info(
- const lock_t*);
-
-#endif /* WITH_WSREP */
-
#include "lock0lock.inl"
#endif
diff --git a/storage/innobase/include/lock0lock.inl b/storage/innobase/include/lock0lock.inl
index 2d5b6ff37f1..1b9255ffb3e 100644
--- a/storage/innobase/include/lock0lock.inl
+++ b/storage/innobase/include/lock0lock.inl
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 1996, 2015, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2017, 2020, MariaDB Corporation.
+Copyright (c) 2017, 2022, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -37,7 +37,7 @@ lock_get_min_heap_no(
/*=================*/
const buf_block_t* block) /*!< in: buffer block */
{
- const page_t* page = block->frame;
+ const page_t* page = block->page.frame;
if (page_is_comp(page)) {
return(rec_get_heap_no_new(
@@ -52,23 +52,6 @@ lock_get_min_heap_no(
}
}
-/*************************************************************//**
-Get the lock hash table */
-UNIV_INLINE
-hash_table_t*
-lock_hash_get(
-/*==========*/
- ulint mode) /*!< in: lock mode */
-{
- if (mode & LOCK_PREDICATE) {
- return &lock_sys.prdt_hash;
- } else if (mode & LOCK_PRDT_PAGE) {
- return &lock_sys.prdt_page_hash;
- } else {
- return &lock_sys.rec_hash;
- }
-}
-
/*********************************************************************//**
Creates a new record lock and inserts it to the lock queue. Does NOT check
for deadlocks or lock compatibility!
@@ -77,13 +60,8 @@ UNIV_INLINE
lock_t*
lock_rec_create(
/*============*/
-#ifdef WITH_WSREP
lock_t* c_lock, /*!< conflicting lock */
- que_thr_t* thr, /*!< thread owning trx */
-#endif
- unsigned type_mode,/*!< in: lock mode and wait
- flag, type is ignored and
- replaced by LOCK_REC */
+ unsigned type_mode,/*!< in: lock mode and wait flag */
const buf_block_t* block, /*!< in: buffer block containing
the record */
ulint heap_no,/*!< in: heap number of the record */
@@ -93,11 +71,8 @@ lock_rec_create(
/*!< in: TRUE if caller owns
trx mutex */
{
- btr_assert_not_corrupted(block, index);
return lock_rec_create_low(
-#ifdef WITH_WSREP
- c_lock, thr,
-#endif
- type_mode, block->page.id(), block->frame, heap_no,
+ c_lock,
+ type_mode, block->page.id(), block->page.frame, heap_no,
index, trx, caller_owns_trx_mutex);
}
diff --git a/storage/innobase/include/lock0prdt.h b/storage/innobase/include/lock0prdt.h
index 43d68996691..db8e33922c4 100644
--- a/storage/innobase/include/lock0prdt.h
+++ b/storage/innobase/include/lock0prdt.h
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 2014, 2016, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2018, 2020, MariaDB Corporation.
+Copyright (c) 2018, 2021, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -126,8 +126,6 @@ a predicate record.
dberr_t
lock_prdt_insert_check_and_lock(
/*============================*/
- ulint flags, /*!< in: if BTR_NO_LOCKING_FLAG bit is
- set, does nothing */
const rec_t* rec, /*!< in: record after which to insert */
buf_block_t* block, /*!< in/out: buffer block of rec */
dict_index_t* index, /*!< in: index */
@@ -183,8 +181,7 @@ lock_prdt_rec_move(
/*===============*/
const buf_block_t* receiver, /*!< in: buffer block containing
the receiving record */
- const buf_block_t* donator); /*!< in: buffer block containing
- the donating record */
+ const page_id_t donator); /*!< in: target page */
/** Check whether there are R-tree Page lock on a page
@param[in] trx trx to test the lock
@@ -192,13 +189,4 @@ lock_prdt_rec_move(
@return true if there is none */
bool lock_test_prdt_page_lock(const trx_t *trx, const page_id_t page_id);
-/** Removes predicate lock objects set on an index page which is discarded.
-@param[in] block page to be discarded
-@param[in] lock_hash lock hash */
-void
-lock_prdt_page_free_from_discard(
-/*=============================*/
- const buf_block_t* block,
- hash_table_t* lock_hash);
-
#endif
diff --git a/storage/innobase/include/lock0priv.h b/storage/innobase/include/lock0priv.h
index f39692903fa..b0a5f7aaf3b 100644
--- a/storage/innobase/include/lock0priv.h
+++ b/storage/innobase/include/lock0priv.h
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 2007, 2016, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2015, 2018, MariaDB Corporation.
+Copyright (c) 2015, 2022, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -64,49 +64,44 @@ operator<<(std::ostream& out, const lock_table_t& lock)
return(lock.print(out));
}
-/** Convert the member 'type_mode' into a human readable string.
-@return human readable string */
-inline
-std::string
-ib_lock_t::type_mode_string() const
-{
- std::ostringstream sout;
- sout << type_string();
- sout << " | " << lock_mode_string(mode());
-
- if (is_record_not_gap()) {
- sout << " | LOCK_REC_NOT_GAP";
- }
-
- if (is_waiting()) {
- sout << " | LOCK_WAIT";
- }
-
- if (is_gap()) {
- sout << " | LOCK_GAP";
- }
-
- if (is_insert_intention()) {
- sout << " | LOCK_INSERT_INTENTION";
- }
- return(sout.str());
-}
-
inline
std::ostream&
ib_lock_t::print(std::ostream& out) const
{
- out << "[lock_t: type_mode=" << type_mode << "("
- << type_mode_string() << ")";
-
- if (is_record_lock()) {
- out << un_member.rec_lock;
- } else {
- out << un_member.tab_lock;
- }
-
- out << "]";
- return(out);
+ static_assert(LOCK_MODE_MASK == 7, "compatibility");
+ static_assert(LOCK_IS == 0, "compatibility");
+ static_assert(LOCK_IX == 1, "compatibility");
+ static_assert(LOCK_S == 2, "compatibility");
+ static_assert(LOCK_X == 3, "compatibility");
+ static_assert(LOCK_AUTO_INC == 4, "compatibility");
+ static_assert(LOCK_NONE == 5, "compatibility");
+ static_assert(LOCK_NONE_UNSET == 7, "compatibility");
+ const char *const modes[8]=
+ { "IS", "IX", "S", "X", "AUTO_INC", "NONE", "?", "NONE_UNSET" };
+
+ out << "[lock_t: type_mode=" << type_mode << "(" << type_string()
+ << " | LOCK_" << modes[mode()];
+
+ if (is_record_not_gap())
+ out << " | LOCK_REC_NOT_GAP";
+ if (is_waiting())
+ out << " | LOCK_WAIT";
+
+ if (is_gap())
+ out << " | LOCK_GAP";
+
+ if (is_insert_intention())
+ out << " | LOCK_INSERT_INTENTION";
+
+ out << ")";
+
+ if (is_table())
+ out << un_member.tab_lock;
+ else
+ out << un_member.rec_lock;
+
+ out << "]";
+ return out;
}
inline
@@ -120,24 +115,6 @@ operator<<(std::ostream& out, const ib_lock_t& lock)
extern ibool lock_print_waits;
#endif /* UNIV_DEBUG */
-/** Restricts the length of search we will do in the waits-for
-graph of transactions */
-static const ulint LOCK_MAX_N_STEPS_IN_DEADLOCK_CHECK = 1000000;
-
-/** Restricts the search depth we will do in the waits-for graph of
-transactions */
-static const ulint LOCK_MAX_DEPTH_IN_DEADLOCK_CHECK = 200;
-
-/** When releasing transaction locks, this specifies how often we release
-the lock mutex for a moment to give also others access to it */
-static const ulint LOCK_RELEASE_INTERVAL = 1000;
-
-/* Safety margin when creating a new record lock: this many extra records
-can be inserted to the page without need to create a lock with a bigger
-bitmap */
-
-static const ulint LOCK_PAGE_BITMAP_MARGIN = 64;
-
/* An explicit record lock affects both the record and the gap before it.
An implicit x-lock does not affect the gap, it only locks the index
record from read or update.
@@ -414,9 +391,6 @@ static const byte lock_strength_matrix[5][5] = {
/* AI */ { FALSE, FALSE, FALSE, FALSE, TRUE}
};
-/** Maximum depth of the DFS stack. */
-static const ulint MAX_STACK_SIZE = 4096;
-
#define PRDT_HEAPNO PAGE_HEAP_NO_INFIMUM
/** Record locking request status */
enum lock_rec_req_status {
@@ -434,15 +408,6 @@ static const ulint lock_types = UT_ARR_SIZE(lock_compatibility_matrix);
#endif /* UNIV_DEBUG */
/*********************************************************************//**
-Gets the type of a lock.
-@return LOCK_TABLE or LOCK_REC */
-UNIV_INLINE
-ulint
-lock_get_type_low(
-/*==============*/
- const lock_t* lock); /*!< in: lock */
-
-/*********************************************************************//**
Gets the previous record lock set on a record.
@return previous lock on the same record, NULL if none exists */
const lock_t*
@@ -452,14 +417,6 @@ lock_rec_get_prev(
ulint heap_no);/*!< in: heap number of the record */
/*********************************************************************//**
-Cancels a waiting lock request and releases possible other transactions
-waiting behind it. */
-void
-lock_cancel_waiting_and_release(
-/*============================*/
- lock_t* lock); /*!< in/out: waiting lock request */
-
-/*********************************************************************//**
Checks if some transaction has an implicit x-lock on a record in a clustered
index.
@return transaction id of the transaction which has the x-lock, or 0 */
@@ -502,7 +459,7 @@ lock_rec_get_n_bits(
/**********************************************************************//**
Sets the nth bit of a record lock to TRUE. */
-UNIV_INLINE
+inline
void
lock_rec_set_nth_bit(
/*=================*/
@@ -515,7 +472,13 @@ lock_rec_set_nth_bit(
@return previous value of the bit */
inline byte lock_rec_reset_nth_bit(lock_t* lock, ulint i)
{
- ut_ad(lock_get_type_low(lock) == LOCK_REC);
+ ut_ad(!lock->is_table());
+#ifdef SUX_LOCK_GENERIC
+ ut_ad(lock_sys.is_writer() || lock->trx->mutex_is_owner());
+#else
+ ut_ad(lock_sys.is_writer() || lock->trx->mutex_is_owner()
+ || (xtest() && !lock->trx->mutex_is_locked()));
+#endif
ut_ad(i < lock->un_member.rec_lock.n_bits);
byte* b = reinterpret_cast<byte*>(&lock[1]) + (i >> 3);
@@ -524,8 +487,9 @@ inline byte lock_rec_reset_nth_bit(lock_t* lock, ulint i)
*b &= byte(~mask);
if (bit != 0) {
- ut_ad(lock->trx->lock.n_rec_locks > 0);
- --lock->trx->lock.n_rec_locks;
+ ut_d(auto n=)
+ lock->trx->lock.n_rec_locks--;
+ ut_ad(n);
}
return(bit);
@@ -560,25 +524,26 @@ lock_rec_get_next_const(
ulint heap_no,/*!< in: heap number of the record */
const lock_t* lock); /*!< in: lock */
-/*********************************************************************//**
-Gets the first explicit lock request on a record.
-@return first lock, NULL if none exists */
-UNIV_INLINE
-lock_t*
-lock_rec_get_first(
-/*===============*/
- hash_table_t* hash, /*!< in: hash chain the lock on */
- const buf_block_t* block, /*!< in: block containing the record */
- ulint heap_no);/*!< in: heap number of the record */
-
-/*********************************************************************//**
-Gets the mode of a lock.
-@return mode */
-UNIV_INLINE
-enum lock_mode
-lock_get_mode(
-/*==========*/
- const lock_t* lock); /*!< in: lock */
+/** Get the first explicit lock request on a record.
+@param cell first lock hash table cell
+@param id page identifier
+@param heap_no record identifier in page
+@return first lock
+@retval nullptr if none exists */
+inline lock_t *lock_sys_t::get_first(const hash_cell_t &cell, page_id_t id,
+ ulint heap_no)
+{
+ lock_sys.assert_locked(cell);
+
+ for (lock_t *lock= static_cast<lock_t*>(cell.node); lock; lock= lock->hash)
+ {
+ ut_ad(!lock->is_table());
+ if (lock->un_member.rec_lock.page_id == id &&
+ lock_rec_get_nth_bit(lock, heap_no))
+ return lock;
+ }
+ return nullptr;
+}
/*********************************************************************//**
Calculates if lock mode 1 is compatible with lock mode 2.
@@ -601,15 +566,6 @@ lock_mode_stronger_or_eq(
enum lock_mode mode2); /*!< in: lock mode */
/*********************************************************************//**
-Gets the wait flag of a lock.
-@return LOCK_WAIT if waiting, 0 if not */
-UNIV_INLINE
-ulint
-lock_get_wait(
-/*==========*/
- const lock_t* lock); /*!< in: lock */
-
-/*********************************************************************//**
Checks if a transaction has the specified table lock, or stronger. This
function should only be called by the thread that owns the transaction.
@return lock or NULL */
@@ -621,33 +577,6 @@ lock_table_has(
const dict_table_t* table, /*!< in: table */
enum lock_mode mode); /*!< in: lock mode */
-/** Set the wait status of a lock.
-@param[in,out] lock lock that will be waited for
-@param[in,out] trx transaction that will wait for the lock */
-inline void lock_set_lock_and_trx_wait(lock_t* lock, trx_t* trx)
-{
- ut_ad(lock);
- ut_ad(lock->trx == trx);
- ut_ad(trx->lock.wait_lock == NULL);
- ut_ad(lock_mutex_own());
- ut_ad(trx_mutex_own(trx));
-
- trx->lock.wait_lock = lock;
- lock->type_mode |= LOCK_WAIT;
-}
-
-/** Reset the wait status of a lock.
-@param[in,out] lock lock that was possibly being waited for */
-inline void lock_reset_lock_and_trx_wait(lock_t* lock)
-{
- ut_ad(lock_get_wait(lock));
- ut_ad(lock_mutex_own());
- ut_ad(lock->trx->lock.wait_lock == NULL
- || lock->trx->lock.wait_lock == lock);
- lock->trx->lock.wait_lock = NULL;
- lock->type_mode &= ~LOCK_WAIT;
-}
-
#include "lock0priv.inl"
#endif /* lock0priv_h */
diff --git a/storage/innobase/include/lock0priv.inl b/storage/innobase/include/lock0priv.inl
index e16949a4917..3b4ebcc835b 100644
--- a/storage/innobase/include/lock0priv.inl
+++ b/storage/innobase/include/lock0priv.inl
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 2007, 2014, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2018, 2020, MariaDB Corporation.
+Copyright (c) 2018, 2022, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -36,20 +36,6 @@ methods but they are used only in that file. */
#include "row0row.h"
/*********************************************************************//**
-Gets the type of a lock.
-@return LOCK_TABLE or LOCK_REC */
-UNIV_INLINE
-ulint
-lock_get_type_low(
-/*==============*/
- const lock_t* lock) /*!< in: lock */
-{
- ut_ad(lock);
-
- return(lock->type_mode & LOCK_TYPE_MASK);
-}
-
-/*********************************************************************//**
Checks if some transaction has an implicit x-lock on a record in a clustered
index.
@return transaction id of the transaction which has the x-lock, or 0 */
@@ -81,7 +67,7 @@ lock_rec_get_n_bits(
/**********************************************************************//**
Sets the nth bit of a record lock to TRUE. */
-UNIV_INLINE
+inline
void
lock_rec_set_nth_bit(
/*=================*/
@@ -91,8 +77,7 @@ lock_rec_set_nth_bit(
ulint byte_index;
ulint bit_index;
- ut_ad(lock);
- ut_ad(lock_get_type_low(lock) == LOCK_REC);
+ ut_ad(!lock->is_table());
ut_ad(i < lock->un_member.rec_lock.n_bits);
byte_index = i / 8;
@@ -106,7 +91,13 @@ lock_rec_set_nth_bit(
#if defined __GNUC__ && !defined __clang__ && __GNUC__ < 6
# pragma GCC diagnostic pop
#endif
- ++lock->trx->lock.n_rec_locks;
+#ifdef SUX_LOCK_GENERIC
+ ut_ad(lock_sys.is_writer() || lock->trx->mutex_is_owner());
+#else
+ ut_ad(lock_sys.is_writer() || lock->trx->mutex_is_owner()
+ || (xtest() && !lock->trx->mutex_is_locked()));
+#endif
+ lock->trx->lock.n_rec_locks++;
}
/*********************************************************************//**
@@ -118,7 +109,7 @@ lock_rec_get_next_on_page(
/*======================*/
lock_t* lock) /*!< in: a record lock */
{
- return((lock_t*) lock_rec_get_next_on_page_const(lock));
+ return const_cast<lock_t*>(lock_rec_get_next_on_page_const(lock));
}
/*********************************************************************//**
@@ -131,10 +122,7 @@ lock_rec_get_next(
ulint heap_no,/*!< in: heap number of the record */
lock_t* lock) /*!< in: lock */
{
- ut_ad(lock_mutex_own());
-
do {
- ut_ad(lock_get_type_low(lock) == LOCK_REC);
lock = lock_rec_get_next_on_page(lock);
} while (lock && !lock_rec_get_nth_bit(lock, heap_no));
@@ -151,25 +139,7 @@ lock_rec_get_next_const(
ulint heap_no,/*!< in: heap number of the record */
const lock_t* lock) /*!< in: lock */
{
- return(lock_rec_get_next(heap_no, (lock_t*) lock));
-}
-
-/*********************************************************************//**
-Gets the first explicit lock request on a record.
-@return first lock, NULL if none exists */
-UNIV_INLINE
-lock_t*
-lock_rec_get_first(
-/*===============*/
- hash_table_t* hash, /*!< in: hash chain the lock on */
- const buf_block_t* block, /*!< in: block containing the record */
- ulint heap_no)/*!< in: heap number of the record */
-{
- for (lock_t *lock= lock_sys.get_first(*hash, block->page.id());
- lock; lock= lock_rec_get_next_on_page(lock))
- if (lock_rec_get_nth_bit(lock, heap_no))
- return lock;
- return nullptr;
+ return lock_rec_get_next(heap_no, const_cast<lock_t*>(lock));
}
/*********************************************************************//**
@@ -184,8 +154,7 @@ lock_rec_get_nth_bit(
{
const byte* b;
- ut_ad(lock);
- ut_ad(lock_get_type_low(lock) == LOCK_REC);
+ ut_ad(!lock->is_table());
if (i >= lock->un_member.rec_lock.n_bits) {
@@ -206,10 +175,9 @@ lock_rec_get_next_on_page_const(
/*============================*/
const lock_t* lock) /*!< in: a record lock */
{
- ut_ad(lock_mutex_own());
- ut_ad(lock_get_type_low(lock) == LOCK_REC);
+ ut_ad(!lock->is_table());
- const page_id_t page_id(lock->un_member.rec_lock.page_id);
+ const page_id_t page_id{lock->un_member.rec_lock.page_id};
while (!!(lock= static_cast<const lock_t*>(HASH_GET_NEXT(hash, lock))))
if (lock->un_member.rec_lock.page_id == page_id)
@@ -218,20 +186,6 @@ lock_rec_get_next_on_page_const(
}
/*********************************************************************//**
-Gets the mode of a lock.
-@return mode */
-UNIV_INLINE
-enum lock_mode
-lock_get_mode(
-/*==========*/
- const lock_t* lock) /*!< in: lock */
-{
- ut_ad(lock);
-
- return(static_cast<enum lock_mode>(lock->type_mode & LOCK_MODE_MASK));
-}
-
-/*********************************************************************//**
Calculates if lock mode 1 is compatible with lock mode 2.
@return nonzero if mode1 compatible with mode2 */
UNIV_INLINE
@@ -264,20 +218,6 @@ lock_mode_stronger_or_eq(
}
/*********************************************************************//**
-Gets the wait flag of a lock.
-@return LOCK_WAIT if waiting, 0 if not */
-UNIV_INLINE
-ulint
-lock_get_wait(
-/*==========*/
- const lock_t* lock) /*!< in: lock */
-{
- ut_ad(lock);
-
- return(lock->type_mode & LOCK_WAIT);
-}
-
-/*********************************************************************//**
Checks if a transaction has the specified table lock, or stronger. This
function should only be called by the thread that owns the transaction.
@return lock or NULL */
@@ -300,22 +240,16 @@ lock_table_has(
continue;
}
- lock_mode mode = lock_get_mode(lock);
-
ut_ad(trx == lock->trx);
- ut_ad(lock_get_type_low(lock) & LOCK_TABLE);
- ut_ad(lock->un_member.tab_lock.table != NULL);
+ ut_ad(lock->is_table());
+ ut_ad(lock->un_member.tab_lock.table);
if (table == lock->un_member.tab_lock.table
- && lock_mode_stronger_or_eq(mode, in_mode)) {
-
- ut_ad(!lock_get_wait(lock));
-
+ && lock_mode_stronger_or_eq(lock->mode(), in_mode)) {
+ ut_ad(!lock->is_waiting());
return(lock);
}
}
return(NULL);
}
-
-/* vim: set filetype=c: */
diff --git a/storage/innobase/include/lock0types.h b/storage/innobase/include/lock0types.h
index 23307375426..dc57a31c5f8 100644
--- a/storage/innobase/include/lock0types.h
+++ b/storage/innobase/include/lock0types.h
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 1996, 2015, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2018, 2020, MariaDB Corporation.
+Copyright (c) 2018, 2021, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -46,35 +46,9 @@ enum lock_mode {
in an exclusive mode */
LOCK_NONE, /* this is used elsewhere to note consistent read */
LOCK_NUM = LOCK_NONE, /* number of lock modes */
- LOCK_NONE_UNSET = 255
+ LOCK_NONE_UNSET = 7
};
-/** Convert the given enum value into string.
-@param[in] mode the lock mode
-@return human readable string of the given enum value */
-inline
-const char* lock_mode_string(enum lock_mode mode)
-{
- switch (mode) {
- case LOCK_IS:
- return("LOCK_IS");
- case LOCK_IX:
- return("LOCK_IX");
- case LOCK_S:
- return("LOCK_S");
- case LOCK_X:
- return("LOCK_X");
- case LOCK_AUTO_INC:
- return("LOCK_AUTO_INC");
- case LOCK_NONE:
- return("LOCK_NONE");
- case LOCK_NONE_UNSET:
- return("LOCK_NONE_UNSET");
- default:
- ut_error;
- }
-}
-
/** A table lock */
struct lock_table_t {
dict_table_t* table; /*!< database table in dictionary
@@ -121,17 +95,12 @@ operator<<(std::ostream& out, const lock_rec_t& lock)
return(lock.print(out));
}
-#define LOCK_MODE_MASK 0xFUL /*!< mask used to extract mode from the
+#define LOCK_MODE_MASK 0x7 /*!< mask used to extract mode from the
type_mode field in a lock */
/** Lock types */
/* @{ */
-#define LOCK_TABLE 16U /*!< table lock */
-#define LOCK_REC 32U /*!< record lock */
-#define LOCK_TYPE_MASK 0xF0UL /*!< mask used to extract lock type from the
- type_mode field in a lock */
-#if LOCK_MODE_MASK & LOCK_TYPE_MASK
-# error "LOCK_MODE_MASK & LOCK_TYPE_MASK"
-#endif
+/** table lock (record lock if the flag is not set) */
+#define LOCK_TABLE 8U
#define LOCK_WAIT 256U /*!< Waiting lock flag; when set, it
means that the lock has not yet been
@@ -176,14 +145,14 @@ operator<<(std::ostream& out, const lock_rec_t& lock)
#endif
/* @} */
-/** Lock struct; protected by lock_sys.mutex */
+/** Lock struct; protected by lock_sys.latch */
struct ib_lock_t
{
- trx_t* trx; /*!< transaction owning the
- lock */
- UT_LIST_NODE_T(ib_lock_t)
- trx_locks; /*!< list of the locks of the
- transaction */
+ /** the owner of the lock */
+ trx_t *trx;
+ /** other locks of the transaction; protected by
+ lock_sys.is_writer() and trx->mutex_is_owner(); @see trx_lock_t::trx_locks */
+ UT_LIST_NODE_T(ib_lock_t) trx_locks;
dict_index_t* index; /*!< index for a record lock */
@@ -210,13 +179,6 @@ struct ib_lock_t
LOCK_INSERT_INTENTION,
wait flag, ORed */
- /** Determine if the lock object is a record lock.
- @return true if record lock, false otherwise. */
- bool is_record_lock() const
- {
- return(type() == LOCK_REC);
- }
-
bool is_waiting() const
{
return(type_mode & LOCK_WAIT);
@@ -237,9 +199,7 @@ struct ib_lock_t
return(type_mode & LOCK_INSERT_INTENTION);
}
- ulint type() const {
- return(type_mode & LOCK_TYPE_MASK);
- }
+ bool is_table() const { return type_mode & LOCK_TABLE; }
enum lock_mode mode() const
{
@@ -251,21 +211,8 @@ struct ib_lock_t
@return the given output stream. */
std::ostream& print(std::ostream& out) const;
- /** Convert the member 'type_mode' into a human readable string.
- @return human readable string */
- std::string type_mode_string() const;
-
const char* type_string() const
- {
- switch (type_mode & LOCK_TYPE_MASK) {
- case LOCK_REC:
- return("LOCK_REC");
- case LOCK_TABLE:
- return("LOCK_TABLE");
- default:
- ut_error;
- }
- }
+ { return is_table() ? "LOCK_TABLE" : "LOCK_REC"; }
};
typedef UT_LIST_BASE_NODE_T(ib_lock_t) trx_lock_list_t;
diff --git a/storage/innobase/include/log0crypt.h b/storage/innobase/include/log0crypt.h
index 980a79d8f9e..b9390927ece 100644
--- a/storage/innobase/include/log0crypt.h
+++ b/storage/innobase/include/log0crypt.h
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (C) 2013, 2015, Google Inc. All Rights Reserved.
-Copyright (C) 2014, 2020, MariaDB Corporation.
+Copyright (C) 2014, 2021, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -38,19 +38,13 @@ The random parameters will be persisted in the log checkpoint pages.
@see log_crypt_write_checkpoint_buf()
@see log_crypt_read_checkpoint_buf()
@return whether the operation succeeded */
-UNIV_INTERN
-bool
-log_crypt_init();
+bool log_crypt_init();
/*********************************************************************//**
Writes the crypto (version, msg and iv) info, which has been used for
log blocks with lsn <= this checkpoint's lsn, to a log header's
checkpoint buf. */
-UNIV_INTERN
-void
-log_crypt_write_checkpoint_buf(
-/*===========================*/
- byte* buf); /*!< in/out: checkpoint buffer */
+void log_crypt_write_checkpoint_buf(byte *buf);
/** Read the MariaDB 10.1 checkpoint crypto (version, msg and iv) info.
@param[in] buf checkpoint buffer
@@ -93,9 +87,7 @@ bool log_crypt(byte* buf, lsn_t lsn, ulint size, log_crypt_t op = LOG_ENCRYPT);
@param[in] offs offset to block
@param[in] encrypt true=encrypt; false=decrypt
@return whether the operation succeeded */
-UNIV_INTERN
-bool
-log_tmp_block_encrypt(
+bool log_tmp_block_encrypt(
const byte* src,
ulint size,
byte* dst,
diff --git a/storage/innobase/include/log0log.h b/storage/innobase/include/log0log.h
index 4a5567ff62d..0f9a4da049b 100644
--- a/storage/innobase/include/log0log.h
+++ b/storage/innobase/include/log0log.h
@@ -77,7 +77,7 @@ log_reserve_and_write_fast(
Checks if there is need for a log buffer flush or a new checkpoint, and does
this if yes. Any database operation should call this when it has modified
more than about 4 pages. NOTE that this function may only be called when the
-OS thread owns no synchronization objects except the dictionary mutex. */
+OS thread owns no synchronization objects except dict_sys.latch. */
UNIV_INLINE
void
log_free_check(void);
@@ -97,15 +97,21 @@ bool
log_set_capacity(ulonglong file_size)
MY_ATTRIBUTE((warn_unused_result));
-/** Ensure that the log has been written to the log file up to a given
+/**
+Ensure that the log has been written to the log file up to a given
log entry (such as that of a transaction commit). Start a new write, or
wait and check if an already running write is covering the request.
@param[in] lsn log sequence number that should be
included in the redo log file write
@param[in] flush_to_disk whether the written log should also
be flushed to the file system
-@param[in] rotate_key whether to rotate the encryption key */
-void log_write_up_to(lsn_t lsn, bool flush_to_disk, bool rotate_key = false);
+@param[in] rotate_key whether to rotate the encryption key
+@param[in] cb completion callback. If not NULL, the callback will be called
+ whenever lsn is written or flushed.
+*/
+struct completion_callback;
+void log_write_up_to(lsn_t lsn, bool flush_to_disk, bool rotate_key = false,
+ const completion_callback* cb=nullptr);
/** Write to the log file up to the last log entry.
@param sync whether to wait for a durable write to complete */
@@ -445,7 +451,7 @@ struct log_t{
private:
/** The log sequence number of the last change of durable InnoDB files */
- MY_ALIGNED(CPU_LEVEL1_DCACHE_LINESIZE)
+ alignas(CPU_LEVEL1_DCACHE_LINESIZE)
std::atomic<lsn_t> lsn;
/** the first guaranteed-durable log sequence number */
std::atomic<lsn_t> flushed_to_disk_lsn;
@@ -455,7 +461,7 @@ private:
std::atomic<bool> check_flush_or_checkpoint_;
public:
/** mutex protecting the log */
- MY_ALIGNED(CPU_LEVEL1_DCACHE_LINESIZE) mysql_mutex_t mutex;
+ alignas(CPU_LEVEL1_DCACHE_LINESIZE) mysql_mutex_t mutex;
/** first free offset within the log buffer in use */
size_t buf_free;
/** recommended maximum size of buf, after which the buffer is flushed */
@@ -469,7 +475,7 @@ public:
dirty blocks in the list. The idea behind this mutex is to be able
to release log_sys.mutex during mtr_commit and still ensure that
insertions in the flush_list happen in the LSN order. */
- MY_ALIGNED(CPU_LEVEL1_DCACHE_LINESIZE) mysql_mutex_t flush_order_mutex;
+ alignas(CPU_LEVEL1_DCACHE_LINESIZE) mysql_mutex_t flush_order_mutex;
/** log_buffer, append data here */
byte *buf;
/** log_buffer, writing data to file from this buffer.
diff --git a/storage/innobase/include/log0log.inl b/storage/innobase/include/log0log.inl
index 0ff8c2523d7..c29c0bfa55f 100644
--- a/storage/innobase/include/log0log.inl
+++ b/storage/innobase/include/log0log.inl
@@ -294,7 +294,7 @@ log_reserve_and_write_fast(
Checks if there is need for a log buffer flush or a new checkpoint, and does
this if yes. Any database operation should call this when it has modified
more than about 4 pages. NOTE that this function may only be called when the
-OS thread owns no synchronization objects except the dictionary mutex. */
+OS thread owns no synchronization objects except dict_sys.latch. */
UNIV_INLINE
void
log_free_check(void)
@@ -304,22 +304,6 @@ log_free_check(void)
are holding some latches. This is OK, as long as we are not holding
any latches on buffer blocks. */
-#ifdef UNIV_DEBUG
- static const latch_level_t latches[] = {
- SYNC_REDO_RSEG, /* trx_purge_free_segment() */
- SYNC_DICT, /* dict_sys.mutex during
- commit_try_rebuild() */
- SYNC_DICT_OPERATION, /* dict_sys.latch X-latch during
- commit_try_rebuild() */
- SYNC_FTS_CACHE, /* fts_cache_t::lock */
- SYNC_INDEX_TREE /* index->lock */
- };
-#endif /* UNIV_DEBUG */
-
- ut_ad(!sync_check_iterate(
- sync_allowed_latches(latches,
- latches + UT_ARR_SIZE(latches))));
-
if (log_sys.check_flush_or_checkpoint()) {
log_check_margins();
diff --git a/storage/innobase/include/log0recv.h b/storage/innobase/include/log0recv.h
index e4a8f0d25a2..5e8dc1c0160 100644
--- a/storage/innobase/include/log0recv.h
+++ b/storage/innobase/include/log0recv.h
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 1997, 2016, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2017, 2021, MariaDB Corporation.
+Copyright (c) 2017, 2022, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -26,12 +26,13 @@ Created 9/20/1997 Heikki Tuuri
#pragma once
-#include "ut0byte.h"
+#include "ut0new.h"
#include "buf0types.h"
#include "log0log.h"
#include "mtr0types.h"
#include <deque>
+#include <map>
/** @return whether recovery is currently running. */
#define recv_recovery_is_on() UNIV_UNLIKELY(recv_sys.recovery_on)
@@ -43,11 +44,12 @@ dberr_t
recv_find_max_checkpoint(ulint* max_field)
MY_ATTRIBUTE((nonnull, warn_unused_result));
+ATTRIBUTE_COLD MY_ATTRIBUTE((nonnull, warn_unused_result))
/** Apply any buffered redo log to a page that was just read from a data file.
@param[in,out] space tablespace
-@param[in,out] bpage buffer pool page */
-ATTRIBUTE_COLD void recv_recover_page(fil_space_t* space, buf_page_t* bpage)
- MY_ATTRIBUTE((nonnull));
+@param[in,out] bpage buffer pool page
+@return whether the page was recovered correctly */
+bool recv_recover_page(fil_space_t* space, buf_page_t* bpage);
/** Start recovering from a redo log checkpoint.
@param[in] flush_lsn FIL_PAGE_FILE_FLUSH_LSN
@@ -81,12 +83,12 @@ void recv_sys_justify_left_parsing_buf();
/** Report an operation to create, delete, or rename a file during backup.
@param[in] space_id tablespace identifier
-@param[in] create whether the file is being created
+@param[in] type file operation redo log type
@param[in] name file name (not NUL-terminated)
@param[in] len length of name, in bytes
@param[in] new_name new file name (NULL if not rename)
@param[in] new_len length of new_name, in bytes (0 if NULL) */
-extern void (*log_file_op)(ulint space_id, bool create,
+extern void (*log_file_op)(ulint space_id, int type,
const byte* name, ulint len,
const byte* new_name, ulint new_len);
@@ -95,6 +97,10 @@ during backup
@param space_id undo tablespace identifier */
extern void (*undo_space_trunc)(uint32_t space_id);
+/** Report an operation which does INIT_PAGE for page0 during backup.
+@param space_id tablespace identifier */
+extern void (*first_page_init)(ulint space_id);
+
/** Stored redo log record */
struct log_rec_t
{
@@ -213,14 +219,25 @@ struct page_recv_t
struct recv_sys_t
{
/** mutex protecting apply_log_recs and page_recv_t::state */
- ib_mutex_t mutex;
+ mysql_mutex_t mutex;
+private:
+ /** condition variable for
+ !apply_batch_on || pages.empty() || found_corrupt_log || found_corrupt_fs */
+ pthread_cond_t cond;
+ /** whether recv_apply_hashed_log_recs() is running */
+ bool apply_batch_on;
+ /** set when finding a corrupt log block or record, or there is a
+ log parsing buffer overflow */
+ bool found_corrupt_log;
+ /** set when an inconsistency with the file system contents is detected
+ during log scan or apply */
+ bool found_corrupt_fs;
+public:
/** whether we are applying redo log records during crash recovery */
bool recovery_on;
- /** whether recv_recover_page(), invoked from buf_page_read_complete(),
+ /** whether recv_recover_page(), invoked from buf_page_t::read_complete(),
should apply log records*/
bool apply_log_recs;
- /** whether apply() is running */
- bool apply_batch_on;
byte* buf; /*!< buffer for parsing log records */
ulint len; /*!< amount of data in buf */
lsn_t parse_start_lsn;
@@ -240,14 +257,6 @@ struct recv_sys_t
lsn_t recovered_lsn;
/*!< the log records have been parsed up to
this lsn */
- bool found_corrupt_log;
- /*!< set when finding a corrupt log
- block or record, or there is a log
- parsing buffer overflow */
- bool found_corrupt_fs;
- /*!< set when an inconsistency with
- the file system contents is detected
- during log scan or apply */
lsn_t mlog_checkpoint_lsn;
/*!< the LSN of a FILE_CHECKPOINT
record, or 0 if none was parsed */
@@ -293,13 +302,16 @@ private:
@param p iterator pointing to page_id
@param mtr mini-transaction
@param b pre-allocated buffer pool block
- @return whether the page was successfully initialized */
+ @return the recovered block
+ @retval nullptr if the page cannot be initialized based on log records
+ @retval -1 if the page cannot be recovered due to corruption */
inline buf_block_t *recover_low(const page_id_t page_id, map::iterator &p,
mtr_t &mtr, buf_block_t *b);
/** Attempt to initialize a page based on redo log records.
@param page_id page identifier
@return the recovered block
- @retval nullptr if the page cannot be initialized based on log records */
+ @retval nullptr if the page cannot be initialized based on log records
+ @retval -1 if the page cannot be recovered due to corruption */
buf_block_t *recover_low(const page_id_t page_id);
/** All found log files (multiple ones are possible if we are upgrading
@@ -386,14 +398,36 @@ public:
@param page_id corrupted page identifier */
ATTRIBUTE_COLD void free_corrupted_page(page_id_t page_id);
+ /** Flag data file corruption during recovery. */
+ ATTRIBUTE_COLD void set_corrupt_fs();
+ /** Flag log file corruption during recovery. */
+ ATTRIBUTE_COLD void set_corrupt_log();
+ /** Possibly finish a recovery batch. */
+ inline void maybe_finish_batch();
+
+ /** @return whether data file corruption was found */
+ bool is_corrupt_fs() const { return UNIV_UNLIKELY(found_corrupt_fs); }
+ /** @return whether log file corruption was found */
+ bool is_corrupt_log() const { return UNIV_UNLIKELY(found_corrupt_log); }
+
/** Attempt to initialize a page based on redo log records.
@param page_id page identifier
@return the recovered block
- @retval nullptr if the page cannot be initialized based on log records */
+ @retval nullptr if the page cannot be initialized based on log records
+ @retval -1 if the page cannot be recovered due to corruption */
buf_block_t *recover(const page_id_t page_id)
{
return UNIV_UNLIKELY(recovery_on) ? recover_low(page_id) : nullptr;
}
+
+ /** Try to recover a tablespace that was not readable earlier
+ @param p iterator, initially pointing to page_id_t{space_id,0};
+ the records will be freed and the iterator advanced
+ @param name tablespace file name
+ @param free_block spare buffer block
+ @return whether recovery failed */
+ bool recover_deferred(map::iterator &p, const std::string &name,
+ buf_block_t *&free_block);
};
/** The recovery system */
diff --git a/storage/innobase/include/mach0data.inl b/storage/innobase/include/mach0data.inl
index bfccf611991..2f970fd27f0 100644
--- a/storage/innobase/include/mach0data.inl
+++ b/storage/innobase/include/mach0data.inl
@@ -28,6 +28,7 @@ Created 11/28/1995 Heikki Tuuri
#ifndef UNIV_INNOCHECKSUM
#include "mtr0types.h"
+#include "ut0byte.h"
/*******************************************************//**
The following function is used to store data in one byte. */
diff --git a/storage/innobase/include/mem0mem.inl b/storage/innobase/include/mem0mem.inl
index 9236bbef05d..9906daf3eb9 100644
--- a/storage/innobase/include/mem0mem.inl
+++ b/storage/innobase/include/mem0mem.inl
@@ -24,6 +24,8 @@ The memory management
Created 6/8/1994 Heikki Tuuri
*************************************************************************/
+#include "ut0new.h"
+
#ifdef UNIV_DEBUG
# define mem_heap_create_block(heap, n, type, file_name, line) \
mem_heap_create_block_func(heap, n, file_name, line, type)
diff --git a/storage/innobase/include/mtr0log.h b/storage/innobase/include/mtr0log.h
index 285672be898..d34a62e7bb2 100644
--- a/storage/innobase/include/mtr0log.h
+++ b/storage/innobase/include/mtr0log.h
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (c) 2019, 2022, MariaDB Corporation.
+Copyright (c) 2019, 2023, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -173,7 +173,7 @@ inline uint32_t mlog_decode_len(const byte *log, const byte *end)
template<unsigned l,mtr_t::write_type w,typename V>
inline bool mtr_t::write(const buf_block_t &block, void *ptr, V val)
{
- ut_ad(ut_align_down(ptr, srv_page_size) == block.frame);
+ ut_ad(ut_align_down(ptr, srv_page_size) == block.page.frame);
static_assert(l == 1 || l == 2 || l == 4 || l == 8, "wrong length");
byte buf[l];
@@ -196,7 +196,7 @@ inline bool mtr_t::write(const buf_block_t &block, void *ptr, V val)
}
byte *p= static_cast<byte*>(ptr);
const byte *const end= p + l;
- if (w != FORCED && m_log_mode == MTR_LOG_ALL)
+ if (w != FORCED && is_logged())
{
const byte *b= buf;
while (*p++ == *b++)
@@ -224,7 +224,7 @@ inline void mtr_t::memset(const buf_block_t &b, ulint ofs, ulint len, byte val)
{
ut_ad(len);
set_modified(b);
- if (m_log_mode != MTR_LOG_ALL)
+ if (!is_logged())
return;
static_assert(MIN_4BYTE > UNIV_PAGE_SIZE_MAX, "consistency");
@@ -245,7 +245,7 @@ inline void mtr_t::memset(const buf_block_t *b, ulint ofs, ulint len, byte val)
{
ut_ad(ofs <= ulint(srv_page_size));
ut_ad(ofs + len <= ulint(srv_page_size));
- ::memset(ofs + b->frame, val, len);
+ ::memset(ofs + b->page.frame, val, len);
memset(*b, ofs, len, val);
}
@@ -261,7 +261,7 @@ inline void mtr_t::memset(const buf_block_t &b, ulint ofs, size_t len,
ut_ad(size);
ut_ad(len > size); /* use mtr_t::memcpy() for shorter writes */
set_modified(b);
- if (m_log_mode != MTR_LOG_ALL)
+ if (!is_logged())
return;
static_assert(MIN_4BYTE > UNIV_PAGE_SIZE_MAX, "consistency");
@@ -289,10 +289,10 @@ inline void mtr_t::memset(const buf_block_t *b, ulint ofs, size_t len,
size_t s= 0;
while (s < len)
{
- ::memcpy(ofs + s + b->frame, str, size);
+ ::memcpy(ofs + s + b->page.frame, str, size);
s+= len;
}
- ::memcpy(ofs + s + b->frame, str, len - s);
+ ::memcpy(ofs + s + b->page.frame, str, len - s);
memset(*b, ofs, len, str, size);
}
@@ -306,7 +306,7 @@ inline void mtr_t::memcpy(const buf_block_t &b, ulint offset, ulint len)
ut_ad(len);
ut_ad(offset <= ulint(srv_page_size));
ut_ad(offset + len <= ulint(srv_page_size));
- memcpy_low(b, uint16_t(offset), &b.frame[offset], len);
+ memcpy_low(b, uint16_t(offset), &b.page.frame[offset], len);
}
/** Log a write of a byte string to a page.
@@ -319,7 +319,7 @@ inline void mtr_t::memcpy_low(const buf_block_t &block, uint16_t offset,
{
ut_ad(len);
set_modified(block);
- if (m_log_mode != MTR_LOG_ALL)
+ if (!is_logged())
return;
if (len < mtr_buf_t::MAX_DATA_SIZE - (1 + 3 + 3 + 5 + 5))
{
@@ -354,7 +354,7 @@ inline void mtr_t::memmove(const buf_block_t &b, ulint d, ulint s, ulint len)
ut_ad(d + len <= ulint(srv_page_size));
set_modified(b);
- if (m_log_mode != MTR_LOG_ALL)
+ if (!is_logged())
return;
static_assert(MIN_4BYTE > UNIV_PAGE_SIZE_MAX, "consistency");
size_t lenlen= (len < MIN_2BYTE ? 1 : len < MIN_3BYTE ? 2 : 3);
@@ -387,7 +387,7 @@ template<byte type>
inline byte *mtr_t::log_write(const page_id_t id, const buf_page_t *bpage,
size_t len, bool alloc, size_t offset)
{
- static_assert(!(type & 15) && type != RESERVED && type != OPTION &&
+ static_assert(!(type & 15) && type != RESERVED &&
type <= FILE_CHECKPOINT, "invalid type");
ut_ad(type >= FILE_CREATE || is_named_space(id.space()));
ut_ad(!bpage || bpage->id() == id);
@@ -401,7 +401,8 @@ inline byte *mtr_t::log_write(const page_id_t id, const buf_page_t *bpage,
ut_ad(have_offset || offset == 0);
ut_ad(offset + len <= srv_page_size);
static_assert(MIN_4BYTE >= UNIV_PAGE_SIZE_MAX, "consistency");
-
+ ut_ad(type == FREE_PAGE || type == OPTION || (type == EXTENDED && !bpage) ||
+ memo_contains_flagged(bpage, MTR_MEMO_MODIFY));
size_t max_len;
if (!have_len)
max_len= 1 + 5 + 5;
@@ -488,10 +489,10 @@ template<mtr_t::write_type w>
inline void mtr_t::memcpy(const buf_block_t &b, void *dest, const void *str,
ulint len)
{
- ut_ad(ut_align_down(dest, srv_page_size) == b.frame);
+ ut_ad(ut_align_down(dest, srv_page_size) == b.page.frame);
char *d= static_cast<char*>(dest);
const char *s= static_cast<const char*>(str);
- if (w != FORCED && m_log_mode == MTR_LOG_ALL)
+ if (w != FORCED && is_logged())
{
ut_ad(len);
const char *const end= d + len;
@@ -511,55 +512,13 @@ inline void mtr_t::memcpy(const buf_block_t &b, void *dest, const void *str,
memcpy(b, ut_align_offset(d, srv_page_size), len);
}
-/** Initialize an entire page.
-@param[in,out] b buffer page */
-inline void mtr_t::init(buf_block_t *b)
-{
- const page_id_t id{b->page.id()};
- ut_ad(is_named_space(id.space()));
- ut_ad(!m_freed_pages == !m_freed_space);
-
- if (UNIV_LIKELY_NULL(m_freed_space) &&
- m_freed_space->id == id.space() &&
- m_freed_pages->remove_if_exists(b->page.id().page_no()) &&
- m_freed_pages->empty())
- {
- delete m_freed_pages;
- m_freed_pages= nullptr;
- m_freed_space= nullptr;
- }
-
- b->page.status= buf_page_t::INIT_ON_FLUSH;
-
- if (m_log_mode != MTR_LOG_ALL)
- {
- ut_ad(m_log_mode == MTR_LOG_NONE || m_log_mode == MTR_LOG_NO_REDO);
- return;
- }
-
- m_log.close(log_write<INIT_PAGE>(b->page.id(), &b->page));
- m_last_offset= FIL_PAGE_TYPE;
-}
-
-/** Free a page.
-@param[in] space tablespace contains page to be freed
-@param[in] offset page offset to be freed */
-inline void mtr_t::free(fil_space_t &space, uint32_t offset)
-{
- ut_ad(is_named_space(&space));
- ut_ad(!m_freed_space || m_freed_space == &space);
-
- if (m_log_mode == MTR_LOG_ALL)
- m_log.close(log_write<FREE_PAGE>({space.id, offset}, nullptr));
-}
-
/** Write an EXTENDED log record.
@param block buffer pool page
@param type extended record subtype; @see mrec_ext_t */
inline void mtr_t::log_write_extended(const buf_block_t &block, byte type)
{
set_modified(block);
- if (m_log_mode != MTR_LOG_ALL)
+ if (!is_logged())
return;
byte *l= log_write<EXTENDED>(block.page.id(), &block.page, 1, true);
*l++= type;
@@ -586,7 +545,7 @@ inline void mtr_t::page_delete(const buf_block_t &block, ulint prev_rec)
ut_ad(!block.zip_size());
ut_ad(prev_rec < block.physical_size());
set_modified(block);
- if (m_log_mode != MTR_LOG_ALL)
+ if (!is_logged())
return;
size_t len= (prev_rec < MIN_2BYTE ? 2 : prev_rec < MIN_3BYTE ? 3 : 4);
byte *l= log_write<EXTENDED>(block.page.id(), &block.page, len, true);
@@ -613,7 +572,7 @@ inline void mtr_t::page_delete(const buf_block_t &block, ulint prev_rec,
ut_ad(hdr_size < MIN_3BYTE);
ut_ad(prev_rec < block.physical_size());
ut_ad(data_size < block.physical_size());
- if (m_log_mode != MTR_LOG_ALL)
+ if (!is_logged())
return;
size_t len= prev_rec < MIN_2BYTE ? 2 : prev_rec < MIN_3BYTE ? 3 : 4;
len+= hdr_size < MIN_2BYTE ? 1 : 2;
@@ -645,7 +604,7 @@ inline void mtr_t::undo_append(const buf_block_t &block,
{
ut_ad(len > 2);
set_modified(block);
- if (m_log_mode != MTR_LOG_ALL)
+ if (!is_logged())
return;
const bool small= len + 1 < mtr_buf_t::MAX_DATA_SIZE - (1 + 3 + 3 + 5 + 5);
byte *end= log_write<EXTENDED>(block.page.id(), &block.page, len + 1, small);
@@ -668,7 +627,7 @@ inline void mtr_t::undo_append(const buf_block_t &block,
@param id first page identifier that will not be in the file */
inline void mtr_t::trim_pages(const page_id_t id)
{
- if (m_log_mode != MTR_LOG_ALL)
+ if (!is_logged())
return;
byte *l= log_write<EXTENDED>(id, nullptr, 1, true);
*l++= TRIM_PAGES;
diff --git a/storage/innobase/include/mtr0mtr.h b/storage/innobase/include/mtr0mtr.h
index b64dccb887f..1c044319ca0 100644
--- a/storage/innobase/include/mtr0mtr.h
+++ b/storage/innobase/include/mtr0mtr.h
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 1995, 2017, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2013, 2022, MariaDB Corporation.
+Copyright (c) 2013, 2023, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -24,11 +24,12 @@ Mini-transaction buffer
Created 11/26/1995 Heikki Tuuri
*******************************************************/
-#ifndef mtr0mtr_h
-#define mtr0mtr_h
+#pragma once
#include "fil0fil.h"
#include "dyn0buf.h"
+#include "buf0buf.h"
+#include "small_vector.h"
/** Start a mini-transaction. */
#define mtr_start(m) (m)->start()
@@ -36,73 +37,73 @@ Created 11/26/1995 Heikki Tuuri
/** Commit a mini-transaction. */
#define mtr_commit(m) (m)->commit()
-/** Set and return a savepoint in mtr.
-@return savepoint */
-#define mtr_set_savepoint(m) (m)->get_savepoint()
-
-/** Release the (index tree) s-latch stored in an mtr memo after a
-savepoint. */
-#define mtr_release_s_latch_at_savepoint(m, s, l) \
- (m)->release_s_latch_at_savepoint((s), (l))
-
/** Change the logging mode of a mini-transaction.
@return old mode */
#define mtr_set_log_mode(m, d) (m)->set_log_mode((d))
-/** Release an object in the memo stack.
-@return true if released */
-#define mtr_memo_release(m, o, t) \
- (m)->memo_release((o), (t))
-
-/** Push an object to an mtr memo stack. */
-#define mtr_memo_push(m, o, t) (m)->memo_push(o, t)
-
-#define mtr_x_lock_space(s, m) (m)->x_lock_space((s), __FILE__, __LINE__)
-#define mtr_sx_lock_space(s, m) (m)->sx_lock_space((s), __FILE__, __LINE__)
-
-#define mtr_s_lock_index(i, m) (m)->s_lock(&(i)->lock, __FILE__, __LINE__)
-#define mtr_x_lock_index(i, m) (m)->x_lock(&(i)->lock, __FILE__, __LINE__)
-#define mtr_sx_lock_index(i, m) (m)->sx_lock(&(i)->lock, __FILE__, __LINE__)
-
-#define mtr_release_block_at_savepoint(m, s, b) \
- (m)->release_block_at_savepoint((s), (b))
-
-#define mtr_block_sx_latch_at_savepoint(m, s, b) \
- (m)->sx_latch_at_savepoint((s), (b))
-
-#define mtr_block_x_latch_at_savepoint(m, s, b) \
- (m)->x_latch_at_savepoint((s), (b))
+#ifdef UNIV_PFS_RWLOCK
+# define mtr_s_lock_index(i,m) (m)->s_lock(__FILE__, __LINE__, &(i)->lock)
+# define mtr_x_lock_index(i,m) (m)->x_lock(__FILE__, __LINE__, &(i)->lock)
+# define mtr_sx_lock_index(i,m) (m)->u_lock(__FILE__, __LINE__, &(i)->lock)
+#else
+# define mtr_s_lock_index(i,m) (m)->s_lock(&(i)->lock)
+# define mtr_x_lock_index(i,m) (m)->x_lock(&(i)->lock)
+# define mtr_sx_lock_index(i,m) (m)->u_lock(&(i)->lock)
+#endif
/** Mini-transaction memo stack slot. */
-struct mtr_memo_slot_t {
- /** pointer to the object */
- void* object;
-
- /** type of the stored object */
- mtr_memo_type_t type;
+struct mtr_memo_slot_t
+{
+ /** pointer to the object */
+ void *object;
+ /** type of the stored object */
+ mtr_memo_type_t type;
+
+ /** Release the object */
+ void release() const;
};
/** Mini-transaction handle and buffer */
struct mtr_t {
+ mtr_t();
+ ~mtr_t();
+
/** Start a mini-transaction. */
void start();
/** Commit the mini-transaction. */
void commit();
- /** Release latches till savepoint. To simplify the code only
- MTR_MEMO_S_LOCK and MTR_MEMO_PAGE_S_FIX slot types are allowed to be
- released, otherwise it would be neccesary to add one more argument in the
- function to point out what slot types are allowed for rollback, and this
- would be overengineering as currently the function is used only in one place
- in the code.
- @param savepoint savepoint, can be obtained with get_savepoint */
- void rollback_to_savepoint(ulint savepoint);
+ /** Release latches of unmodified buffer pages.
+ @param begin first slot to release
+ @param end last slot to release, or get_savepoint() */
+ void rollback_to_savepoint(ulint begin, ulint end);
+
+ /** Release latches of unmodified buffer pages.
+ @param begin first slot to release */
+ void rollback_to_savepoint(ulint begin)
+ { rollback_to_savepoint(begin, m_memo.size()); }
+
+ /** Release the last acquired buffer page latch. */
+ void release_last_page()
+ { auto s= m_memo.size(); rollback_to_savepoint(s - 1, s); }
/** Commit a mini-transaction that is shrinking a tablespace.
@param space tablespace that is being shrunk */
ATTRIBUTE_COLD void commit_shrink(fil_space_t &space);
+ /** Commit a mini-transaction that is deleting or renaming a file.
+ @param space tablespace that is being renamed or deleted
+ @param name new file name (nullptr=the file will be deleted)
+ @param detached_handle if detached_handle != nullptr and if space is detached
+ during the function execution, the file handle of its
+ node will be set to OS_FILE_CLOSED, and the previous
+ value of the file handle will be assigned to the
+ address pointed to by detached_handle.
+ @return whether the operation succeeded */
+ ATTRIBUTE_COLD bool commit_file(fil_space_t &space, const char *name,
+ pfs_os_file_t *detached_handle= nullptr);
+
/** Commit a mini-transaction that did not modify any pages,
but generated some redo log on a higher level, such as
FILE_MODIFY records and an optional FILE_CHECKPOINT marker.
@@ -112,35 +113,59 @@ struct mtr_t {
void commit_files(lsn_t checkpoint_lsn= 0);
/** @return mini-transaction savepoint (current size of m_memo) */
- ulint get_savepoint() const { ut_ad(is_active()); return m_memo.size(); }
-
- /** Release the (index tree) s-latch stored in an mtr memo after a
- savepoint.
- @param savepoint value returned by @see set_savepoint.
- @param lock latch to release */
- inline void release_s_latch_at_savepoint(
- ulint savepoint,
- rw_lock_t* lock);
+ ulint get_savepoint() const
+ {
+ ut_ad(is_active());
+ return m_memo.size();
+ }
- /** Release the block in an mtr memo after a savepoint. */
- inline void release_block_at_savepoint(
- ulint savepoint,
- buf_block_t* block);
+ /** Get the block at a savepoint */
+ buf_block_t *at_savepoint(ulint savepoint) const
+ {
+ ut_ad(is_active());
+ const mtr_memo_slot_t &slot= m_memo[savepoint];
+ ut_ad(slot.type < MTR_MEMO_S_LOCK);
+ ut_ad(slot.object);
+ return static_cast<buf_block_t*>(slot.object);
+ }
- /** SX-latch a not yet latched block after a savepoint. */
- inline void sx_latch_at_savepoint(ulint savepoint, buf_block_t* block);
+ /** Try to get a block at a savepoint.
+ @param savepoint the savepoint right before the block was acquired
+ @return the block at the savepoint
+ @retval nullptr if no buffer block was registered at that savepoint */
+ buf_block_t *block_at_savepoint(ulint savepoint) const
+ {
+ ut_ad(is_active());
+ const mtr_memo_slot_t &slot= m_memo[savepoint];
+ return slot.type < MTR_MEMO_S_LOCK
+ ? static_cast<buf_block_t*>(slot.object)
+ : nullptr;
+ }
- /** X-latch a not yet latched block after a savepoint. */
- inline void x_latch_at_savepoint(ulint savepoint, buf_block_t* block);
+ /** Retrieve a page that has already been latched.
+ @param id page identifier
+ @param type page latch type
+ @return block
+ @retval nullptr if the block had not been latched yet */
+ buf_block_t *get_already_latched(const page_id_t id, mtr_memo_type_t type)
+ const;
/** @return the logging mode */
mtr_log_t get_log_mode() const
{
static_assert(MTR_LOG_ALL == 0, "efficiency");
- ut_ad(m_log_mode <= MTR_LOG_NO_REDO);
return static_cast<mtr_log_t>(m_log_mode);
}
+ /** @return whether log is to be written for changes */
+ bool is_logged() const
+ {
+ static_assert(MTR_LOG_ALL == 0, "efficiency");
+ static_assert(MTR_LOG_NONE & MTR_LOG_NO_REDO, "efficiency");
+ static_assert(!(MTR_LOG_NONE & MTR_LOG_SUB), "efficiency");
+ return !(m_log_mode & MTR_LOG_NONE);
+ }
+
/** Change the logging mode.
@param mode logging mode
@return old mode */
@@ -151,10 +176,23 @@ struct mtr_t {
return old_mode;
}
+ /** Set the log mode of a sub-minitransaction
+ @param mtr parent mini-transaction */
+ void set_log_mode_sub(const mtr_t &mtr)
+ {
+ ut_ad(mtr.m_log_mode == MTR_LOG_ALL || mtr.m_log_mode == MTR_LOG_NO_REDO);
+ m_log_mode= mtr.m_log_mode | MTR_LOG_SUB;
+ static_assert((MTR_LOG_SUB | MTR_LOG_NO_REDO) == MTR_LOG_NO_REDO, "");
+ }
+
/** Check if we are holding a block latch in exclusive mode
@param block buffer pool block to search for */
bool have_x_latch(const buf_block_t &block) const;
+ /** Check if we are holding a block latch in S or U mode
+ @param block buffer pool block to search for */
+ bool have_u_or_x_latch(const buf_block_t &block) const;
+
/** Copy the tablespaces associated with the mini-transaction
(needed for generating FILE_MODIFY records)
@param[in] mtr mini-transaction that may modify
@@ -214,89 +252,61 @@ struct mtr_t {
/** Acquire a tablespace X-latch.
@param[in] space_id tablespace ID
- @param[in] file file name from where called
- @param[in] line line number in file
@return the tablespace object (never NULL) */
- fil_space_t* x_lock_space(
- ulint space_id,
- const char* file,
- unsigned line);
-
- /** Acquire a shared rw-latch.
- @param[in] lock rw-latch
- @param[in] file file name from where called
- @param[in] line line number in file */
- void s_lock(rw_lock_t* lock, const char* file, unsigned line)
- {
- rw_lock_s_lock_inline(lock, 0, file, line);
- memo_push(lock, MTR_MEMO_S_LOCK);
- }
+ fil_space_t* x_lock_space(ulint space_id);
- /** Acquire an exclusive rw-latch.
- @param[in] lock rw-latch
- @param[in] file file name from where called
- @param[in] line line number in file */
- void x_lock(rw_lock_t* lock, const char* file, unsigned line)
- {
- rw_lock_x_lock_inline(lock, 0, file, line);
- memo_push(lock, MTR_MEMO_X_LOCK);
- }
+ /** Acquire a shared rw-latch. */
+ void s_lock(
+#ifdef UNIV_PFS_RWLOCK
+ const char *file, unsigned line,
+#endif
+ index_lock *lock)
+ {
+ lock->s_lock(SRW_LOCK_ARGS(file, line));
+ memo_push(lock, MTR_MEMO_S_LOCK);
+ }
- /** Acquire an shared/exclusive rw-latch.
- @param[in] lock rw-latch
- @param[in] file file name from where called
- @param[in] line line number in file */
- void sx_lock(rw_lock_t* lock, const char* file, unsigned line)
- {
- rw_lock_sx_lock_inline(lock, 0, file, line);
- memo_push(lock, MTR_MEMO_SX_LOCK);
- }
+ /** Acquire an exclusive rw-latch. */
+ void x_lock(
+#ifdef UNIV_PFS_RWLOCK
+ const char *file, unsigned line,
+#endif
+ index_lock *lock)
+ {
+ lock->x_lock(SRW_LOCK_ARGS(file, line));
+ memo_push(lock, MTR_MEMO_X_LOCK);
+ }
- /** Acquire a tablespace X-latch.
- @param[in] space tablespace
- @param[in] file file name from where called
- @param[in] line line number in file */
- void x_lock_space(fil_space_t* space, const char* file, unsigned line)
- {
- ut_ad(space->purpose == FIL_TYPE_TEMPORARY
- || space->purpose == FIL_TYPE_IMPORT
- || space->purpose == FIL_TYPE_TABLESPACE);
- memo_push(space, MTR_MEMO_SPACE_X_LOCK);
- rw_lock_x_lock_inline(&space->latch, 0, file, line);
- }
+ /** Acquire an update latch. */
+ void u_lock(
+#ifdef UNIV_PFS_RWLOCK
+ const char *file, unsigned line,
+#endif
+ index_lock *lock)
+ {
+ lock->u_lock(SRW_LOCK_ARGS(file, line));
+ memo_push(lock, MTR_MEMO_SX_LOCK);
+ }
- /** Acquire a tablespace SX-latch.
- @param[in] space tablespace
- @param[in] file file name from where called
- @param[in] line line number in file */
- void sx_lock_space(fil_space_t *space, const char *file, unsigned line)
- {
- ut_ad(space->purpose == FIL_TYPE_TEMPORARY
- || space->purpose == FIL_TYPE_IMPORT
- || space->purpose == FIL_TYPE_TABLESPACE);
- sx_lock(&space->latch, file, line);
- }
-
- /** Release an object in the memo stack.
- @param object object
- @param type object type
- @return bool if lock released */
- bool memo_release(const void* object, ulint type);
+ /** Acquire an exclusive tablespace latch.
+ @param space tablespace */
+ void x_lock_space(fil_space_t *space);
+ /** Release an index latch. */
+ void release(const index_lock &lock) { release(&lock); }
+ /** Release a latch to an unmodified page. */
+ void release(const buf_block_t &block) { release(&block); }
private:
- /** Note that the mini-transaction will modify data. */
- void flag_modified() { m_modifications = true; }
+ /** Release an unmodified object. */
+ void release(const void *object);
+public:
/** Mark the given latched page as modified.
@param block page that will be modified */
- void modify(const buf_block_t& block);
-public:
- /** Note that the mini-transaction will modify a block. */
- void set_modified(const buf_block_t &block)
- { flag_modified(); if (m_log_mode != MTR_LOG_NONE) modify(block); }
+ void set_modified(const buf_block_t &block);
/** Set the state to not-modified. This will not log the changes.
This is only used during redo log apply, to avoid logging the changes. */
- void discard_modifications() { m_modifications = false; }
+ void discard_modifications() { m_modifications= false; }
/** Get the LSN of commit().
@return the commit LSN
@@ -318,59 +328,127 @@ public:
/** @return true if pages has been trimed */
bool is_trim_pages() { return m_trim_pages; }
+ /** Latch a buffer pool block.
+ @param block block to be latched
+ @param rw_latch RW_S_LATCH, RW_SX_LATCH, RW_X_LATCH, RW_NO_LATCH */
+ void page_lock(buf_block_t *block, ulint rw_latch);
+
+ /** Acquire a latch on a buffer-fixed buffer pool block.
+ @param savepoint savepoint location of the buffer-fixed block
+ @param rw_latch latch to acquire */
+ void upgrade_buffer_fix(ulint savepoint, rw_lock_type_t rw_latch);
+
+ /** Register a change to the page latch state. */
+ void lock_register(ulint savepoint, mtr_memo_type_t type)
+ {
+ mtr_memo_slot_t &slot= m_memo[savepoint];
+ ut_ad(slot.type <= MTR_MEMO_BUF_FIX);
+ ut_ad(type <= MTR_MEMO_BUF_FIX);
+ slot.type= type;
+ }
+
+ /** Upgrade U locks on a block to X */
+ void page_lock_upgrade(const buf_block_t &block);
+
+ /** Upgrade index U lock to X */
+ ATTRIBUTE_COLD void index_lock_upgrade();
+
+ /** Check if we are holding tablespace latch
+ @param space tablespace to search for
+ @return whether space.latch is being held */
+ bool memo_contains(const fil_space_t& space) const
+ MY_ATTRIBUTE((warn_unused_result));
#ifdef UNIV_DEBUG
/** Check if we are holding an rw-latch in this mini-transaction
@param lock latch to search for
@param type held latch type
@return whether (lock,type) is contained */
- bool memo_contains(const rw_lock_t &lock, mtr_memo_type_t type)
- MY_ATTRIBUTE((warn_unused_result));
- /** Check if we are holding exclusive tablespace latch
- @param space tablespace to search for
- @return whether space.latch is being held */
- bool memo_contains(const fil_space_t& space)
+ bool memo_contains(const index_lock &lock, mtr_memo_type_t type) const
MY_ATTRIBUTE((warn_unused_result));
-
- /** Check if memo contains the given item.
- @param object object to search
- @param flags specify types of object (can be ORred) of
- MTR_MEMO_PAGE_S_FIX ... values
- @return true if contains */
- bool memo_contains_flagged(const void* ptr, ulint flags) const;
-
- /** Check if memo contains the given page.
- @param[in] ptr pointer to within buffer frame
- @param[in] flags specify types of object with OR of
- MTR_MEMO_PAGE_S_FIX... values
- @return the block
- @retval NULL if not found */
- buf_block_t* memo_contains_page_flagged(
- const byte* ptr,
- ulint flags) const;
-
- /** @return true if mini-transaction contains modifications. */
- bool has_modifications() const { return m_modifications; }
+ /** Check if memo contains an index or buffer block latch.
+ @param object object to search
+ @param flags specify types of object latches
+ @return true if contains */
+ bool memo_contains_flagged(const void *object, ulint flags) const
+ MY_ATTRIBUTE((warn_unused_result, nonnull));
+
+ /** Check if memo contains the given page.
+ @param ptr pointer to within page frame
+ @param flags types latch to look for
+ @return the block
+ @retval nullptr if not found */
+ buf_block_t *memo_contains_page_flagged(const byte *ptr, ulint flags) const;
+
+ /** @return whether this mini-transaction modifies persistent data */
+ bool has_modifications() const { return m_modifications; }
#endif /* UNIV_DEBUG */
- /** @return true if a record was added to the mini-transaction */
- bool is_dirty() const { return m_made_dirty; }
-
- /** Push an object to an mtr memo stack.
- @param object object
- @param type object type: MTR_MEMO_S_LOCK, ... */
- inline void memo_push(void* object, mtr_memo_type_t type);
+ /** Push a buffer page to an the memo.
+ @param block buffer block
+ @param type object type: MTR_MEMO_S_LOCK, ... */
+ void memo_push(buf_block_t *block, mtr_memo_type_t type)
+ __attribute__((nonnull))
+ {
+ ut_ad(is_active());
+ ut_ad(type <= MTR_MEMO_PAGE_SX_MODIFY);
+ ut_ad(block->page.buf_fix_count());
+ ut_ad(block->page.in_file());
+#ifdef UNIV_DEBUG
+ switch (type) {
+ case MTR_MEMO_PAGE_S_FIX:
+ ut_ad(block->page.lock.have_s());
+ break;
+ case MTR_MEMO_PAGE_X_FIX: case MTR_MEMO_PAGE_X_MODIFY:
+ ut_ad(block->page.lock.have_x());
+ break;
+ case MTR_MEMO_PAGE_SX_FIX: case MTR_MEMO_PAGE_SX_MODIFY:
+ ut_ad(block->page.lock.have_u_or_x());
+ break;
+ case MTR_MEMO_BUF_FIX:
+ break;
+ case MTR_MEMO_MODIFY:
+ case MTR_MEMO_S_LOCK: case MTR_MEMO_X_LOCK: case MTR_MEMO_SX_LOCK:
+ case MTR_MEMO_SPACE_X_LOCK:
+ ut_ad("invalid type" == 0);
+ }
+#endif
+ if (!(type & MTR_MEMO_MODIFY));
+ else if (block->page.id().space() >= SRV_TMP_SPACE_ID)
+ {
+ block->page.set_temp_modified();
+ type= mtr_memo_type_t(type & ~MTR_MEMO_MODIFY);
+ }
+ else
+ {
+ m_modifications= true;
+ if (!m_made_dirty)
+ /* If we are going to modify a previously clean persistent page,
+ we must set m_made_dirty, so that commit() will acquire
+ log_sys.flush_order_mutex and insert the block into
+ buf_pool.flush_list. */
+ m_made_dirty= block->page.oldest_modification() <= 1;
+ }
+ m_memo.emplace_back(mtr_memo_slot_t{block, type});
+ }
- /** Check if this mini-transaction is dirtying a clean page.
- @param block block being x-fixed
- @return true if the mtr is dirtying a clean page. */
- static inline bool is_block_dirtied(const buf_block_t* block)
- MY_ATTRIBUTE((warn_unused_result));
+ /** Push an index lock or tablespace latch to the memo.
+ @param object index lock or tablespace latch
+ @param type object type: MTR_MEMO_S_LOCK, ... */
+ void memo_push(void *object, mtr_memo_type_t type) __attribute__((nonnull))
+ {
+ ut_ad(is_active());
+ ut_ad(type >= MTR_MEMO_S_LOCK);
+ m_memo.emplace_back(mtr_memo_slot_t{object, type});
+ }
/** @return the size of the log is empty */
size_t get_log_size() const { return m_log.size(); }
/** @return whether the log and memo are empty */
- bool is_empty() const { return m_memo.size() == 0 && m_log.size() == 0; }
+ bool is_empty() const { return !get_savepoint() && !get_log_size(); }
+
+ /** Write an OPT_PAGE_CHECKSUM record. */
+ inline void page_checksum(const buf_page_t &bpage);
/** Write request types */
enum write_type
@@ -470,9 +548,9 @@ public:
@param[in,out] b buffer page */
void init(buf_block_t *b);
/** Free a page.
- @param[in] space tablespace contains page to be freed
- @param[in] offset page offset to be freed */
- inline void free(fil_space_t &space, uint32_t offset);
+ @param space tablespace
+ @param offset offset of the page to be freed */
+ void free(const fil_space_t &space, uint32_t offset);
/** Write log for partly initializing a B-tree or R-tree page.
@param block B-tree or R-tree page
@param comp false=ROW_FORMAT=REDUNDANT, true=COMPACT or DYNAMIC */
@@ -618,6 +696,8 @@ private:
@return {start_lsn,flush_ahead} */
inline std::pair<lsn_t,page_flush_ahead> finish_write(ulint len);
+ /** Release all latches. */
+ void release();
/** Release the resources */
inline void release_resources();
@@ -628,11 +708,17 @@ public:
{ ut_ad(!m_commit || m_start); return m_start && !m_commit; }
/** @return whether the mini-transaction has been committed */
bool has_committed() const { ut_ad(!m_commit || m_start); return m_commit; }
+ /** @return whether the mini-transaction is freeing an index tree */
+ bool is_freeing_tree() const { return m_freeing_tree; }
+ /** Notify that the mini-transaction is freeing an index tree */
+ void freeing_tree() { m_freeing_tree= true; }
private:
/** whether start() has been called */
bool m_start= false;
/** whether commit() has been called */
bool m_commit= false;
+ /** whether freeing_tree() has been called */
+ bool m_freeing_tree= false;
#endif
/** The page of the most recent m_log record written, or NULL */
@@ -643,7 +729,7 @@ private:
/** specifies which operations should be logged; default MTR_LOG_ALL */
uint16_t m_log_mode:2;
- /** whether at least one buffer pool page was written to */
+ /** whether at least one persistent page was written to */
uint16_t m_modifications:1;
/** whether at least one previously clean buffer pool page was written to */
@@ -663,7 +749,7 @@ private:
#endif /* UNIV_DEBUG */
/** acquired dict_index_t::lock, fil_space_t::latch, buf_block_t */
- mtr_buf_t m_memo;
+ small_vector<mtr_memo_slot_t, 16> m_memo;
/** mini-transaction log */
mtr_buf_t m_log;
@@ -679,7 +765,3 @@ private:
/** set of freed page ids */
range_set *m_freed_pages= nullptr;
};
-
-#include "mtr0mtr.inl"
-
-#endif /* mtr0mtr_h */
diff --git a/storage/innobase/include/mtr0types.h b/storage/innobase/include/mtr0types.h
index 9e59dc814d3..465c20fe7d2 100644
--- a/storage/innobase/include/mtr0types.h
+++ b/storage/innobase/include/mtr0types.h
@@ -24,14 +24,11 @@ Mini-transaction buffer global types
Created 11/26/1995 Heikki Tuuri
*******************************************************/
-#ifndef mtr0types_h
-#define mtr0types_h
+#pragma once
-#ifndef UNIV_INNOCHECKSUM
-#include "sync0rw.h"
-#else
-#include "univ.i"
-#endif /* UNIV_INNOCHECKSUM */
+#include "buf0types.h"
+
+#include "ut0byte.h"
struct mtr_t;
@@ -44,6 +41,11 @@ enum mtr_log_t {
Set for attempting modification of a ROW_FORMAT=COMPRESSED page. */
MTR_LOG_NONE,
+ /** Log all operations, but do not write any OPT_PAGE_CHECKSUM
+ records because some of the modified pages were also modified
+ by another mini-transaction that did not write its log yet. */
+ MTR_LOG_SUB,
+
/** Don't generate REDO log but add dirty pages to flush list */
MTR_LOG_NO_REDO
};
@@ -80,12 +82,8 @@ type. The following record types refer to data pages:
RESERVED (6): reserved for future use; a subtype code
(encoded immediately after the length) would be written
to reserve code space for further extensions
- OPTION (7): optional record that may be ignored; a subtype code
- (encoded immediately after the length) would distinguish actual
- usage, such as:
- * MDEV-18976 page checksum record
- * binlog record
- * SQL statement (at the start of statement)
+ OPTION (7): optional record that may be ignored; a subtype @see mrec_opt
+ (encoded immediately after the length) would distinguish actual usage
Bits 3..0 indicate the redo log record length, excluding the first
byte, but including additional length bytes and any other bytes,
@@ -232,9 +230,7 @@ enum mrec_type_t
/** Reserved for future use. */
RESERVED= 0x60,
/** Optional record that may be ignored in crash recovery.
- A subtype code will be encoded immediately after the length.
- Possible subtypes would include a MDEV-18976 page checksum record,
- a binlog record, or an SQL statement. */
+ A subtype (@see mrec_opt) will be encoded after the page identifier. */
OPTION= 0x70
};
@@ -286,6 +282,15 @@ enum mrec_ext_t
};
+/** Recognized OPTION record subtypes. */
+enum mrec_opt
+{
+ /** page checksum at the end of the mini-transaction */
+ OPT_PAGE_CHECKSUM= 0
+ /* Other possible subtypes: a binlog record, or an SQL statement. */
+};
+
+
/** Redo log record types for file-level operations. These bit
patterns will be written to redo log files, so the existing codes or
their interpretation on crash recovery must not be changed. */
@@ -331,9 +336,7 @@ enum mtr_memo_type_t {
MTR_MEMO_SX_LOCK = RW_SX_LATCH << 5,
- /** acquire X-latch on fil_space_t::latch */
+ /** wr_lock() on fil_space_t::latch */
MTR_MEMO_SPACE_X_LOCK = MTR_MEMO_SX_LOCK << 1
};
-#endif /* !UNIV_CHECKSUM */
-
-#endif /* mtr0types_h */
+#endif /* !UNIV_INNOCHECKSUM */
diff --git a/storage/innobase/include/os0file.h b/storage/innobase/include/os0file.h
index a22dc3562b5..f8ae0f51557 100644
--- a/storage/innobase/include/os0file.h
+++ b/storage/innobase/include/os0file.h
@@ -51,6 +51,8 @@ extern bool os_has_said_disk_full;
/** File offset in bytes */
typedef ib_uint64_t os_offset_t;
+class buf_tmp_buffer_t;
+
#ifdef _WIN32
/** We define always WIN_ASYNC_IO, and check at run-time whether
@@ -206,11 +208,13 @@ public:
PUNCH_RANGE= WRITE_SYNC | 128,
};
- constexpr IORequest(buf_page_t *bpage, fil_node_t *node, Type type) :
- bpage(bpage), node(node), type(type) {}
+ constexpr IORequest(buf_page_t *bpage, buf_tmp_buffer_t *slot,
+ fil_node_t *node, Type type) :
+ bpage(bpage), slot(slot), node(node), type(type) {}
- constexpr IORequest(Type type= READ_SYNC, buf_page_t *bpage= nullptr) :
- bpage(bpage), type(type) {}
+ constexpr IORequest(Type type= READ_SYNC, buf_page_t *bpage= nullptr,
+ buf_tmp_buffer_t *slot= nullptr) :
+ bpage(bpage), slot(slot), type(type) {}
bool is_read() const { return (type & READ_SYNC) != 0; }
bool is_write() const { return (type & WRITE_SYNC) != 0; }
@@ -237,7 +241,10 @@ private:
public:
/** Page to be written on write operation */
- buf_page_t* const bpage= nullptr;
+ buf_page_t *const bpage= nullptr;
+
+ /** Memory to be used for encrypted or page_compressed pages */
+ buf_tmp_buffer_t *const slot= nullptr;
/** File descriptor */
fil_node_t *const node= nullptr;
@@ -263,8 +270,8 @@ struct os_file_size_t {
constexpr ulint OS_AIO_N_PENDING_IOS_PER_THREAD= 256;
extern Atomic_counter<ulint> os_n_file_reads;
-extern ulint os_n_file_writes;
-extern ulint os_n_fsyncs;
+extern Atomic_counter<size_t> os_n_file_writes;
+extern Atomic_counter<size_t> os_n_fsyncs;
/* File types for directory entry data type */
@@ -575,12 +582,8 @@ The wrapper functions have the prefix of "innodb_". */
# define os_file_close(file) \
pfs_os_file_close_func(file, __FILE__, __LINE__)
-# define os_file_read(type, file, buf, offset, n) \
- pfs_os_file_read_func(type, file, buf, offset, n, __FILE__, __LINE__)
-
-# define os_file_read_no_error_handling(type, file, buf, offset, n, o) \
- pfs_os_file_read_no_error_handling_func( \
- type, file, buf, offset, n, o, __FILE__, __LINE__)
+# define os_file_read(type, file, buf, offset, n, o) \
+ pfs_os_file_read_func(type, file, buf, offset, n,o, __FILE__, __LINE__)
# define os_file_write(type, name, file, buf, offset, n) \
pfs_os_file_write_func(type, name, file, buf, offset, \
@@ -725,31 +728,6 @@ pfs_os_file_read_func(
void* buf,
os_offset_t offset,
ulint n,
- const char* src_file,
- uint src_line);
-
-/** NOTE! Please use the corresponding macro os_file_read_no_error_handling(),
-not directly this function!
-This is the performance schema instrumented wrapper function for
-os_file_read_no_error_handling_func() which requests a synchronous
-read operation.
-@param[in] type IO request context
-@param[in] file Open file handle
-@param[out] buf buffer where to read
-@param[in] offset file offset where to read
-@param[in] n number of bytes to read
-@param[out] o number of bytes actually read
-@param[in] src_file file name where func invoked
-@param[in] src_line line where the func invoked
-@return DB_SUCCESS if request was successful */
-UNIV_INLINE
-dberr_t
-pfs_os_file_read_no_error_handling_func(
- const IORequest& type,
- pfs_os_file_t file,
- void* buf,
- os_offset_t offset,
- ulint n,
ulint* o,
const char* src_file,
uint src_line);
@@ -875,11 +853,8 @@ to original un-instrumented file I/O APIs */
# define os_file_close(file) os_file_close_func(file)
-# define os_file_read(type, file, buf, offset, n) \
- os_file_read_func(type, file, buf, offset, n)
-
-# define os_file_read_no_error_handling(type, file, buf, offset, n, o) \
- os_file_read_no_error_handling_func(type, file, buf, offset, n, o)
+# define os_file_read(type, file, buf, offset, n, o) \
+ os_file_read_func(type, file, buf, offset, n, o)
# define os_file_write(type, name, file, buf, offset, n) \
os_file_write_func(type, name, file, buf, offset, n)
@@ -985,6 +960,7 @@ Requests a synchronous read operation.
@param[out] buf buffer where to read
@param[in] offset file offset where to read
@param[in] n number of bytes to read
+@param[out] o number of bytes actually read
@return DB_SUCCESS if request was successful */
dberr_t
os_file_read_func(
@@ -992,7 +968,8 @@ os_file_read_func(
os_file_t file,
void* buf,
os_offset_t offset,
- ulint n)
+ ulint n,
+ ulint* o)
MY_ATTRIBUTE((warn_unused_result));
/** Rewind file to its start, read at most size - 1 bytes from it to str, and
@@ -1007,27 +984,6 @@ os_file_read_string(
char* str,
ulint size);
-/** NOTE! Use the corresponding macro os_file_read_no_error_handling(),
-not directly this function!
-Requests a synchronous positioned read operation. This function does not do
-any error handling. In case of error it returns FALSE.
-@param[in] type IO request context
-@param[in] file Open file handle
-@param[out] buf buffer where to read
-@param[in] offset file offset where to read
-@param[in] n number of bytes to read
-@param[out] o number of bytes actually read
-@return DB_SUCCESS or error code */
-dberr_t
-os_file_read_no_error_handling_func(
- const IORequest& type,
- os_file_t file,
- void* buf,
- os_offset_t offset,
- ulint n,
- ulint* o)
- MY_ATTRIBUTE((warn_unused_result));
-
/** NOTE! Use the corresponding macro os_file_write(), not directly this
function!
Requests a synchronous write operation.
@@ -1058,23 +1014,6 @@ os_file_status(
bool* exists,
os_file_type_t* type);
-/** This function returns a new path name after replacing the basename
-in an old path with a new basename. The old_path is a full path
-name including the extension. The tablename is in the normal
-form "databasename/tablename". The new base name is found after
-the forward slash. Both input strings are null terminated.
-
-This function allocates memory to be returned. It is the callers
-responsibility to free the return value after it is no longer needed.
-
-@param[in] old_path pathname
-@param[in] new_name new file name
-@return own: new full pathname */
-char*
-os_file_make_new_pathname(
- const char* old_path,
- const char* new_name);
-
/** This function reduces a null-terminated full remote path name into
the path that is sent by MySQL for DATA DIRECTORY clause. It replaces
the 'databasename/tablename.ibd' found at the end of the path with just
@@ -1120,14 +1059,19 @@ void os_aio_free();
@retval DB_IO_ERROR on I/O error */
dberr_t os_aio(const IORequest &type, void *buf, os_offset_t offset, size_t n);
+/** @return number of pending reads */
+size_t os_aio_pending_reads();
+/** @return approximate number of pending reads */
+size_t os_aio_pending_reads_approx();
+/** @return number of pending writes */
+size_t os_aio_pending_writes();
+
/** Wait until there are no pending asynchronous writes. */
void os_aio_wait_until_no_pending_writes();
-
-/** Wait until there are no pending asynchronous reads. */
+/** Wait until all pending asynchronous reads have completed. */
void os_aio_wait_until_no_pending_reads();
-
/** Prints info of the aio arrays.
@param[in/out] file file where to print */
void
@@ -1208,31 +1152,34 @@ os_file_punch_hole(
os_offset_t len)
MY_ATTRIBUTE((warn_unused_result));
-/** Normalizes a directory path for the current OS:
-On Windows, we convert '/' to '\', else we convert '\' to '/'.
-@param[in,out] str A null-terminated directory and file path */
-void os_normalize_path(char* str);
-
/* Determine if a path is an absolute path or not.
@param[in] OS directory or file path to evaluate
@retval true if an absolute path
@retval false if a relative path */
-UNIV_INLINE
-bool
-is_absolute_path(
- const char* path)
+inline bool is_absolute_path(const char *path)
{
- if (path[0] == OS_PATH_SEPARATOR) {
- return(true);
- }
+ switch (path[0]) {
+#ifdef _WIN32
+ case '\0':
+ return false;
+ case '\\':
+#endif
+ case '/':
+ return true;
+ }
#ifdef _WIN32
- if (path[1] == ':' && path[2] == OS_PATH_SEPARATOR) {
- return(true);
- }
+ if (path[1] == ':')
+ {
+ switch (path[2]) {
+ case '/':
+ case '\\':
+ return true;
+ }
+ }
#endif /* _WIN32 */
- return(false);
+ return false;
}
#include "os0file.inl"
diff --git a/storage/innobase/include/os0file.inl b/storage/innobase/include/os0file.inl
index e88f94b8ff3..7de3150540d 100644
--- a/storage/innobase/include/os0file.inl
+++ b/storage/innobase/include/os0file.inl
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 2010, 2017, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2013, 2020, MariaDB Corporation.
+Copyright (c) 2013, 2022, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -210,6 +210,7 @@ os_file_read() which requests a synchronous read operation.
@param[out] buf buffer where to read
@param[in] offset file offset where to read
@param[in] n number of bytes to read
+@param[out] o number of bytes actually read
@param[in] src_file file name where func invoked
@param[in] src_line line where the func invoked
@return DB_SUCCESS if request was successful */
@@ -221,6 +222,7 @@ pfs_os_file_read_func(
void* buf,
os_offset_t offset,
ulint n,
+ ulint* o,
const char* src_file,
uint src_line)
{
@@ -232,47 +234,7 @@ pfs_os_file_read_func(
dberr_t result;
- result = os_file_read_func(type, file, buf, offset, n);
-
- register_pfs_file_io_end(locker, n);
-
- return(result);
-}
-
-/** NOTE! Please use the corresponding macro os_file_read_no_error_handling(),
-not directly this function!
-This is the performance schema instrumented wrapper function for
-os_file_read_no_error_handling_func() which requests a synchronous
-read operation.
-@param[in] type IO request context
-@param[in] file Open file handle
-@param[out] buf buffer where to read
-@param[in] offset file offset where to read
-@param[in] n number of bytes to read
-@param[out] o number of bytes actually read
-@param[in] src_file file name where func invoked
-@param[in] src_line line where the func invoked
-@return DB_SUCCESS if request was successful */
-UNIV_INLINE
-dberr_t
-pfs_os_file_read_no_error_handling_func(
- const IORequest& type,
- pfs_os_file_t file,
- void* buf,
- os_offset_t offset,
- ulint n,
- ulint* o,
- const char* src_file,
- uint src_line)
-{
- PSI_file_locker_state state;
- struct PSI_file_locker* locker = NULL;
-
- register_pfs_file_io_begin(
- &state, locker, file, n, PSI_FILE_READ, src_file, src_line);
-
- dberr_t result = os_file_read_no_error_handling_func(
- type, file, buf, offset, n, o);
+ result = os_file_read_func(type, file, buf, offset, n, o);
register_pfs_file_io_end(locker, n);
diff --git a/storage/innobase/include/page0cur.h b/storage/innobase/include/page0cur.h
index d80eb4567e5..28aa30565e4 100644
--- a/storage/innobase/include/page0cur.h
+++ b/storage/innobase/include/page0cur.h
@@ -54,14 +54,11 @@ page_zip_des_t*
page_cur_get_page_zip(
/*==================*/
page_cur_t* cur); /*!< in: page cursor */
-/*********************************************************//**
-Gets the record where the cursor is positioned.
+/* Gets the record where the cursor is positioned.
+@param cur page cursor
@return record */
UNIV_INLINE
-rec_t*
-page_cur_get_rec(
-/*=============*/
- page_cur_t* cur); /*!< in: page cursor */
+rec_t *page_cur_get_rec(const page_cur_t *cur);
#else /* UNIV_DEBUG */
# define page_cur_get_page(cur) page_align((cur)->rec)
# define page_cur_get_block(cur) (cur)->block
@@ -113,20 +110,6 @@ page_cur_position(
const buf_block_t* block, /*!< in: buffer block containing
the record */
page_cur_t* cur); /*!< out: page cursor */
-/**********************************************************//**
-Moves the cursor to the next record on page. */
-UNIV_INLINE
-void
-page_cur_move_to_next(
-/*==================*/
- page_cur_t* cur); /*!< in/out: cursor; must not be after last */
-/**********************************************************//**
-Moves the cursor to the previous record on page. */
-UNIV_INLINE
-void
-page_cur_move_to_prev(
-/*==================*/
- page_cur_t* cur); /*!< in/out: cursor; not before first */
/***********************************************************//**
Inserts a record next to page cursor. Returns pointer to inserted record if
@@ -146,7 +129,6 @@ page_cur_tuple_insert(
/*==================*/
page_cur_t* cursor, /*!< in/out: a page cursor */
const dtuple_t* tuple, /*!< in: pointer to a data tuple */
- dict_index_t* index, /*!< in: record descriptor */
rec_offs** offsets,/*!< out: offsets on *rec */
mem_heap_t** heap, /*!< in/out: pointer to memory heap, or NULL */
ulint n_ext, /*!< in: number of externally stored columns */
@@ -160,7 +142,6 @@ rec_t*
page_cur_insert_rec_low(
/*====================*/
const page_cur_t*cur, /*!< in: page cursor */
- dict_index_t* index, /*!< in: record descriptor */
const rec_t* rec, /*!< in: record to insert after cur */
rec_offs* offsets,/*!< in/out: rec_get_offsets(rec, index) */
mtr_t* mtr) /*!< in/out: mini-transaction */
@@ -168,21 +149,20 @@ page_cur_insert_rec_low(
/***********************************************************//**
Inserts a record next to page cursor on a compressed and uncompressed
-page. Returns pointer to inserted record if succeed, i.e.,
-enough space available, NULL otherwise.
-The cursor stays at the same position.
+page.
IMPORTANT: The caller will have to update IBUF_BITMAP_FREE
if this is a compressed leaf page in a secondary index.
This has to be done either within the same mini-transaction,
or by invoking ibuf_reset_free_bits() before mtr_commit().
-@return pointer to record if succeed, NULL otherwise */
+@return pointer to inserted record
+@return nullptr on failure */
rec_t*
page_cur_insert_rec_zip(
/*====================*/
- page_cur_t* cursor, /*!< in/out: page cursor */
- dict_index_t* index, /*!< in: record descriptor */
+ page_cur_t* cursor, /*!< in/out: page cursor,
+ logical position unchanged */
const rec_t* rec, /*!< in: pointer to a physical record */
rec_offs* offsets,/*!< in/out: rec_get_offsets(rec, index) */
mtr_t* mtr) /*!< in/out: mini-transaction */
@@ -194,7 +174,6 @@ void
page_cur_delete_rec(
/*================*/
page_cur_t* cursor, /*!< in/out: a page cursor */
- const dict_index_t* index, /*!< in: record descriptor */
const rec_offs* offsets,/*!< in: rec_get_offsets(
cursor->rec, index) */
mtr_t* mtr) /*!< in/out: mini-transaction */
@@ -250,43 +229,12 @@ page_cur_delete_rec() for a ROW_FORMAT=COMPACT or DYNAMIC page.
bool page_apply_delete_dynamic(const buf_block_t &block, ulint prev,
size_t hdr_size, size_t data_size);
-/** Search the right position for a page cursor.
-@param[in] block buffer block
-@param[in] index index tree
-@param[in] tuple data tuple
-@param[in] mode PAGE_CUR_L, PAGE_CUR_LE, PAGE_CUR_G, or PAGE_CUR_GE
-@param[out] cursor page cursor
-@return number of matched fields on the left */
-UNIV_INLINE
-ulint
-page_cur_search(
- const buf_block_t* block,
- const dict_index_t* index,
- const dtuple_t* tuple,
- page_cur_mode_t mode,
- page_cur_t* cursor);
-
-/** Search the right position for a page cursor.
-@param[in] block buffer block
-@param[in] index index tree
-@param[in] tuple data tuple
-@param[out] cursor page cursor
-@return number of matched fields on the left */
-UNIV_INLINE
-ulint
-page_cur_search(
- const buf_block_t* block,
- const dict_index_t* index,
- const dtuple_t* tuple,
- page_cur_t* cursor);
-
+MY_ATTRIBUTE((warn_unused_result))
/****************************************************************//**
Searches the right position for a page cursor. */
-void
+bool
page_cur_search_with_match(
/*=======================*/
- const buf_block_t* block, /*!< in: buffer block */
- const dict_index_t* index, /*!< in: record descriptor */
const dtuple_t* tuple, /*!< in: data tuple */
page_cur_mode_t mode, /*!< in: PAGE_CUR_L,
PAGE_CUR_LE, PAGE_CUR_G, or
@@ -297,12 +245,11 @@ page_cur_search_with_match(
ulint* ilow_matched_fields,
/*!< in/out: already matched
fields in lower limit record */
- page_cur_t* cursor, /*!< out: page cursor */
+ page_cur_t* cursor, /*!< in/out: page cursor */
rtr_info_t* rtr_info);/*!< in/out: rtree search stack */
#ifdef BTR_CUR_HASH_ADAPT
+MY_ATTRIBUTE((warn_unused_result))
/** Search the right position for a page cursor.
-@param[in] block buffer block
-@param[in] index index tree
@param[in] tuple key to be searched for
@param[in] mode search mode
@param[in,out] iup_matched_fields already matched fields in the
@@ -313,11 +260,9 @@ first partially matched field in the upper limit record
lower limit record
@param[in,out] ilow_matched_bytes already matched bytes in the
first partially matched field in the lower limit record
-@param[out] cursor page cursor */
-void
+@param[in,out] cursor page cursor */
+bool
page_cur_search_with_match_bytes(
- const buf_block_t* block,
- const dict_index_t* index,
const dtuple_t* tuple,
page_cur_mode_t mode,
ulint* iup_matched_fields,
@@ -329,21 +274,30 @@ page_cur_search_with_match_bytes(
/***********************************************************//**
Positions a page cursor on a randomly chosen user record on a page. If there
are no user records, sets the cursor on the infimum record. */
-void
-page_cur_open_on_rnd_user_rec(
-/*==========================*/
- buf_block_t* block, /*!< in: page */
- page_cur_t* cursor);/*!< out: page cursor */
+void page_cur_open_on_rnd_user_rec(page_cur_t *cursor);
/** Index page cursor */
struct page_cur_t{
- const dict_index_t* index;
+ dict_index_t* index;
rec_t* rec; /*!< pointer to a record on page */
rec_offs* offsets;
buf_block_t* block; /*!< pointer to the block containing rec */
};
+
+MY_ATTRIBUTE((nonnull, warn_unused_result))
+inline rec_t *page_cur_move_to_next(page_cur_t *cur)
+{
+ return cur->rec= page_rec_get_next(cur->rec);
+}
+
+MY_ATTRIBUTE((nonnull, warn_unused_result))
+inline rec_t *page_cur_move_to_prev(page_cur_t *cur)
+{
+ return cur->rec= page_rec_get_prev(cur->rec);
+}
+
#include "page0cur.inl"
#endif
diff --git a/storage/innobase/include/page0cur.inl b/storage/innobase/include/page0cur.inl
index 828be6840d2..1638b5749ff 100644
--- a/storage/innobase/include/page0cur.inl
+++ b/storage/innobase/include/page0cur.inl
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 1994, 2014, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2015, 2021, MariaDB Corporation.
+Copyright (c) 2015, 2022, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -34,13 +34,7 @@ page_cur_get_page(
/*==============*/
page_cur_t* cur) /*!< in: page cursor */
{
- ut_ad(cur);
-
- if (cur->rec) {
- ut_ad(page_align(cur->rec) == cur->block->frame);
- }
-
- return(page_align(cur->rec));
+ return page_align(page_cur_get_rec(cur));
}
/*********************************************************//**
@@ -52,13 +46,9 @@ page_cur_get_block(
/*===============*/
page_cur_t* cur) /*!< in: page cursor */
{
- ut_ad(cur);
-
- if (cur->rec) {
- ut_ad(page_align(cur->rec) == cur->block->frame);
- }
-
- return(cur->block);
+ ut_ad(cur);
+ ut_ad(!cur->rec || page_align(cur->rec) == cur->block->page.frame);
+ return cur->block;
}
/*********************************************************//**
@@ -73,22 +63,15 @@ page_cur_get_page_zip(
return(buf_block_get_page_zip(page_cur_get_block(cur)));
}
-/*********************************************************//**
-Gets the record where the cursor is positioned.
+/* Gets the record where the cursor is positioned.
+@param cur page cursor
@return record */
UNIV_INLINE
-rec_t*
-page_cur_get_rec(
-/*=============*/
- page_cur_t* cur) /*!< in: page cursor */
+rec_t *page_cur_get_rec(const page_cur_t *cur)
{
- ut_ad(cur);
-
- if (cur->rec) {
- ut_ad(page_align(cur->rec) == cur->block->frame);
- }
-
- return(cur->rec);
+ ut_ad(cur);
+ ut_ad(!cur->rec || page_align(cur->rec) == cur->block->page.frame);
+ return cur->rec;
}
#endif /* UNIV_DEBUG */
@@ -102,7 +85,7 @@ page_cur_set_before_first(
const buf_block_t* block, /*!< in: index page */
page_cur_t* cur) /*!< in: cursor */
{
- cur->block = (buf_block_t*) block;
+ cur->block = const_cast<buf_block_t*>(block);
cur->rec = page_get_infimum_rec(buf_block_get_frame(cur->block));
}
@@ -116,7 +99,7 @@ page_cur_set_after_last(
const buf_block_t* block, /*!< in: index page */
page_cur_t* cur) /*!< in: cursor */
{
- cur->block = (buf_block_t*) block;
+ cur->block = const_cast<buf_block_t*>(block);
cur->rec = page_get_supremum_rec(buf_block_get_frame(cur->block));
}
@@ -130,7 +113,7 @@ page_cur_is_before_first(
const page_cur_t* cur) /*!< in: cursor */
{
ut_ad(cur);
- ut_ad(page_align(cur->rec) == cur->block->frame);
+ ut_ad(page_align(cur->rec) == cur->block->page.frame);
return(page_rec_is_infimum(cur->rec));
}
@@ -144,7 +127,7 @@ page_cur_is_after_last(
const page_cur_t* cur) /*!< in: cursor */
{
ut_ad(cur);
- ut_ad(page_align(cur->rec) == cur->block->frame);
+ ut_ad(page_align(cur->rec) == cur->block->page.frame);
return(page_rec_is_supremum(cur->rec));
}
@@ -160,81 +143,12 @@ page_cur_position(
page_cur_t* cur) /*!< out: page cursor */
{
ut_ad(rec && block && cur);
- ut_ad(page_align(rec) == block->frame);
+ ut_ad(page_align(rec) == block->page.frame);
cur->rec = (rec_t*) rec;
cur->block = (buf_block_t*) block;
}
-/**********************************************************//**
-Moves the cursor to the next record on page. */
-UNIV_INLINE
-void
-page_cur_move_to_next(
-/*==================*/
- page_cur_t* cur) /*!< in/out: cursor; must not be after last */
-{
- ut_ad(!page_cur_is_after_last(cur));
-
- cur->rec = page_rec_get_next(cur->rec);
-}
-
-/**********************************************************//**
-Moves the cursor to the previous record on page. */
-UNIV_INLINE
-void
-page_cur_move_to_prev(
-/*==================*/
- page_cur_t* cur) /*!< in/out: page cursor, not before first */
-{
- ut_ad(!page_cur_is_before_first(cur));
-
- cur->rec = page_rec_get_prev(cur->rec);
-}
-
-/** Search the right position for a page cursor.
-@param[in] block buffer block
-@param[in] index index tree
-@param[in] tuple data tuple
-@param[in] mode PAGE_CUR_L, PAGE_CUR_LE, PAGE_CUR_G, or PAGE_CUR_GE
-@param[out] cursor page cursor
-@return number of matched fields on the left */
-UNIV_INLINE
-ulint
-page_cur_search(
- const buf_block_t* block,
- const dict_index_t* index,
- const dtuple_t* tuple,
- page_cur_mode_t mode,
- page_cur_t* cursor)
-{
- ulint low_match = 0;
- ulint up_match = 0;
-
- ut_ad(dtuple_check_typed(tuple));
-
- page_cur_search_with_match(block, index, tuple, mode,
- &up_match, &low_match, cursor, NULL);
- return(low_match);
-}
-
-/** Search the right position for a page cursor.
-@param[in] block buffer block
-@param[in] index index tree
-@param[in] tuple data tuple
-@param[out] cursor page cursor
-@return number of matched fields on the left */
-UNIV_INLINE
-ulint
-page_cur_search(
- const buf_block_t* block,
- const dict_index_t* index,
- const dtuple_t* tuple,
- page_cur_t* cursor)
-{
- return(page_cur_search(block, index, tuple, PAGE_CUR_LE, cursor));
-}
-
/***********************************************************//**
Inserts a record next to page cursor. Returns pointer to inserted record if
succeed, i.e., enough space available, NULL otherwise. The cursor stays at
@@ -253,14 +167,12 @@ page_cur_tuple_insert(
/*==================*/
page_cur_t* cursor, /*!< in/out: a page cursor */
const dtuple_t* tuple, /*!< in: pointer to a data tuple */
- dict_index_t* index, /*!< in: record descriptor */
rec_offs** offsets,/*!< out: offsets on *rec */
mem_heap_t** heap, /*!< in/out: pointer to memory heap, or NULL */
ulint n_ext, /*!< in: number of externally stored columns */
mtr_t* mtr) /*!< in/out: mini-transaction */
{
- rec_t* rec;
- ulint size = rec_get_converted_size(index, tuple, n_ext);
+ ulint size = rec_get_converted_size(cursor->index, tuple, n_ext);
if (!*heap) {
*heap = mem_heap_create(size
@@ -269,21 +181,20 @@ page_cur_tuple_insert(
* sizeof **offsets);
}
- rec = rec_convert_dtuple_to_rec((byte*) mem_heap_alloc(*heap, size),
- index, tuple, n_ext);
+ rec_t* rec = rec_convert_dtuple_to_rec(
+ static_cast<byte*>(mem_heap_alloc(*heap, size)),
+ cursor->index, tuple, n_ext);
- *offsets = rec_get_offsets(rec, index, *offsets,
- page_is_leaf(cursor->block->frame)
- ? index->n_core_fields : 0,
+ *offsets = rec_get_offsets(rec, cursor->index, *offsets,
+ page_is_leaf(cursor->block->page.frame)
+ ? cursor->index->n_core_fields : 0,
ULINT_UNDEFINED, heap);
ut_ad(size == rec_offs_size(*offsets));
if (is_buf_block_get_page_zip(cursor->block)) {
- rec = page_cur_insert_rec_zip(
- cursor, index, rec, *offsets, mtr);
+ rec = page_cur_insert_rec_zip(cursor, rec, *offsets, mtr);
} else {
- rec = page_cur_insert_rec_low(cursor,
- index, rec, *offsets, mtr);
+ rec = page_cur_insert_rec_low(cursor, rec, *offsets, mtr);
}
ut_ad(!rec || !cmp_dtuple_rec(tuple, rec, *offsets));
diff --git a/storage/innobase/include/page0page.h b/storage/innobase/include/page0page.h
index eb6bf56e8dd..0ad42474f84 100644
--- a/storage/innobase/include/page0page.h
+++ b/storage/innobase/include/page0page.h
@@ -1,6 +1,6 @@
/*****************************************************************************
Copyright (c) 1994, 2019, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2013, 2021, MariaDB Corporation.
+Copyright (c) 2013, 2022, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -418,8 +418,8 @@ template<bool compressed>
inline void page_rec_set_n_owned(buf_block_t *block, rec_t *rec, ulint n_owned,
bool comp, mtr_t *mtr)
{
- ut_ad(block->frame == page_align(rec));
- ut_ad(comp == (page_is_comp(block->frame) != 0));
+ ut_ad(block->page.frame == page_align(rec));
+ ut_ad(comp == (page_is_comp(block->page.frame) != 0));
if (page_zip_des_t *page_zip= compressed
? buf_block_get_page_zip(block) : nullptr)
@@ -534,7 +534,8 @@ inline void page_header_reset_last_insert(buf_block_t *block, mtr_t *mtr)
/************************************************************//**
Returns the nth record of the record list.
This is the inverse function of page_rec_get_n_recs_before().
-@return nth record */
+@return nth record
+@retval nullptr on corrupted page */
const rec_t*
page_rec_get_nth_const(
/*===================*/
@@ -544,14 +545,12 @@ page_rec_get_nth_const(
/************************************************************//**
Returns the nth record of the record list.
This is the inverse function of page_rec_get_n_recs_before().
-@return nth record */
-UNIV_INLINE
-rec_t*
-page_rec_get_nth(
-/*=============*/
- page_t* page, /*< in: page */
- ulint nth) /*!< in: nth record */
- MY_ATTRIBUTE((nonnull, warn_unused_result));
+@return nth record
+@retval nullptr on corrupted page */
+inline rec_t *page_rec_get_nth(page_t* page, ulint nth)
+{
+ return const_cast<rec_t*>(page_rec_get_nth_const(page, nth));
+}
/************************************************************//**
Returns the middle record of the records on the page. If there is an
@@ -592,15 +591,11 @@ page_get_n_recs(
/*============*/
const page_t* page); /*!< in: index page */
-/***************************************************************//**
-Returns the number of records before the given record in chain.
-The number includes infimum and supremum records.
-This is the inverse function of page_rec_get_nth().
-@return number of records */
-ulint
-page_rec_get_n_recs_before(
-/*=======================*/
- const rec_t* rec); /*!< in: the physical record */
+/** Return the number of preceding records in an index page.
+@param rec index record
+@return number of preceding records, including the infimum pseudo-record
+@retval ULINT_UNDEFINED on corrupted page */
+ulint page_rec_get_n_recs_before(const rec_t *rec);
/*************************************************************//**
Gets the number of records in the heap.
@return number of user records */
@@ -649,6 +644,23 @@ inline const rec_t *page_dir_slot_get_rec(const page_dir_slot_t *slot)
{
return page_dir_slot_get_rec(const_cast<rec_t*>(slot));
}
+
+inline rec_t *page_dir_slot_get_rec_validate(page_dir_slot_t *slot)
+{
+ const size_t s= mach_read_from_2(my_assume_aligned<2>(slot));
+ page_t *page= page_align(slot);
+
+ return UNIV_LIKELY(s >= PAGE_NEW_INFIMUM &&
+ s <= page_header_get_field(page, PAGE_HEAP_TOP))
+ ? page + s
+ : nullptr;
+}
+inline const rec_t *page_dir_slot_get_rec_validate(const page_dir_slot_t *slot)
+{
+ return page_dir_slot_get_rec_validate(const_cast<rec_t*>(slot));
+}
+
+
/***************************************************************//**
Gets the number of records owned by a directory slot.
@return number of records */
@@ -669,7 +681,8 @@ page_dir_calc_reserved_space(
ulint n_recs); /*!< in: number of records */
/***************************************************************//**
Looks for the directory slot which owns the given record.
-@return the directory slot number */
+@return the directory slot number
+@retval ULINT_UNDEFINED on corruption */
ulint
page_dir_find_owner_slot(
/*=====================*/
@@ -752,19 +765,9 @@ page_rec_get_next_const(
/*====================*/
const rec_t* rec); /*!< in: pointer to record */
/************************************************************//**
-Gets the pointer to the next non delete-marked record on the page.
-If all subsequent records are delete-marked, then this function
-will return the supremum record.
-@return pointer to next non delete-marked record or pointer to supremum */
-UNIV_INLINE
-const rec_t*
-page_rec_get_next_non_del_marked(
-/*=============================*/
- const rec_t* rec); /*!< in: pointer to record */
-/************************************************************//**
Gets the pointer to the previous record.
-@return pointer to previous record */
-UNIV_INLINE
+@return pointer to previous record
+@retval nullptr on error */
const rec_t*
page_rec_get_prev_const(
/*====================*/
@@ -772,13 +775,13 @@ page_rec_get_prev_const(
infimum */
/************************************************************//**
Gets the pointer to the previous record.
-@return pointer to previous record */
-UNIV_INLINE
-rec_t*
-page_rec_get_prev(
-/*==============*/
- rec_t* rec); /*!< in: pointer to record,
- must not be page infimum */
+@param rec record (not page infimum)
+@return pointer to previous record
+@retval nullptr on error */
+inline rec_t *page_rec_get_prev(rec_t *rec)
+{
+ return const_cast<rec_t*>(page_rec_get_prev_const(rec));
+}
/************************************************************//**
true if the record is the first user record on a page.
@@ -792,17 +795,6 @@ page_rec_is_first(
MY_ATTRIBUTE((warn_unused_result));
/************************************************************//**
-true if the record is the second user record on a page.
-@return true if the second user record */
-UNIV_INLINE
-bool
-page_rec_is_second(
-/*===============*/
- const rec_t* rec, /*!< in: record */
- const page_t* page) /*!< in: page */
- MY_ATTRIBUTE((warn_unused_result));
-
-/************************************************************//**
true if the record is the last user record on a page.
@return true if the last user record */
UNIV_INLINE
@@ -814,33 +806,6 @@ page_rec_is_last(
MY_ATTRIBUTE((warn_unused_result));
/************************************************************//**
-true if distance between the records (measured in number of times we have to
-move to the next record) is at most the specified value
-@param[in] left_rec lefter record
-@param[in] right_rec righter record
-@param[in] val specified value to compare
-@return true if the distance is smaller than the value */
-UNIV_INLINE
-bool
-page_rec_distance_is_at_most(
-/*=========================*/
- const rec_t* left_rec,
- const rec_t* right_rec,
- ulint val)
- MY_ATTRIBUTE((warn_unused_result));
-
-/************************************************************//**
-true if the record is the second last user record on a page.
-@return true if the second last user record */
-UNIV_INLINE
-bool
-page_rec_is_second_last(
-/*====================*/
- const rec_t* rec, /*!< in: record */
- const page_t* page) /*!< in: page */
- MY_ATTRIBUTE((warn_unused_result));
-
-/************************************************************//**
Returns the maximum combined size of records which can be inserted on top
of record heap.
@return maximum combined size for inserted records */
@@ -930,6 +895,8 @@ page_create_empty(
dict_index_t* index, /*!< in: the index of the page */
mtr_t* mtr) /*!< in/out: mini-transaction */
MY_ATTRIBUTE((nonnull(1,2)));
+
+MY_ATTRIBUTE((nonnull, warn_unused_result))
/*************************************************************//**
Differs from page_copy_rec_list_end, because this function does not
touch the lock table and max trx id on page or compress the page.
@@ -937,8 +904,10 @@ touch the lock table and max trx id on page or compress the page.
IMPORTANT: The caller will have to update IBUF_BITMAP_FREE
if new_block is a compressed leaf page in a secondary index.
This has to be done either within the same mini-transaction,
-or by invoking ibuf_reset_free_bits() before mtr_commit(). */
-void
+or by invoking ibuf_reset_free_bits() before mtr_t::commit().
+
+@return error code */
+dberr_t
page_copy_rec_list_end_no_locks(
/*============================*/
buf_block_t* new_block, /*!< in: index page to copy to */
@@ -954,10 +923,10 @@ The records are copied to the start of the record list on new_page.
IMPORTANT: The caller will have to update IBUF_BITMAP_FREE
if new_block is a compressed leaf page in a secondary index.
This has to be done either within the same mini-transaction,
-or by invoking ibuf_reset_free_bits() before mtr_commit().
+or by invoking ibuf_reset_free_bits() before mtr_t::commit().
-@return pointer to the original successor of the infimum record on
-new_page, or NULL on zip overflow (new_block will be decompressed) */
+@return pointer to the original successor of the infimum record on new_block
+@retval nullptr on ROW_FORMAT=COMPRESSED page overflow */
rec_t*
page_copy_rec_list_end(
/*===================*/
@@ -965,8 +934,9 @@ page_copy_rec_list_end(
buf_block_t* block, /*!< in: index page containing rec */
rec_t* rec, /*!< in: record on page */
dict_index_t* index, /*!< in: record descriptor */
- mtr_t* mtr) /*!< in: mtr */
- MY_ATTRIBUTE((nonnull));
+ mtr_t* mtr, /*!< in/out: mini-transaction */
+ dberr_t* err) /*!< out: error code */
+ MY_ATTRIBUTE((nonnull(1,2,3,4,5), warn_unused_result));
/*************************************************************//**
Copies records from page to new_page, up to the given record, NOT
including that record. Infimum and supremum records are not copied.
@@ -977,8 +947,8 @@ if new_block is a compressed leaf page in a secondary index.
This has to be done either within the same mini-transaction,
or by invoking ibuf_reset_free_bits() before mtr_commit().
-@return pointer to the original predecessor of the supremum record on
-new_page, or NULL on zip overflow (new_block will be decompressed) */
+@return pointer to the original predecessor of the supremum record on new_block
+@retval nullptr on ROW_FORMAT=COMPRESSED page overflow */
rec_t*
page_copy_rec_list_start(
/*=====================*/
@@ -986,12 +956,13 @@ page_copy_rec_list_start(
buf_block_t* block, /*!< in: index page containing rec */
rec_t* rec, /*!< in: record on page */
dict_index_t* index, /*!< in: record descriptor */
- mtr_t* mtr) /*!< in: mtr */
- MY_ATTRIBUTE((nonnull));
+ mtr_t* mtr, /*!< in/out: mini-transaction */
+ dberr_t* err) /*!< out: error code */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
/*************************************************************//**
Deletes records from a page from a given record onward, including that record.
The infimum and supremum records are not deleted. */
-void
+dberr_t
page_delete_rec_list_end(
/*=====================*/
rec_t* rec, /*!< in: pointer to record on page */
@@ -1003,7 +974,7 @@ page_delete_rec_list_end(
records in the end of the chain to
delete, or ULINT_UNDEFINED if not known */
mtr_t* mtr) /*!< in: mtr */
- MY_ATTRIBUTE((nonnull));
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
/*************************************************************//**
Deletes records from page, up to the given record, NOT including
that record. Infimum and supremum records are not deleted. */
@@ -1015,45 +986,6 @@ page_delete_rec_list_start(
dict_index_t* index, /*!< in: record descriptor */
mtr_t* mtr) /*!< in: mtr */
MY_ATTRIBUTE((nonnull));
-/*************************************************************//**
-Moves record list end to another page. Moved records include
-split_rec.
-
-IMPORTANT: The caller will have to update IBUF_BITMAP_FREE
-if new_block is a compressed leaf page in a secondary index.
-This has to be done either within the same mini-transaction,
-or by invoking ibuf_reset_free_bits() before mtr_commit().
-
-@return TRUE on success; FALSE on compression failure (new_block will
-be decompressed) */
-ibool
-page_move_rec_list_end(
-/*===================*/
- buf_block_t* new_block, /*!< in/out: index page where to move */
- buf_block_t* block, /*!< in: index page from where to move */
- rec_t* split_rec, /*!< in: first record to move */
- dict_index_t* index, /*!< in: record descriptor */
- mtr_t* mtr) /*!< in: mtr */
- MY_ATTRIBUTE((nonnull(1, 2, 4, 5)));
-/*************************************************************//**
-Moves record list start to another page. Moved records do not include
-split_rec.
-
-IMPORTANT: The caller will have to update IBUF_BITMAP_FREE
-if new_block is a compressed leaf page in a secondary index.
-This has to be done either within the same mini-transaction,
-or by invoking ibuf_reset_free_bits() before mtr_commit().
-
-@return TRUE on success; FALSE on compression failure */
-ibool
-page_move_rec_list_start(
-/*=====================*/
- buf_block_t* new_block, /*!< in/out: index page where to move */
- buf_block_t* block, /*!< in/out: page containing split_rec */
- rec_t* split_rec, /*!< in: first record not to move */
- dict_index_t* index, /*!< in: record descriptor */
- mtr_t* mtr) /*!< in: mtr */
- MY_ATTRIBUTE((nonnull(1, 2, 4, 5)));
/** Create an index page.
@param[in,out] block buffer block
@param[in] comp nonzero=compact page format */
@@ -1160,9 +1092,7 @@ page_find_rec_with_heap_no(
@param[in] page index tree leaf page
@return the last record, not delete-marked
@retval infimum record if all records are delete-marked */
-const rec_t*
-page_find_rec_max_not_deleted(
- const page_t* page);
+const rec_t *page_find_rec_max_not_deleted(const page_t *page);
#endif /* !UNIV_INNOCHECKSUM */
diff --git a/storage/innobase/include/page0page.inl b/storage/innobase/include/page0page.inl
index 6514886dd67..6c0167edcf9 100644
--- a/storage/innobase/include/page0page.inl
+++ b/storage/innobase/include/page0page.inl
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 1994, 2019, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2016, 2021, MariaDB Corporation.
+Copyright (c) 2016, 2022, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -24,9 +24,6 @@ Index page routines
Created 2/2/1994 Heikki Tuuri
*******************************************************/
-#ifndef page0page_ic
-#define page0page_ic
-
#ifndef UNIV_INNOCHECKSUM
#include "rem0cmp.h"
#include "mtr0log.h"
@@ -87,7 +84,7 @@ page_set_ssn_id(
MTR_MEMO_PAGE_X_FIX));
ut_ad(!page_zip || page_zip == &block->page.zip);
constexpr uint16_t field= FIL_RTREE_SPLIT_SEQ_NUM;
- byte *b= my_assume_aligned<2>(&block->frame[field]);
+ byte *b= my_assume_aligned<2>(&block->page.frame[field]);
if (mtr->write<8,mtr_t::MAYBE_NOP>(*block, b, ssn_id) &&
UNIV_LIKELY_NULL(page_zip))
memcpy_aligned<2>(&page_zip->data[field], b, 8);
@@ -125,7 +122,7 @@ Reset PAGE_LAST_INSERT.
inline void page_header_reset_last_insert(buf_block_t *block, mtr_t *mtr)
{
constexpr uint16_t field= PAGE_HEADER + PAGE_LAST_INSERT;
- byte *b= my_assume_aligned<2>(&block->frame[field]);
+ byte *b= my_assume_aligned<2>(&block->page.frame[field]);
if (mtr->write<2,mtr_t::MAYBE_NOP>(*block, b, 0U) &&
UNIV_LIKELY_NULL(block->page.zip.data))
memset_aligned<2>(&block->page.zip.data[field], 0, 2);
@@ -196,22 +193,6 @@ page_rec_is_first(
}
/************************************************************//**
-true if the record is the second user record on a page.
-@return true if the second user record */
-UNIV_INLINE
-bool
-page_rec_is_second(
-/*===============*/
- const rec_t* rec, /*!< in: record */
- const page_t* page) /*!< in: page */
-{
- ut_ad(page_get_n_recs(page) > 1);
-
- return(page_rec_get_next_const(
- page_rec_get_next_const(page_get_infimum_rec(page))) == rec);
-}
-
-/************************************************************//**
true if the record is the last user record on a page.
@return true if the last user record */
UNIV_INLINE
@@ -227,57 +208,6 @@ page_rec_is_last(
}
/************************************************************//**
-true if distance between the records (measured in number of times we have to
-move to the next record) is at most the specified value */
-UNIV_INLINE
-bool
-page_rec_distance_is_at_most(
-/*=========================*/
- const rec_t* left_rec,
- const rec_t* right_rec,
- ulint val)
-{
- for (ulint i = 0; i <= val; i++) {
- if (left_rec == right_rec) {
- return (true);
- }
- left_rec = page_rec_get_next_const(left_rec);
- }
- return (false);
-}
-
-/************************************************************//**
-true if the record is the second last user record on a page.
-@return true if the second last user record */
-UNIV_INLINE
-bool
-page_rec_is_second_last(
-/*====================*/
- const rec_t* rec, /*!< in: record */
- const page_t* page) /*!< in: page */
-{
- ut_ad(page_get_n_recs(page) > 1);
- ut_ad(!page_rec_is_last(rec, page));
-
- return(page_rec_get_next_const(
- page_rec_get_next_const(rec)) == page_get_supremum_rec(page));
-}
-
-/************************************************************//**
-Returns the nth record of the record list.
-This is the inverse function of page_rec_get_n_recs_before().
-@return nth record */
-UNIV_INLINE
-rec_t*
-page_rec_get_nth(
-/*=============*/
- page_t* page, /*!< in: page */
- ulint nth) /*!< in: nth record */
-{
- return((rec_t*) page_rec_get_nth_const(page, nth));
-}
-
-/************************************************************//**
Returns the middle record of the records on the page. If there is an
even number of records in the list, returns the first record of the
upper half-list.
@@ -424,36 +354,19 @@ page_rec_get_next_low(
const rec_t* rec, /*!< in: pointer to record */
ulint comp) /*!< in: nonzero=compact page layout */
{
- ulint offs;
- const page_t* page;
-
- ut_ad(page_rec_check(rec));
-
- page = page_align(rec);
-
- offs = rec_get_next_offs(rec, comp);
-
- if (offs >= srv_page_size) {
- fprintf(stderr,
- "InnoDB: Next record offset is nonsensical %lu"
- " in record at offset %lu\n"
- "InnoDB: rec address %p, space id %lu, page %lu\n",
- (ulong) offs, (ulong) page_offset(rec),
- (void*) rec,
- (ulong) page_get_space_id(page),
- (ulong) page_get_page_no(page));
- ut_error;
- } else if (offs == 0) {
-
- return(NULL);
- }
-
- ut_ad(page_rec_is_infimum(rec)
- || (!page_is_leaf(page) && !page_has_prev(page))
- || !(rec_get_info_bits(page + offs, comp)
- & REC_INFO_MIN_REC_FLAG));
-
- return(page + offs);
+ const page_t *page= page_align(rec);
+ ut_ad(page_rec_check(rec));
+ ulint offs= rec_get_next_offs(rec, comp);
+ if (!offs)
+ return nullptr;
+ if (UNIV_UNLIKELY(offs < (comp ? PAGE_NEW_SUPREMUM : PAGE_OLD_SUPREMUM)))
+ return nullptr;
+ if (UNIV_UNLIKELY(offs > page_header_get_field(page, PAGE_HEAP_TOP)))
+ return nullptr;
+ ut_ad(page_rec_is_infimum(rec) ||
+ (!page_is_leaf(page) && !page_has_prev(page)) ||
+ !(rec_get_info_bits(page + offs, comp) & REC_INFO_MIN_REC_FLAG));
+ return page + offs;
}
/************************************************************//**
@@ -479,91 +392,6 @@ page_rec_get_next_const(
{
return(page_rec_get_next_low(rec, page_rec_is_comp(rec)));
}
-
-/************************************************************//**
-Gets the pointer to the next non delete-marked record on the page.
-If all subsequent records are delete-marked, then this function
-will return the supremum record.
-@return pointer to next non delete-marked record or pointer to supremum */
-UNIV_INLINE
-const rec_t*
-page_rec_get_next_non_del_marked(
-/*=============================*/
- const rec_t* rec) /*!< in: pointer to record */
-{
- const rec_t* r;
- ulint page_is_compact = page_rec_is_comp(rec);
-
- for (r = page_rec_get_next_const(rec);
- !page_rec_is_supremum(r)
- && rec_get_deleted_flag(r, page_is_compact);
- r = page_rec_get_next_const(r)) {
- /* noop */
- }
-
- return(r);
-}
-
-/************************************************************//**
-Gets the pointer to the previous record.
-@return pointer to previous record */
-UNIV_INLINE
-const rec_t*
-page_rec_get_prev_const(
-/*====================*/
- const rec_t* rec) /*!< in: pointer to record, must not be page
- infimum */
-{
- const page_dir_slot_t* slot;
- ulint slot_no;
- const rec_t* rec2;
- const rec_t* prev_rec = NULL;
- const page_t* page;
-
- ut_ad(page_rec_check(rec));
-
- page = page_align(rec);
-
- ut_ad(!page_rec_is_infimum(rec));
-
- slot_no = page_dir_find_owner_slot(rec);
-
- ut_a(slot_no != 0);
-
- slot = page_dir_get_nth_slot(page, slot_no - 1);
-
- rec2 = page_dir_slot_get_rec(slot);
-
- if (page_is_comp(page)) {
- while (rec != rec2) {
- prev_rec = rec2;
- rec2 = page_rec_get_next_low(rec2, TRUE);
- }
- } else {
- while (rec != rec2) {
- prev_rec = rec2;
- rec2 = page_rec_get_next_low(rec2, FALSE);
- }
- }
-
- ut_a(prev_rec);
-
- return(prev_rec);
-}
-
-/************************************************************//**
-Gets the pointer to the previous record.
-@return pointer to previous record */
-UNIV_INLINE
-rec_t*
-page_rec_get_prev(
-/*==============*/
- rec_t* rec) /*!< in: pointer to record, must not be page
- infimum */
-{
- return((rec_t*) page_rec_get_prev_const(rec));
-}
-
#endif /* UNIV_INNOCHECKSUM */
/************************************************************//**
@@ -720,5 +548,3 @@ page_get_instant(const page_t* page)
return static_cast<uint16_t>(i >> 3); /* i / 8 */
}
#endif /* !UNIV_INNOCHECKSUM */
-
-#endif
diff --git a/storage/innobase/include/page0types.h b/storage/innobase/include/page0types.h
index 6c5a681f3b5..83fc45cdfc4 100644
--- a/storage/innobase/include/page0types.h
+++ b/storage/innobase/include/page0types.h
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2019, 2020, MariaDB Corporation.
+Copyright (c) 2019, 2022, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -30,6 +30,7 @@ Created 2/2/1994 Heikki Tuuri
#include "dict0types.h"
#include "mtr0types.h"
#include "rem0types.h"
+#include "ut0new.h"
#include <map>
@@ -87,26 +88,52 @@ enum page_cur_mode_t {
PAGE_CUR_RTREE_GET_FATHER = 14
};
+class buf_pool_t;
+class buf_page_t;
+
/** Compressed page descriptor */
struct page_zip_des_t
{
page_zip_t* data; /*!< compressed page data */
-#ifdef UNIV_DEBUG
- unsigned m_start:16; /*!< start offset of modification log */
- bool m_external; /*!< Allocated externally, not from the
- buffer pool */
-#endif /* UNIV_DEBUG */
- unsigned m_end:16; /*!< end offset of modification log */
- unsigned m_nonempty:1; /*!< TRUE if the modification log
+ uint32_t m_end:16; /*!< end offset of modification log */
+ uint32_t m_nonempty:1; /*!< TRUE if the modification log
is not empty */
- unsigned n_blobs:12; /*!< number of externally stored
+ uint32_t n_blobs:12; /*!< number of externally stored
columns on the page; the maximum
is 744 on a 16 KiB page */
- unsigned ssize:PAGE_ZIP_SSIZE_BITS;
+ uint32_t ssize:PAGE_ZIP_SSIZE_BITS;
/*!< 0 or compressed page shift size;
the size in bytes is
(UNIV_ZIP_SIZE_MIN >> 1) << ssize. */
+#ifdef UNIV_DEBUG
+ uint16_t m_start; /*!< start offset of modification log */
+ bool m_external; /*!< Allocated externally, not from the
+ buffer pool */
+#endif /* UNIV_DEBUG */
+
+ void clear() {
+ /* Clear everything except the member "fix". */
+ memset((void*) this, 0,
+ reinterpret_cast<char*>(&fix)
+ - reinterpret_cast<char*>(this));
+ }
+
+ page_zip_des_t() = default;
+ page_zip_des_t(const page_zip_des_t&) = default;
+
+ /* Initialize everything except the member "fix". */
+ page_zip_des_t(const page_zip_des_t& old, bool) {
+ memcpy((void*) this, (void*) &old,
+ reinterpret_cast<char*>(&fix)
+ - reinterpret_cast<char*>(this));
+ }
+
+private:
+ friend buf_pool_t;
+ friend buf_page_t;
+ /** fix count and state used in buf_page_t */
+ Atomic_relaxed<uint32_t> fix;
};
/** Compression statistics for a given page size */
diff --git a/storage/innobase/include/page0zip.h b/storage/innobase/include/page0zip.h
index 5b98fdea004..4332990619e 100644
--- a/storage/innobase/include/page0zip.h
+++ b/storage/innobase/include/page0zip.h
@@ -2,7 +2,7 @@
Copyright (c) 2005, 2016, Oracle and/or its affiliates. All Rights Reserved.
Copyright (c) 2012, Facebook Inc.
-Copyright (c) 2017, 2020, MariaDB Corporation.
+Copyright (c) 2017, 2022, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -109,12 +109,7 @@ page_zip_is_too_big(
/**********************************************************************//**
Initialize a compressed page descriptor. */
-UNIV_INLINE
-void
-page_zip_des_init(
-/*==============*/
- page_zip_des_t* page_zip); /*!< in/out: compressed page
- descriptor */
+#define page_zip_des_init(page_zip) (page_zip)->clear()
/**********************************************************************//**
Configure the zlib allocator to use the given memory heap. */
@@ -332,9 +327,9 @@ IMPORTANT: if page_zip_reorganize() is invoked on a leaf page of a
non-clustered index, the caller must update the insert buffer free
bits in the same mini-transaction in such a way that the modification
will be redo-logged.
-@retval true on success
-@retval false on failure; the block_zip will be left intact */
-bool
+@return error code
+@retval DB_FAIL on overflow; the block_zip will be left intact */
+dberr_t
page_zip_reorganize(
buf_block_t* block, /*!< in/out: page with compressed page;
on the compressed page, in: size;
@@ -344,7 +339,7 @@ page_zip_reorganize(
ulint z_level,/*!< in: compression level */
mtr_t* mtr, /*!< in: mini-transaction */
bool restore = false)/*!< whether to restore on failure */
- MY_ATTRIBUTE((nonnull));
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
/**********************************************************************//**
Copy the records of a page byte for byte. Do not copy the page header
@@ -361,15 +356,11 @@ page_zip_copy_recs(
#endif /* !UNIV_INNOCHECKSUM */
/** Calculate the compressed page checksum.
-@param[in] data compressed page
-@param[in] size size of compressed page
-@param[in] algo algorithm to use
+@param data compressed page
+@param size size of compressed page
+@param use_adler whether to use Adler32 instead of a XOR of 3 CRC-32C
@return page checksum */
-uint32_t
-page_zip_calc_checksum(
- const void* data,
- ulint size,
- srv_checksum_algorithm_t algo);
+uint32_t page_zip_calc_checksum(const void *data, size_t size, bool use_adler);
/** Validate the checksum on a ROW_FORMAT=COMPRESSED page.
@param data ROW_FORMAT=COMPRESSED page
diff --git a/storage/innobase/include/page0zip.inl b/storage/innobase/include/page0zip.inl
index b0622ba79c3..afc877c3720 100644
--- a/storage/innobase/include/page0zip.inl
+++ b/storage/innobase/include/page0zip.inl
@@ -304,18 +304,6 @@ page_zip_available(
}
/**********************************************************************//**
-Initialize a compressed page descriptor. */
-UNIV_INLINE
-void
-page_zip_des_init(
-/*==============*/
- page_zip_des_t* page_zip) /*!< in/out: compressed page
- descriptor */
-{
- memset(page_zip, 0, sizeof *page_zip);
-}
-
-/**********************************************************************//**
Reset the counters used for filling
INFORMATION_SCHEMA.innodb_cmp_per_index. */
UNIV_INLINE
@@ -323,11 +311,7 @@ void
page_zip_reset_stat_per_index()
/*===========================*/
{
- mutex_enter(&page_zip_stat_per_index_mutex);
-
- page_zip_stat_per_index.erase(
- page_zip_stat_per_index.begin(),
- page_zip_stat_per_index.end());
-
- mutex_exit(&page_zip_stat_per_index_mutex);
+ mysql_mutex_lock(&page_zip_stat_per_index_mutex);
+ page_zip_stat_per_index.clear();
+ mysql_mutex_unlock(&page_zip_stat_per_index_mutex);
}
diff --git a/storage/innobase/include/pars0grm.h b/storage/innobase/include/pars0grm.h
index 58d424abfdc..e7112d9996f 100644
--- a/storage/innobase/include/pars0grm.h
+++ b/storage/innobase/include/pars0grm.h
@@ -1,8 +1,8 @@
-/* A Bison parser, made by GNU Bison 3.4.2. */
+/* A Bison parser, made by GNU Bison 3.7.6. */
/* Bison interface for Yacc-like parsers in C
- Copyright (C) 1984, 1989-1990, 2000-2015, 2018-2019 Free Software Foundation,
+ Copyright (C) 1984, 1989-1990, 2000-2015, 2018-2021 Free Software Foundation,
Inc.
This program is free software: you can redistribute it and/or modify
@@ -16,7 +16,7 @@
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
- along with this program. If not, see <http://www.gnu.org/licenses/>. */
+ along with this program. If not, see <https://www.gnu.org/licenses/>. */
/* As a special exception, you may create a larger work that contains
part or all of the Bison parser skeleton and distribute that work
@@ -31,8 +31,9 @@
This special exception was added by the Free Software Foundation in
version 2.2 of Bison. */
-/* Undocumented macros, especially those whose name start with YY_,
- are private implementation details. Do not rely on them. */
+/* DO NOT RELY ON FEATURES THAT ARE NOT DOCUMENTED in the manual,
+ especially those whose name start with YY_ or yy_. They are
+ private implementation details that can be changed or removed. */
#ifndef YY_YY_PARS0GRM_TAB_H_INCLUDED
# define YY_YY_PARS0GRM_TAB_H_INCLUDED
@@ -44,90 +45,95 @@
extern int yydebug;
#endif
-/* Token type. */
+/* Token kinds. */
#ifndef YYTOKENTYPE
# define YYTOKENTYPE
enum yytokentype
{
- PARS_INT_LIT = 258,
- PARS_FLOAT_LIT = 259,
- PARS_STR_LIT = 260,
- PARS_NULL_LIT = 261,
- PARS_ID_TOKEN = 262,
- PARS_AND_TOKEN = 263,
- PARS_OR_TOKEN = 264,
- PARS_NOT_TOKEN = 265,
- PARS_GE_TOKEN = 266,
- PARS_LE_TOKEN = 267,
- PARS_NE_TOKEN = 268,
- PARS_PROCEDURE_TOKEN = 269,
- PARS_IN_TOKEN = 270,
- PARS_INT_TOKEN = 271,
- PARS_CHAR_TOKEN = 272,
- PARS_IS_TOKEN = 273,
- PARS_BEGIN_TOKEN = 274,
- PARS_END_TOKEN = 275,
- PARS_IF_TOKEN = 276,
- PARS_THEN_TOKEN = 277,
- PARS_ELSE_TOKEN = 278,
- PARS_ELSIF_TOKEN = 279,
- PARS_LOOP_TOKEN = 280,
- PARS_WHILE_TOKEN = 281,
- PARS_RETURN_TOKEN = 282,
- PARS_SELECT_TOKEN = 283,
- PARS_COUNT_TOKEN = 284,
- PARS_FROM_TOKEN = 285,
- PARS_WHERE_TOKEN = 286,
- PARS_FOR_TOKEN = 287,
- PARS_DDOT_TOKEN = 288,
- PARS_ORDER_TOKEN = 289,
- PARS_BY_TOKEN = 290,
- PARS_ASC_TOKEN = 291,
- PARS_DESC_TOKEN = 292,
- PARS_INSERT_TOKEN = 293,
- PARS_INTO_TOKEN = 294,
- PARS_VALUES_TOKEN = 295,
- PARS_UPDATE_TOKEN = 296,
- PARS_SET_TOKEN = 297,
- PARS_DELETE_TOKEN = 298,
- PARS_CURRENT_TOKEN = 299,
- PARS_OF_TOKEN = 300,
- PARS_CREATE_TOKEN = 301,
- PARS_TABLE_TOKEN = 302,
- PARS_INDEX_TOKEN = 303,
- PARS_UNIQUE_TOKEN = 304,
- PARS_CLUSTERED_TOKEN = 305,
- PARS_ON_TOKEN = 306,
- PARS_ASSIGN_TOKEN = 307,
- PARS_DECLARE_TOKEN = 308,
- PARS_CURSOR_TOKEN = 309,
- PARS_SQL_TOKEN = 310,
- PARS_OPEN_TOKEN = 311,
- PARS_FETCH_TOKEN = 312,
- PARS_CLOSE_TOKEN = 313,
- PARS_NOTFOUND_TOKEN = 314,
- PARS_TO_BINARY_TOKEN = 315,
- PARS_SUBSTR_TOKEN = 316,
- PARS_CONCAT_TOKEN = 317,
- PARS_INSTR_TOKEN = 318,
- PARS_LENGTH_TOKEN = 319,
- PARS_COMMIT_TOKEN = 320,
- PARS_ROLLBACK_TOKEN = 321,
- PARS_WORK_TOKEN = 322,
- PARS_EXIT_TOKEN = 323,
- PARS_FUNCTION_TOKEN = 324,
- PARS_LOCK_TOKEN = 325,
- PARS_SHARE_TOKEN = 326,
- PARS_MODE_TOKEN = 327,
- PARS_LIKE_TOKEN = 328,
- PARS_LIKE_TOKEN_EXACT = 329,
- PARS_LIKE_TOKEN_PREFIX = 330,
- PARS_LIKE_TOKEN_SUFFIX = 331,
- PARS_LIKE_TOKEN_SUBSTR = 332,
- PARS_TABLE_NAME_TOKEN = 333,
- PARS_BIGINT_TOKEN = 334,
- NEG = 335
+ YYEMPTY = -2,
+ YYEOF = 0, /* "end of file" */
+ YYerror = 256, /* error */
+ YYUNDEF = 257, /* "invalid token" */
+ PARS_INT_LIT = 258, /* PARS_INT_LIT */
+ PARS_FLOAT_LIT = 259, /* PARS_FLOAT_LIT */
+ PARS_STR_LIT = 260, /* PARS_STR_LIT */
+ PARS_NULL_LIT = 261, /* PARS_NULL_LIT */
+ PARS_ID_TOKEN = 262, /* PARS_ID_TOKEN */
+ PARS_AND_TOKEN = 263, /* PARS_AND_TOKEN */
+ PARS_OR_TOKEN = 264, /* PARS_OR_TOKEN */
+ PARS_NOT_TOKEN = 265, /* PARS_NOT_TOKEN */
+ PARS_GE_TOKEN = 266, /* PARS_GE_TOKEN */
+ PARS_LE_TOKEN = 267, /* PARS_LE_TOKEN */
+ PARS_NE_TOKEN = 268, /* PARS_NE_TOKEN */
+ PARS_PROCEDURE_TOKEN = 269, /* PARS_PROCEDURE_TOKEN */
+ PARS_IN_TOKEN = 270, /* PARS_IN_TOKEN */
+ PARS_INT_TOKEN = 271, /* PARS_INT_TOKEN */
+ PARS_CHAR_TOKEN = 272, /* PARS_CHAR_TOKEN */
+ PARS_IS_TOKEN = 273, /* PARS_IS_TOKEN */
+ PARS_BEGIN_TOKEN = 274, /* PARS_BEGIN_TOKEN */
+ PARS_END_TOKEN = 275, /* PARS_END_TOKEN */
+ PARS_IF_TOKEN = 276, /* PARS_IF_TOKEN */
+ PARS_THEN_TOKEN = 277, /* PARS_THEN_TOKEN */
+ PARS_ELSE_TOKEN = 278, /* PARS_ELSE_TOKEN */
+ PARS_ELSIF_TOKEN = 279, /* PARS_ELSIF_TOKEN */
+ PARS_LOOP_TOKEN = 280, /* PARS_LOOP_TOKEN */
+ PARS_WHILE_TOKEN = 281, /* PARS_WHILE_TOKEN */
+ PARS_RETURN_TOKEN = 282, /* PARS_RETURN_TOKEN */
+ PARS_SELECT_TOKEN = 283, /* PARS_SELECT_TOKEN */
+ PARS_COUNT_TOKEN = 284, /* PARS_COUNT_TOKEN */
+ PARS_FROM_TOKEN = 285, /* PARS_FROM_TOKEN */
+ PARS_WHERE_TOKEN = 286, /* PARS_WHERE_TOKEN */
+ PARS_FOR_TOKEN = 287, /* PARS_FOR_TOKEN */
+ PARS_DDOT_TOKEN = 288, /* PARS_DDOT_TOKEN */
+ PARS_ORDER_TOKEN = 289, /* PARS_ORDER_TOKEN */
+ PARS_BY_TOKEN = 290, /* PARS_BY_TOKEN */
+ PARS_ASC_TOKEN = 291, /* PARS_ASC_TOKEN */
+ PARS_DESC_TOKEN = 292, /* PARS_DESC_TOKEN */
+ PARS_INSERT_TOKEN = 293, /* PARS_INSERT_TOKEN */
+ PARS_INTO_TOKEN = 294, /* PARS_INTO_TOKEN */
+ PARS_VALUES_TOKEN = 295, /* PARS_VALUES_TOKEN */
+ PARS_UPDATE_TOKEN = 296, /* PARS_UPDATE_TOKEN */
+ PARS_SET_TOKEN = 297, /* PARS_SET_TOKEN */
+ PARS_DELETE_TOKEN = 298, /* PARS_DELETE_TOKEN */
+ PARS_CURRENT_TOKEN = 299, /* PARS_CURRENT_TOKEN */
+ PARS_OF_TOKEN = 300, /* PARS_OF_TOKEN */
+ PARS_CREATE_TOKEN = 301, /* PARS_CREATE_TOKEN */
+ PARS_TABLE_TOKEN = 302, /* PARS_TABLE_TOKEN */
+ PARS_INDEX_TOKEN = 303, /* PARS_INDEX_TOKEN */
+ PARS_UNIQUE_TOKEN = 304, /* PARS_UNIQUE_TOKEN */
+ PARS_CLUSTERED_TOKEN = 305, /* PARS_CLUSTERED_TOKEN */
+ PARS_ON_TOKEN = 306, /* PARS_ON_TOKEN */
+ PARS_ASSIGN_TOKEN = 307, /* PARS_ASSIGN_TOKEN */
+ PARS_DECLARE_TOKEN = 308, /* PARS_DECLARE_TOKEN */
+ PARS_CURSOR_TOKEN = 309, /* PARS_CURSOR_TOKEN */
+ PARS_SQL_TOKEN = 310, /* PARS_SQL_TOKEN */
+ PARS_OPEN_TOKEN = 311, /* PARS_OPEN_TOKEN */
+ PARS_FETCH_TOKEN = 312, /* PARS_FETCH_TOKEN */
+ PARS_CLOSE_TOKEN = 313, /* PARS_CLOSE_TOKEN */
+ PARS_NOTFOUND_TOKEN = 314, /* PARS_NOTFOUND_TOKEN */
+ PARS_TO_BINARY_TOKEN = 315, /* PARS_TO_BINARY_TOKEN */
+ PARS_SUBSTR_TOKEN = 316, /* PARS_SUBSTR_TOKEN */
+ PARS_CONCAT_TOKEN = 317, /* PARS_CONCAT_TOKEN */
+ PARS_INSTR_TOKEN = 318, /* PARS_INSTR_TOKEN */
+ PARS_LENGTH_TOKEN = 319, /* PARS_LENGTH_TOKEN */
+ PARS_COMMIT_TOKEN = 320, /* PARS_COMMIT_TOKEN */
+ PARS_ROLLBACK_TOKEN = 321, /* PARS_ROLLBACK_TOKEN */
+ PARS_WORK_TOKEN = 322, /* PARS_WORK_TOKEN */
+ PARS_EXIT_TOKEN = 323, /* PARS_EXIT_TOKEN */
+ PARS_FUNCTION_TOKEN = 324, /* PARS_FUNCTION_TOKEN */
+ PARS_LOCK_TOKEN = 325, /* PARS_LOCK_TOKEN */
+ PARS_SHARE_TOKEN = 326, /* PARS_SHARE_TOKEN */
+ PARS_MODE_TOKEN = 327, /* PARS_MODE_TOKEN */
+ PARS_LIKE_TOKEN = 328, /* PARS_LIKE_TOKEN */
+ PARS_LIKE_TOKEN_EXACT = 329, /* PARS_LIKE_TOKEN_EXACT */
+ PARS_LIKE_TOKEN_PREFIX = 330, /* PARS_LIKE_TOKEN_PREFIX */
+ PARS_LIKE_TOKEN_SUFFIX = 331, /* PARS_LIKE_TOKEN_SUFFIX */
+ PARS_LIKE_TOKEN_SUBSTR = 332, /* PARS_LIKE_TOKEN_SUBSTR */
+ PARS_TABLE_NAME_TOKEN = 333, /* PARS_TABLE_NAME_TOKEN */
+ PARS_BIGINT_TOKEN = 334, /* PARS_BIGINT_TOKEN */
+ NEG = 335 /* NEG */
};
+ typedef enum yytokentype yytoken_kind_t;
#endif
/* Value type. */
diff --git a/storage/innobase/include/pars0pars.h b/storage/innobase/include/pars0pars.h
index 4c588dca061..16823ce1461 100644
--- a/storage/innobase/include/pars0pars.h
+++ b/storage/innobase/include/pars0pars.h
@@ -367,19 +367,8 @@ pars_procedure_definition(
table */
que_node_t* stat_list); /*!< in: statement list */
-/*************************************************************//**
-Parses a stored procedure call, when this is not within another stored
-procedure, that is, the client issues a procedure call directly.
-In MySQL/InnoDB, stored InnoDB procedures are invoked via the
-parsed procedure tree, not via InnoDB SQL, so this function is not used.
-@return query graph */
-que_fork_t*
-pars_stored_procedure_call(
-/*=======================*/
- sym_node_t* sym_node); /*!< in: stored procedure name */
/** Completes a query graph by adding query thread and fork nodes
-above it and prepares the graph for running. The fork created is of
-type QUE_FORK_MYSQL_INTERFACE.
+above it and prepares the graph for running.
@param[in] node root node for an incomplete query
graph, or NULL for dummy graph
@param[in] trx transaction handle
@@ -402,13 +391,6 @@ pars_info_create(void);
/*==================*/
/****************************************************************//**
-Free info struct and everything it contains. */
-void
-pars_info_free(
-/*===========*/
- pars_info_t* info); /*!< in, own: info struct */
-
-/****************************************************************//**
Add bound literal. */
void
pars_info_add_literal(
@@ -570,11 +552,10 @@ struct pars_info_t {
(pars_bound_lit_t*) */
ib_vector_t* bound_ids; /*!< bound ids, or NULL
(pars_bound_id_t*) */
-
- ibool graph_owns_us; /*!< if TRUE (which is the default),
- que_graph_free() will free us */
};
+inline void pars_info_free(pars_info_t *info) { mem_heap_free(info->heap); }
+
/** User-supplied function and argument. */
struct pars_user_func_t {
const char* name; /*!< function name */
diff --git a/storage/innobase/include/que0que.h b/storage/innobase/include/que0que.h
index 962bd359f0b..c60f390a092 100644
--- a/storage/innobase/include/que0que.h
+++ b/storage/innobase/include/que0que.h
@@ -38,15 +38,7 @@ Created 5/27/1996 Heikki Tuuri
/***********************************************************************//**
Creates a query graph fork node.
@return own: fork node */
-que_fork_t*
-que_fork_create(
-/*============*/
- que_t* graph, /*!< in: graph, if NULL then this
- fork node is assumed to be the
- graph root */
- que_node_t* parent, /*!< in: parent node */
- ulint fork_type, /*!< in: fork type */
- mem_heap_t* heap); /*!< in: memory heap where created */
+que_fork_t *que_fork_create(mem_heap_t* heap);
/***********************************************************************//**
Gets the first thr in a fork. */
UNIV_INLINE
@@ -96,43 +88,14 @@ que_graph_free(
to this graph: if not, then use
que_graph_free_recursive and free the heap
afterwards! */
-/**********************************************************************//**
-Stops a query thread if graph or trx is in a state requiring it. The
-conditions are tested in the order (1) graph, (2) trx. The lock_sys_t::mutex
-has to be reserved.
-@return TRUE if stopped */
-ibool
-que_thr_stop(
-/*=========*/
- que_thr_t* thr); /*!< in: query thread */
/**********************************************************************//**
-A patch for MySQL used to 'stop' a dummy query thread used in MySQL. The
-query thread is stopped and made inactive, except in the case where
-it was put to the lock wait state in lock0lock.cc, but the lock has already
-been granted or the transaction chosen as a victim in deadlock resolution. */
-void
-que_thr_stop_for_mysql(
-/*===================*/
- que_thr_t* thr); /*!< in: query thread */
-/**********************************************************************//**
Run a query thread. Handles lock waits. */
void
que_run_threads(
/*============*/
que_thr_t* thr); /*!< in: query thread */
/**********************************************************************//**
-Moves a suspended query thread to the QUE_THR_RUNNING state and release
-a worker thread to execute it. This function should be used to end
-the wait state of a query thread waiting for a lock or a stored procedure
-completion.
-@return query thread instance of thread to wakeup or NULL */
-que_thr_t*
-que_thr_end_lock_wait(
-/*==================*/
- trx_t* trx); /*!< in: transaction in the
- QUE_THR_LOCK_WAIT state */
-/**********************************************************************//**
Starts execution of a command in a query fork. Picks a query thread which
is not in the QUE_THR_RUNNING state and moves it to that state. If none
can be chosen, a situation which may arise in parallelized fetches, NULL
@@ -236,31 +199,6 @@ ulint
que_node_list_get_len(
/*==================*/
que_node_t* node_list); /*!< in: node list, or NULL */
-/**********************************************************************//**
-Checks if graph, trx, or session is in a state where the query thread should
-be stopped.
-@return TRUE if should be stopped; NOTE that if the peek is made
-without reserving the trx_t::mutex, then another peek with the mutex
-reserved is necessary before deciding the actual stopping */
-UNIV_INLINE
-ibool
-que_thr_peek_stop(
-/*==============*/
- que_thr_t* thr); /*!< in: query thread */
-/***********************************************************************//**
-Returns TRUE if the query graph is for a SELECT statement.
-@return TRUE if a select */
-UNIV_INLINE
-ibool
-que_graph_is_select(
-/*================*/
- que_t* graph); /*!< in: graph */
-/**********************************************************************//**
-Prints info of an SQL query graph node. */
-void
-que_node_print_info(
-/*================*/
- que_node_t* node); /*!< in: query graph node */
/*********************************************************************//**
Evaluate the given SQL
@return error code or DB_SUCCESS */
@@ -269,9 +207,6 @@ que_eval_sql(
/*=========*/
pars_info_t* info, /*!< in: info struct, or NULL */
const char* sql, /*!< in: SQL string */
- bool reserve_dict_mutex,
- /*!< in: whether to acquire/release
- dict_sys.mutex around call to pars_sql. */
trx_t* trx); /*!< in: trx */
/**********************************************************************//**
@@ -287,14 +222,11 @@ que_fork_scheduler_round_robin(
/** Query thread states */
enum que_thr_state_t {
- QUE_THR_RUNNING,
/** in selects this means that the thread is at the end of its
result set (or start, in case of a scroll cursor); in other
statements, this means the thread has done its task */
QUE_THR_COMPLETED,
- QUE_THR_COMMAND_WAIT,
- QUE_THR_LOCK_WAIT,
- QUE_THR_SUSPENDED
+ QUE_THR_RUNNING
};
/** Query thread lock states */
@@ -312,7 +244,6 @@ struct que_thr_t{
que_node_t* child; /*!< graph child node */
que_t* graph; /*!< graph where this node belongs */
que_thr_state_t state; /*!< state of the query thread */
- bool is_active; /*!< whether the thread is active */
/*------------------------------*/
/* The following fields are private to the OS thread executing the
query thread, and are not protected by any mutex: */
@@ -326,9 +257,6 @@ struct que_thr_t{
thus far */
ulint lock_state; /*!< lock state of thread (table or
row) */
- struct srv_slot_t*
- slot; /* The thread slot in the wait
- array in srv_sys_t */
/*------------------------------*/
/* The following fields are links for the various lists that
this type can be on. */
@@ -343,40 +271,12 @@ struct que_thr_t{
related delete/updates */
row_prebuilt_t* prebuilt; /*!< prebuilt structure processed by
the query thread */
-
-#ifdef UNIV_DEBUG
- /** Change the 'active' status */
- inline void set_active(bool active);
-#endif
- /** Transition to the QUE_THR_RUNNING state. */
- inline void start_running()
- {
- ut_d(if (!is_active) set_active(true));
- is_active= true;
- state= QUE_THR_RUNNING;
- }
-
- /** Stop query execution when there is no error or lock wait. */
- void stop_no_error()
- {
- ut_ad(is_active);
- ut_d(set_active(false));
- state= QUE_THR_COMPLETED;
- is_active= false;
- }
};
/* Query graph fork node: its fields are protected by the query thread mutex */
struct que_fork_t{
que_common_t common; /*!< type: QUE_NODE_FORK */
que_t* graph; /*!< query graph of this node */
- ulint fork_type; /*!< fork type */
-#ifdef UNIV_DEBUG
- /** For the query graph root, updated in set_active() */
- ulint n_active_thrs;
- /** Change the 'active' status */
- void set_active(bool active);
-#endif
trx_t* trx; /*!< transaction: this is set only in
the root node */
ulint state; /*!< state of the fork node */
@@ -402,30 +302,9 @@ struct que_fork_t{
};
-#ifdef UNIV_DEBUG
-inline void que_thr_t::set_active(bool active) { graph->set_active(active); };
-#endif
-
-/* Query fork (or graph) types */
-#define QUE_FORK_SELECT_NON_SCROLL 1 /* forward-only cursor */
-#define QUE_FORK_SELECT_SCROLL 2 /* scrollable cursor */
-#define QUE_FORK_INSERT 3
-#define QUE_FORK_UPDATE 4
-#define QUE_FORK_ROLLBACK 5
- /* This is really the undo graph used in rollback,
- no signal-sending roll_node in this graph */
-#define QUE_FORK_PURGE 6
-#define QUE_FORK_EXECUTE 7
-#define QUE_FORK_PROCEDURE 8
-#define QUE_FORK_PROCEDURE_CALL 9
-#define QUE_FORK_MYSQL_INTERFACE 10
-#define QUE_FORK_RECOVERY 11
-
/* Query fork (or graph) states */
#define QUE_FORK_ACTIVE 1
#define QUE_FORK_COMMAND_WAIT 2
-#define QUE_FORK_INVALID 3
-#define QUE_FORK_BEING_FREED 4
/* Flag which is ORed to control structure statement node types */
#define QUE_NODE_CONTROL_STAT 1024
diff --git a/storage/innobase/include/que0que.inl b/storage/innobase/include/que0que.inl
index 1c3ac242bf2..e21cbad3815 100644
--- a/storage/innobase/include/que0que.inl
+++ b/storage/innobase/include/que0que.inl
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 1996, 2013, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2020, MariaDB Corporation.
+Copyright (c) 2020, 2021, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -243,51 +243,3 @@ que_node_get_parent(
{
return(((que_common_t*) node)->parent);
}
-
-/**********************************************************************//**
-Checks if graph, trx, or session is in a state where the query thread should
-be stopped.
-@return TRUE if should be stopped; NOTE that if the peek is made
-without reserving the trx mutex, then another peek with the mutex
-reserved is necessary before deciding the actual stopping */
-UNIV_INLINE
-ibool
-que_thr_peek_stop(
-/*==============*/
- que_thr_t* thr) /*!< in: query thread */
-{
- trx_t* trx;
- que_t* graph;
-
- graph = thr->graph;
- trx = graph->trx;
-
- if (graph->state != QUE_FORK_ACTIVE
- || trx->lock.que_state == TRX_QUE_LOCK_WAIT
- || (trx->lock.que_state != TRX_QUE_ROLLING_BACK
- && trx->lock.que_state != TRX_QUE_RUNNING)) {
-
- return(TRUE);
- }
-
- return(FALSE);
-}
-
-/***********************************************************************//**
-Returns TRUE if the query graph is for a SELECT statement.
-@return TRUE if a select */
-UNIV_INLINE
-ibool
-que_graph_is_select(
-/*================*/
- que_t* graph) /*!< in: graph */
-{
- if (graph->fork_type == QUE_FORK_SELECT_SCROLL
- || graph->fork_type == QUE_FORK_SELECT_NON_SCROLL) {
-
- return(TRUE);
- }
-
- return(FALSE);
-}
-
diff --git a/storage/innobase/include/read0types.h b/storage/innobase/include/read0types.h
index 21143ab609d..e002f1b77e1 100644
--- a/storage/innobase/include/read0types.h
+++ b/storage/innobase/include/read0types.h
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 1997, 2016, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2018, 2020, MariaDB Corporation.
+Copyright (c) 2018, 2022, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -24,14 +24,13 @@ Cursor read
Created 2/16/1997 Heikki Tuuri
*******************************************************/
-#ifndef read0types_h
-#define read0types_h
+#pragma once
#include "dict0mem.h"
#include "trx0types.h"
+#include "srw_lock.h"
#include <algorithm>
-
/**
Read view lists the trx ids of those transactions for which a consistent read
should not see the modifications to the database.
@@ -42,7 +41,7 @@ class ReadViewBase
The read should not see any transaction with trx id >= this value.
In other words, this is the "high water mark".
*/
- trx_id_t m_low_limit_id;
+ trx_id_t m_low_limit_id= 0;
/**
The read should see all trx ids which are strictly
@@ -68,9 +67,6 @@ protected:
trx_id_t up_limit_id() const { return m_up_limit_id; }
public:
- ReadViewBase(): m_low_limit_id(0) {}
-
-
/**
Append state from another view.
@@ -126,39 +122,20 @@ loop:
/**
- Check whether transaction id is valid.
- @param[in] id transaction id to check
- @param[in] name table name
-
- @todo changes_visible() was an unfortunate choice for this check.
- It should be moved towards the functions that load trx id like
- trx_read_trx_id(). No need to issue a warning, error log message should
- be enough. Although statement should ideally fail if it sees corrupt
- data.
- */
- static void check_trx_id_sanity(trx_id_t id, const table_name_t &name);
-
-
- /**
Check whether the changes by id are visible.
@param[in] id transaction id to check against the view
- @param[in] name table name
@return whether the view sees the modifications of id.
*/
- bool changes_visible(trx_id_t id, const table_name_t &name) const
+ bool changes_visible(trx_id_t id) const
MY_ATTRIBUTE((warn_unused_result))
{
if (id >= m_low_limit_id)
- {
- check_trx_id_sanity(id, name);
return false;
- }
return id < m_up_limit_id ||
m_ids.empty() ||
!std::binary_search(m_ids.begin(), m_ids.end(), id);
}
-
/**
@param id transaction to check
@return true if view sees transaction id
@@ -170,6 +147,13 @@ loop:
/** @return the low limit id */
trx_id_t low_limit_id() const { return m_low_limit_id; }
+
+ /** Clamp the low limit id for purge_sys.end_view */
+ void clamp_low_limit_id(trx_id_t limit)
+ {
+ if (m_low_limit_id > limit)
+ m_low_limit_id= limit;
+ }
};
@@ -190,7 +174,7 @@ class ReadView: public ReadViewBase
std::atomic<bool> m_open;
/** For synchronisation with purge coordinator. */
- mutable ib_mutex_t m_mutex;
+ mutable srw_mutex m_mutex;
/**
trx id of creating transaction.
@@ -199,8 +183,12 @@ class ReadView: public ReadViewBase
trx_id_t m_creator_trx_id;
public:
- ReadView(): m_open(false) { mutex_create(LATCH_ID_READ_VIEW, &m_mutex); }
- ~ReadView() { mutex_free(&m_mutex); }
+ ReadView()
+ {
+ memset(reinterpret_cast<void*>(this), 0, sizeof *this);
+ m_mutex.init();
+ }
+ ~ReadView() { m_mutex.destroy(); }
/**
@@ -236,7 +224,6 @@ public:
*/
void set_creator_trx_id(trx_id_t id)
{
- ut_ad(id > 0);
ut_ad(m_creator_trx_id == 0);
m_creator_trx_id= id;
}
@@ -248,12 +235,12 @@ public:
*/
void print_limits(FILE *file) const
{
- mutex_enter(&m_mutex);
+ m_mutex.wr_lock();
if (is_open())
fprintf(file, "Trx read view will not see trx with"
" id >= " TRX_ID_FMT ", sees < " TRX_ID_FMT "\n",
low_limit_id(), up_limit_id());
- mutex_exit(&m_mutex);
+ m_mutex.wr_unlock();
}
@@ -261,9 +248,8 @@ public:
A wrapper around ReadViewBase::changes_visible().
Intended to be called by the ReadView owner thread.
*/
- bool changes_visible(trx_id_t id, const table_name_t &name) const
- { return id == m_creator_trx_id || ReadViewBase::changes_visible(id, name); }
-
+ bool changes_visible(trx_id_t id) const
+ { return id == m_creator_trx_id || ReadViewBase::changes_visible(id); }
/**
A wrapper around ReadViewBase::append().
@@ -271,23 +257,19 @@ public:
*/
void append_to(ReadViewBase *to) const
{
- mutex_enter(&m_mutex);
+ m_mutex.wr_lock();
if (is_open())
to->append(*this);
- mutex_exit(&m_mutex);
+ m_mutex.wr_unlock();
}
-
/**
Declare the object mostly unaccessible.
- innodb_monitor_set_option is operating also on freed transaction objects.
*/
void mem_noaccess() const
{
MEM_NOACCESS(&m_open, sizeof m_open);
- /* m_mutex is accessed by innodb_show_mutex_status()
- and innodb_monitor_update() even after trx_t::free() */
+ /* m_mutex is accessed via trx_sys.rw_trx_hash */
MEM_NOACCESS(&m_creator_trx_id, sizeof m_creator_trx_id);
}
};
-#endif
diff --git a/storage/innobase/include/rem0rec.h b/storage/innobase/include/rem0rec.h
index a179c313235..2f038ab349f 100644
--- a/storage/innobase/include/rem0rec.h
+++ b/storage/innobase/include/rem0rec.h
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2017, 2021, MariaDB Corporation.
+Copyright (c) 2017, 2022, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -141,28 +141,7 @@ constexpr rec_offs REC_OFFS_EXTERNAL= REC_OFFS_COMPACT >> 1;
/** Default value flag in offsets returned by rec_get_offsets() */
constexpr rec_offs REC_OFFS_DEFAULT= REC_OFFS_COMPACT >> 2;
constexpr rec_offs REC_OFFS_MASK= REC_OFFS_DEFAULT - 1;
-/******************************************************//**
-The following function is used to get the pointer of the next chained record
-on the same page.
-@return pointer to the next chained record, or NULL if none */
-UNIV_INLINE
-const rec_t*
-rec_get_next_ptr_const(
-/*===================*/
- const rec_t* rec, /*!< in: physical record */
- ulint comp) /*!< in: nonzero=compact page format */
- MY_ATTRIBUTE((warn_unused_result));
-/******************************************************//**
-The following function is used to get the pointer of the next chained record
-on the same page.
-@return pointer to the next chained record, or NULL if none */
-UNIV_INLINE
-rec_t*
-rec_get_next_ptr(
-/*=============*/
- rec_t* rec, /*!< in: physical record */
- ulint comp) /*!< in: nonzero=compact page format */
- MY_ATTRIBUTE((warn_unused_result));
+
/******************************************************//**
The following function is used to get the offset of the
next chained record on the same page.
@@ -727,11 +706,9 @@ in the clustered index for instant ADD COLUMN or ALTER TABLE.
@param[in] rec leaf page record
@param[in] index index of the record
@return whether the record is the metadata pseudo-record */
-inline bool rec_is_metadata(const rec_t* rec, const dict_index_t& index)
+inline bool rec_is_metadata(const rec_t *rec, const dict_index_t &index)
{
- bool is = rec_is_metadata(rec, dict_table_is_comp(index.table));
- ut_ad(!is || index.is_instant());
- return is;
+ return rec_is_metadata(rec, index.table->not_redundant());
}
/** Determine if the record is the metadata pseudo-record
diff --git a/storage/innobase/include/rem0rec.inl b/storage/innobase/include/rem0rec.inl
index 30c72a7415a..46c209cbdec 100644
--- a/storage/innobase/include/rem0rec.inl
+++ b/storage/innobase/include/rem0rec.inl
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 1994, 2019, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2017, 2020, MariaDB Corporation.
+Copyright (c) 2017, 2022, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -204,76 +204,6 @@ rec_set_bit_field_2(
}
/******************************************************//**
-The following function is used to get the pointer of the next chained record
-on the same page.
-@return pointer to the next chained record, or NULL if none */
-UNIV_INLINE
-const rec_t*
-rec_get_next_ptr_const(
-/*===================*/
- const rec_t* rec, /*!< in: physical record */
- ulint comp) /*!< in: nonzero=compact page format */
-{
- ulint field_value;
-
- compile_time_assert(REC_NEXT_MASK == 0xFFFFUL);
- compile_time_assert(REC_NEXT_SHIFT == 0);
-
- field_value = mach_read_from_2(rec - REC_NEXT);
-
- if (field_value == 0) {
-
- return(NULL);
- }
-
- if (comp) {
-#if UNIV_PAGE_SIZE_MAX <= 32768
- /* Note that for 64 KiB pages, field_value can 'wrap around'
- and the debug assertion is not valid */
-
- /* In the following assertion, field_value is interpreted
- as signed 16-bit integer in 2's complement arithmetics.
- If all platforms defined int16_t in the standard headers,
- the expression could be written simpler as
- (int16_t) field_value + ut_align_offset(...) < srv_page_size
- */
- ut_ad((field_value >= 32768
- ? field_value - 65536
- : field_value)
- + ut_align_offset(rec, srv_page_size)
- < srv_page_size);
-#endif
- /* There must be at least REC_N_NEW_EXTRA_BYTES + 1
- between each record. */
- ut_ad((field_value > REC_N_NEW_EXTRA_BYTES
- && field_value < 32768)
- || field_value < (uint16) -REC_N_NEW_EXTRA_BYTES);
-
- return((byte*) ut_align_down(rec, srv_page_size)
- + ut_align_offset(rec + field_value, srv_page_size));
- } else {
- ut_ad(field_value < srv_page_size);
-
- return((byte*) ut_align_down(rec, srv_page_size)
- + field_value);
- }
-}
-
-/******************************************************//**
-The following function is used to get the pointer of the next chained record
-on the same page.
-@return pointer to the next chained record, or NULL if none */
-UNIV_INLINE
-rec_t*
-rec_get_next_ptr(
-/*=============*/
- rec_t* rec, /*!< in: physical record */
- ulint comp) /*!< in: nonzero=compact page format */
-{
- return(const_cast<rec_t*>(rec_get_next_ptr_const(rec, comp)));
-}
-
-/******************************************************//**
The following function is used to get the offset of the next chained record
on the same page.
@return the page offset of the next chained record, or 0 if none */
diff --git a/storage/innobase/include/row0ftsort.h b/storage/innobase/include/row0ftsort.h
index 99c85601d5d..65508caf751 100644
--- a/storage/innobase/include/row0ftsort.h
+++ b/storage/innobase/include/row0ftsort.h
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 2010, 2016, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2015, 2019, MariaDB Corporation.
+Copyright (c) 2015, 2021, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -65,7 +65,7 @@ struct fts_psort_common_t {
ulint old_zip_size;
trx_t* trx; /*!< transaction */
fts_psort_t* all_info; /*!< all parallel sort info */
- os_event_t sort_event; /*!< sort event */
+ pthread_cond_t sort_cond; /*!< sort completion */
ibool opt_doc_id_size;/*!< whether to use 4 bytes
instead of 8 bytes integer to
store Doc ID during sort, if
@@ -90,7 +90,7 @@ struct fts_psort_t {
tpool::waitable_task* task; /*!< threadpool task */
dberr_t error; /*!< db error during psort */
ulint memory_used; /*!< memory used by fts_doc_list */
- ib_mutex_t mutex; /*!< mutex for fts_doc_list */
+ mysql_mutex_t mutex; /*!< mutex for fts_doc_list */
};
/** Row fts token for plugin parser */
@@ -152,7 +152,6 @@ typedef struct fts_psort_insert fts_psort_insert_t;
#define FTS_PARENT_COMPLETE 1
#define FTS_PARENT_EXITING 2
#define FTS_CHILD_COMPLETE 1
-#define FTS_CHILD_EXITING 2
/** Print some debug information */
#define FTSORT_PRINT
diff --git a/storage/innobase/include/row0ins.h b/storage/innobase/include/row0ins.h
index 75db0ad04b2..ac2479c4863 100644
--- a/storage/innobase/include/row0ins.h
+++ b/storage/innobase/include/row0ins.h
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2017, 2020, MariaDB Corporation.
+Copyright (c) 2017, 2022, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -78,7 +78,7 @@ dberr_t
row_ins_clust_index_entry_low(
/*==========================*/
ulint flags, /*!< in: undo logging and locking flags */
- ulint mode, /*!< in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE,
+ btr_latch_mode mode, /*!< in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE,
depending on whether we wish optimistic or
pessimistic descent down the index tree */
dict_index_t* index, /*!< in: clustered index */
@@ -94,13 +94,13 @@ same fields is found, the other record is necessarily marked deleted.
It is then unmarked. Otherwise, the entry is just inserted to the index.
@retval DB_SUCCESS on success
@retval DB_LOCK_WAIT on lock wait when !(flags & BTR_NO_LOCKING_FLAG)
-@retval DB_FAIL if retry with BTR_MODIFY_TREE is needed
+@retval DB_FAIL if retry with BTR_INSERT_TREE is needed
@return error code */
dberr_t
row_ins_sec_index_entry_low(
/*========================*/
ulint flags, /*!< in: undo logging and locking flags */
- ulint mode, /*!< in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE,
+ btr_latch_mode mode, /*!< in: BTR_MODIFY_LEAF or BTR_INSERT_TREE,
depending on whether we wish optimistic or
pessimistic descent down the index tree */
dict_index_t* index, /*!< in: secondary index */
diff --git a/storage/innobase/include/row0log.h b/storage/innobase/include/row0log.h
index 978a3f906c0..469f1f8a356 100644
--- a/storage/innobase/include/row0log.h
+++ b/storage/innobase/include/row0log.h
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 2011, 2016, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2017, 2021, MariaDB Corporation.
+Copyright (c) 2017, 2022, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -24,15 +24,15 @@ Modification log for online index creation and online table rebuild
Created 2011-05-26 Marko Makela
*******************************************************/
-#ifndef row0log_h
-#define row0log_h
+#pragma once
#include "que0types.h"
#include "mtr0types.h"
#include "row0types.h"
#include "rem0types.h"
-#include "data0types.h"
+#include "dict0dict.h"
#include "trx0types.h"
+#include "trx0undo.h"
class ut_stage_alter_t;
@@ -74,37 +74,23 @@ row_log_free(
/******************************************************//**
Free the row log for an index on which online creation was aborted. */
-UNIV_INLINE
-void
-row_log_abort_sec(
-/*==============*/
- dict_index_t* index) /*!< in/out: index (x-latched) */
- MY_ATTRIBUTE((nonnull));
-
-/******************************************************//**
-Try to log an operation to a secondary index that is
-(or was) being created.
-@retval true if the operation was logged or can be ignored
-@retval false if online index creation is not taking place */
-UNIV_INLINE
-bool
-row_log_online_op_try(
-/*==================*/
- dict_index_t* index, /*!< in/out: index, S or X latched */
- const dtuple_t* tuple, /*!< in: index tuple */
- trx_id_t trx_id) /*!< in: transaction ID for insert,
- or 0 for delete */
- MY_ATTRIBUTE((nonnull, warn_unused_result));
-/******************************************************//**
-Logs an operation to a secondary index that is (or was) being created. */
-void
-row_log_online_op(
-/*==============*/
- dict_index_t* index, /*!< in/out: index, S or X latched */
- const dtuple_t* tuple, /*!< in: index tuple */
- trx_id_t trx_id) /*!< in: transaction ID for insert,
- or 0 for delete */
- ATTRIBUTE_COLD __attribute__((nonnull));
+inline void row_log_abort_sec(dict_index_t *index)
+{
+ ut_ad(index->lock.have_u_or_x());
+ ut_ad(!index->is_clust());
+ dict_index_set_online_status(index, ONLINE_INDEX_ABORTED);
+ row_log_free(index->online_log);
+ index->online_log= nullptr;
+}
+
+/** Logs an operation to a secondary index that is (or was) being created.
+@param index index, S or X latched
+@param tuple index tuple
+@param trx_id transaction ID for insert, or 0 for delete
+@retval false if row_log_apply() failure happens
+or true otherwise */
+bool row_log_online_op(dict_index_t *index, const dtuple_t *tuple,
+ trx_id_t trx_id) ATTRIBUTE_COLD;
/******************************************************//**
Gets the error status of the online index rebuild log.
@@ -185,22 +171,6 @@ row_log_table_insert(
dict_index_t* index, /*!< in/out: clustered index, S-latched
or X-latched */
const rec_offs* offsets);/*!< in: rec_get_offsets(rec,index) */
-/******************************************************//**
-Notes that a BLOB is being freed during online ALTER TABLE. */
-void
-row_log_table_blob_free(
-/*====================*/
- dict_index_t* index, /*!< in/out: clustered index, X-latched */
- ulint page_no)/*!< in: starting page number of the BLOB */
- ATTRIBUTE_COLD __attribute__((nonnull));
-/******************************************************//**
-Notes that a BLOB is being allocated during online ALTER TABLE. */
-void
-row_log_table_blob_alloc(
-/*=====================*/
- dict_index_t* index, /*!< in/out: clustered index, X-latched */
- ulint page_no)/*!< in: starting page number of the BLOB */
- ATTRIBUTE_COLD __attribute__((nonnull));
/** Apply the row_log_table log to a table upon completing rebuild.
@param[in] thr query graph
@@ -252,6 +222,11 @@ row_log_apply(
@return number of n_core_fields */
unsigned row_log_get_n_core_fields(const dict_index_t *index);
+/** Get the error code of online log for the index
+@param index online index
+@return error code present in online log */
+dberr_t row_log_get_error(const dict_index_t *index);
+
#ifdef HAVE_PSI_STAGE_INTERFACE
/** Estimate how much work is to be done by the log apply phase
of an ALTER TABLE for this index.
@@ -262,7 +237,3 @@ ulint
row_log_estimate_work(
const dict_index_t* index);
#endif /* HAVE_PSI_STAGE_INTERFACE */
-
-#include "row0log.inl"
-
-#endif /* row0log.h */
diff --git a/storage/innobase/include/row0merge.h b/storage/innobase/include/row0merge.h
index 1d7f9bb145b..52096d48313 100644
--- a/storage/innobase/include/row0merge.h
+++ b/storage/innobase/include/row0merge.h
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 2005, 2016, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2015, 2020, MariaDB Corporation.
+Copyright (c) 2015, 2022, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -145,28 +145,6 @@ row_merge_dup_report(
const dfield_t* entry) /*!< in: duplicate index entry */
MY_ATTRIBUTE((nonnull));
-/*********************************************************************//**
-Sets an exclusive lock on a table, for the duration of creating indexes.
-@return error code or DB_SUCCESS */
-dberr_t
-row_merge_lock_table(
-/*=================*/
- trx_t* trx, /*!< in/out: transaction */
- dict_table_t* table, /*!< in: table to lock */
- enum lock_mode mode) /*!< in: LOCK_X or LOCK_S */
- MY_ATTRIBUTE((nonnull(1,2), warn_unused_result));
-
-/*********************************************************************//**
-Drop indexes that were created before an error occurred.
-The data dictionary must have been locked exclusively by the caller,
-because the transaction will not be committed. */
-void
-row_merge_drop_indexes_dict(
-/*========================*/
- trx_t* trx, /*!< in/out: dictionary transaction */
- table_id_t table_id)/*!< in: table identifier */
- MY_ATTRIBUTE((nonnull));
-
/** Drop indexes that were created before an error occurred.
The data dictionary must have been locked exclusively by the caller,
because the transaction will not be committed.
@@ -182,11 +160,9 @@ row_merge_drop_indexes(
bool locked,
const trx_t* alter_trx=NULL);
-/*********************************************************************//**
-Drop all partially created indexes during crash recovery. */
-void
-row_merge_drop_temp_indexes(void);
-/*=============================*/
+/** During recovery, drop recovered index stubs that were created in
+prepare_inplace_alter_table_dict(). */
+void row_merge_drop_temp_indexes();
/** Create temporary merge files in the given paramater path, and if
UNIV_PFS_IO defined, register the file descriptor with Performance Schema.
@@ -217,19 +193,6 @@ row_merge_rename_index_to_add(
index_id_t index_id) /*!< in: index identifier */
MY_ATTRIBUTE((nonnull(1), warn_unused_result));
-/*********************************************************************//**
-Rename an index in the dictionary that is to be dropped. The data
-dictionary must have been locked exclusively by the caller, because
-the transaction will not be committed.
-@return DB_SUCCESS if all OK */
-dberr_t
-row_merge_rename_index_to_drop(
-/*===========================*/
- trx_t* trx, /*!< in/out: transaction */
- table_id_t table_id, /*!< in: table identifier */
- index_id_t index_id) /*!< in: index identifier */
- MY_ATTRIBUTE((nonnull(1), warn_unused_result));
-
/** Create the index and load in to the dictionary.
@param[in,out] table the index is on this table
@param[in] index_def the index definition
@@ -253,18 +216,10 @@ row_merge_is_index_usable(
const dict_index_t* index) /*!< in: index to check */
MY_ATTRIBUTE((nonnull, warn_unused_result));
-/*********************************************************************//**
-Drop a table. The caller must have ensured that the background stats
-thread is not processing the table. This can be done by calling
-dict_stats_wait_bg_to_stop_using_table() after locking the dictionary and
-before calling this function.
-@return DB_SUCCESS or error code */
-dberr_t
-row_merge_drop_table(
-/*=================*/
- trx_t* trx, /*!< in: transaction */
- dict_table_t* table) /*!< in: table instance to drop */
- MY_ATTRIBUTE((nonnull, warn_unused_result));
+/** Map from column numbers to column definitions that include
+changes to the collation, when the encoding is compatible with
+the original column and no table rebuild is needed */
+typedef std::map<unsigned, dict_col_t*> col_collations;
/** Build indexes on a table by reading a clustered index, creating a temporary
file containing index entries, merge sorting these index entries and inserting
@@ -294,6 +249,7 @@ this function and it will be passed to other functions for further accounting.
@param[in] eval_table mysql table used to evaluate virtual column
value, see innobase_get_computed_value().
@param[in] allow_non_null allow the conversion from null to not-null
+@param[in] col_collate columns whose collations changed, or nullptr
@return DB_SUCCESS or error code */
dberr_t
row_merge_build_indexes(
@@ -313,7 +269,8 @@ row_merge_build_indexes(
ut_stage_alter_t* stage,
const dict_add_v_col_t* add_v,
struct TABLE* eval_table,
- bool allow_non_null)
+ bool allow_non_null,
+ const col_collations* col_collate)
MY_ATTRIBUTE((warn_unused_result));
/********************************************************************//**
@@ -341,10 +298,8 @@ Write a merge block to the file system.
@return whether the request was completed successfully
@retval false on error
@retval true on success */
-UNIV_INTERN
bool
row_merge_write(
-/*============*/
const pfs_os_file_t& fd, /*!< in: file descriptor */
ulint offset, /*!< in: offset where to write,
in number of row_merge_block_t elements */
diff --git a/storage/innobase/include/row0mysql.h b/storage/innobase/include/row0mysql.h
index eb90ec0f04c..a9f1c87d600 100644
--- a/storage/innobase/include/row0mysql.h
+++ b/storage/innobase/include/row0mysql.h
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 2000, 2017, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2017, 2021, MariaDB Corporation.
+Copyright (c) 2017, 2022, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -37,11 +37,6 @@ Created 9/17/2000 Heikki Tuuri
#include "fts0fts.h"
#include "gis0type.h"
-#include "sql_list.h"
-#include "sql_cmd.h"
-
-extern ibool row_rollback_on_timeout;
-
struct row_prebuilt_t;
class ha_innobase;
@@ -187,13 +182,8 @@ row_create_prebuilt(
dict_table_t* table, /*!< in: Innobase table handle */
ulint mysql_row_len); /*!< in: length in bytes of a row in
the MySQL format */
-/********************************************************************//**
-Free a prebuilt struct for a MySQL table handle. */
-void
-row_prebuilt_free(
-/*==============*/
- row_prebuilt_t* prebuilt, /*!< in, own: prebuilt struct */
- ibool dict_locked); /*!< in: TRUE=data dictionary locked */
+/** Free a prebuilt struct for a TABLE handle. */
+void row_prebuilt_free(row_prebuilt_t *prebuilt);
/*********************************************************************//**
Updates the transaction pointers in query graphs stored in the prebuilt
struct. */
@@ -273,7 +263,7 @@ row_update_for_mysql(
/** This can only be used when the current transaction is at
READ COMMITTED or READ UNCOMMITTED isolation level.
-Before calling this function row_search_for_mysql() must have
+Before calling this function row_search_mvcc() must have
initialized prebuilt->new_rec_locks to store the information which new
record locks really were set. This function removes a newly set
clustered index record lock under prebuilt->pcur or
@@ -310,40 +300,24 @@ row_update_cascade_for_mysql(
or set null operation */
dict_table_t* table) /*!< in: table where we do the operation */
MY_ATTRIBUTE((nonnull, warn_unused_result));
-/*********************************************************************//**
-Locks the data dictionary exclusively for performing a table create or other
-data dictionary modification operation. */
-void
-row_mysql_lock_data_dictionary_func(
-/*================================*/
- trx_t* trx, /*!< in/out: transaction */
- const char* file, /*!< in: file name */
- unsigned line); /*!< in: line number */
-#define row_mysql_lock_data_dictionary(trx) \
- row_mysql_lock_data_dictionary_func(trx, __FILE__, __LINE__)
-/*********************************************************************//**
-Unlocks the data dictionary exclusive lock. */
-void
-row_mysql_unlock_data_dictionary(
-/*=============================*/
- trx_t* trx); /*!< in/out: transaction */
-/*********************************************************************//**
-Locks the data dictionary in shared mode from modifications, for performing
-foreign key check, rollback, or other operation invisible to MySQL. */
-void
-row_mysql_freeze_data_dictionary_func(
-/*==================================*/
- trx_t* trx, /*!< in/out: transaction */
- const char* file, /*!< in: file name */
- unsigned line); /*!< in: line number */
-#define row_mysql_freeze_data_dictionary(trx) \
- row_mysql_freeze_data_dictionary_func(trx, __FILE__, __LINE__)
-/*********************************************************************//**
-Unlocks the data dictionary shared lock. */
-void
-row_mysql_unfreeze_data_dictionary(
-/*===============================*/
- trx_t* trx); /*!< in/out: transaction */
+
+/** Lock the data dictionary cache exclusively. */
+#define row_mysql_lock_data_dictionary(trx) \
+ do { \
+ ut_ad(!trx->dict_operation_lock_mode); \
+ dict_sys.lock(SRW_LOCK_CALL); \
+ trx->dict_operation_lock_mode = true; \
+ } while (0)
+
+/** Unlock the data dictionary. */
+#define row_mysql_unlock_data_dictionary(trx) \
+ do { \
+ ut_ad(!lock_trx_has_sys_table_locks(trx)); \
+ ut_ad(trx->dict_operation_lock_mode); \
+ trx->dict_operation_lock_mode = false; \
+ dict_sys.unlock(); \
+ } while (0)
+
/*********************************************************************//**
Creates a table for MySQL. On failure the transaction will be rolled back
and the 'table' object will be freed.
@@ -354,9 +328,7 @@ row_create_table_for_mysql(
dict_table_t* table, /*!< in, own: table definition
(will be freed, or on DB_SUCCESS
added to the data dictionary cache) */
- trx_t* trx, /*!< in/out: transaction */
- fil_encryption_t mode, /*!< in: encryption mode */
- uint32_t key_id) /*!< in: encryption key_id */
+ trx_t* trx) /*!< in/out: transaction */
MY_ATTRIBUTE((warn_unused_result));
/*********************************************************************//**
@@ -369,78 +341,22 @@ row_create_index_for_mysql(
dict_index_t* index, /*!< in, own: index definition
(will be freed) */
trx_t* trx, /*!< in: transaction handle */
- const ulint* field_lengths) /*!< in: if not NULL, must contain
+ const ulint* field_lengths, /*!< in: if not NULL, must contain
dict_index_get_n_fields(index)
actual field lengths for the
index columns, which are
then checked for not being too
large. */
+ fil_encryption_t mode, /*!< in: encryption mode */
+ uint32_t key_id) /*!< in: encryption key_id */
MY_ATTRIBUTE((warn_unused_result));
-/*********************************************************************//**
-The master thread in srv0srv.cc calls this regularly to drop tables which
-we must drop in background after queries to them have ended. Such lazy
-dropping of tables is needed in ALTER TABLE on Unix.
-@return how many tables dropped + remaining tables in list */
-ulint
-row_drop_tables_for_mysql_in_background(void);
-/*=========================================*/
-/*********************************************************************//**
-Get the background drop list length. NOTE: the caller must own the kernel
-mutex!
-@return how many tables in list */
-ulint
-row_get_background_drop_list_len_low(void);
-/*======================================*/
-
-/** Drop garbage tables during recovery. */
-void
-row_mysql_drop_garbage_tables();
-
-/*********************************************************************//**
-Sets an exclusive lock on a table.
-@return error code or DB_SUCCESS */
-dberr_t
-row_mysql_lock_table(
-/*=================*/
- trx_t* trx, /*!< in/out: transaction */
- dict_table_t* table, /*!< in: table to lock */
- enum lock_mode mode, /*!< in: LOCK_X or LOCK_S */
- const char* op_info) /*!< in: string for trx->op_info */
- MY_ATTRIBUTE((nonnull, warn_unused_result));
-
-/** Drop a table.
-If the data dictionary was not already locked by the transaction,
-the transaction will be committed. Otherwise, the data dictionary
-will remain locked.
-@param[in] name Table name
-@param[in,out] trx Transaction handle
-@param[in] sqlcom type of SQL operation
-@param[in] create_failed true=create table failed
- because e.g. foreign key column
-@param[in] nonatomic Whether it is permitted to release
- and reacquire dict_sys.latch
-@return error code */
-dberr_t
-row_drop_table_for_mysql(
- const char* name,
- trx_t* trx,
- enum_sql_command sqlcom,
- bool create_failed = false,
- bool nonatomic = true);
-
-/** Drop a table after failed CREATE TABLE. */
-dberr_t row_drop_table_after_create_fail(const char* name, trx_t* trx);
/*********************************************************************//**
Discards the tablespace of a table which stored in an .ibd file. Discarding
means that this function deletes the .ibd file and assigns a new table id for
the table. Also the file_unreadable flag is set.
@return error code or DB_SUCCESS */
-dberr_t
-row_discard_tablespace_for_mysql(
-/*=============================*/
- const char* name, /*!< in: table name */
- trx_t* trx) /*!< in: transaction handle */
+dberr_t row_discard_tablespace_for_mysql(dict_table_t *table, trx_t *trx)
MY_ATTRIBUTE((nonnull, warn_unused_result));
/*****************************************************************//**
Imports a tablespace. The space id in the .ibd file must match the space id
@@ -453,17 +369,6 @@ row_import_tablespace_for_mysql(
row_prebuilt_t* prebuilt) /*!< in: prebuilt struct in MySQL */
MY_ATTRIBUTE((nonnull, warn_unused_result));
-/** Drop a database for MySQL.
-@param[in] name database name which ends at '/'
-@param[in] trx transaction handle
-@param[out] found number of dropped tables/partitions
-@return error code or DB_SUCCESS */
-dberr_t
-row_drop_database_for_mysql(
- const char* name,
- trx_t* trx,
- ulint* found);
-
/*********************************************************************//**
Renames a table for MySQL.
@return error code or DB_SUCCESS */
@@ -473,38 +378,10 @@ row_rename_table_for_mysql(
const char* old_name, /*!< in: old table name */
const char* new_name, /*!< in: new table name */
trx_t* trx, /*!< in/out: transaction */
- bool commit, /*!< in: whether to commit trx */
bool use_fk) /*!< in: whether to parse and enforce
FOREIGN KEY constraints */
MY_ATTRIBUTE((nonnull, warn_unused_result));
-/*********************************************************************//**
-Scans an index for either COOUNT(*) or CHECK TABLE.
-If CHECK TABLE; Checks that the index contains entries in an ascending order,
-unique constraint is not broken, and calculates the number of index entries
-in the read view of the current transaction.
-@return DB_SUCCESS or other error */
-dberr_t
-row_scan_index_for_mysql(
-/*=====================*/
- row_prebuilt_t* prebuilt, /*!< in: prebuilt struct
- in MySQL handle */
- const dict_index_t* index, /*!< in: index */
- ulint* n_rows) /*!< out: number of entries
- seen in the consistent read */
- MY_ATTRIBUTE((warn_unused_result));
-/*********************************************************************//**
-Initialize this module */
-void
-row_mysql_init(void);
-/*================*/
-
-/*********************************************************************//**
-Close this module */
-void
-row_mysql_close(void);
-/*=================*/
-
/* A struct describing a place for an individual column in the MySQL
row format which is presented to the table handler in ha_innobase.
This template struct is used to speed up row transformations between
@@ -686,6 +563,7 @@ struct row_prebuilt_t {
dtuple_t* clust_ref; /*!< prebuilt dtuple used in
sel/upd/del */
lock_mode select_lock_type;/*!< LOCK_NONE, LOCK_S, or LOCK_X */
+ bool skip_locked; /*!< TL_{READ,WRITE}_SKIP_LOCKED */
lock_mode stored_select_lock_type;/*!< this field is used to
remember the original select_lock_type
that was decided in ha_innodb.cc,
@@ -712,7 +590,7 @@ struct row_prebuilt_t {
ROW_READ_TRY_SEMI_CONSISTENT and
to simply skip the row. If
the row matches, the next call to
- row_search_for_mysql() will lock
+ row_search_mvcc() will lock
the row.
This eliminates lock waits in some
cases; note that this breaks
@@ -721,7 +599,7 @@ struct row_prebuilt_t {
the session is using READ
COMMITTED or READ UNCOMMITTED
isolation level, set in
- row_search_for_mysql() if we set a new
+ row_search_mvcc() if we set a new
record lock on the secondary
or clustered index; this is
used in row_unlock_for_mysql()
@@ -861,9 +739,8 @@ struct VCOL_STORAGE
@return TRUE malloc failure
*/
-bool innobase_allocate_row_for_vcol(
- THD * thd,
- dict_index_t* index,
+bool innobase_allocate_row_for_vcol(THD *thd,
+ const dict_index_t* index,
mem_heap_t** heap,
TABLE** table,
VCOL_STORAGE* storage);
@@ -879,17 +756,13 @@ public:
ib_vcol_row(mem_heap_t *heap) : heap(heap) {}
- byte *record(THD *thd, dict_index_t *index, TABLE **table)
+ byte *record(THD *thd, const dict_index_t *index, TABLE **table)
{
- if (!storage.innobase_record)
- {
- bool ok = innobase_allocate_row_for_vcol(thd, index, &heap, table,
- &storage);
- if (!ok)
- return NULL;
- }
+ if (!storage.innobase_record &&
+ !innobase_allocate_row_for_vcol(thd, index, &heap, table, &storage))
+ return nullptr;
return storage.innobase_record;
- };
+ }
~ib_vcol_row()
{
@@ -958,7 +831,7 @@ innobase_rename_vc_templ(
#define ROW_MYSQL_REC_FIELDS 1
#define ROW_MYSQL_NO_TEMPLATE 2
#define ROW_MYSQL_DUMMY_TEMPLATE 3 /* dummy template used in
- row_scan_and_check_index */
+ row_check_index() */
/* Values for hint_need_to_fetch_extra_cols */
#define ROW_RETRIEVE_PRIMARY_KEY 1
@@ -969,10 +842,4 @@ innobase_rename_vc_templ(
#define ROW_READ_TRY_SEMI_CONSISTENT 1
#define ROW_READ_DID_SEMI_CONSISTENT 2
-#ifdef UNIV_DEBUG
-/** Wait for the background drop list to become empty. */
-void
-row_wait_for_background_drop_list_empty();
-#endif /* UNIV_DEBUG */
-
#endif /* row0mysql.h */
diff --git a/storage/innobase/include/row0purge.h b/storage/innobase/include/row0purge.h
index 091d80adec5..b1390fd1ef1 100644
--- a/storage/innobase/include/row0purge.h
+++ b/storage/innobase/include/row0purge.h
@@ -72,9 +72,8 @@ row_purge_poss_sec(
bool is_tree=false);
/***************************************************************
-Does the purge operation for a single undo log record. This is a high-level
-function used in an SQL execution graph.
-@return query thread to run next or NULL */
+Does the purge operation.
+@return query thread to run next */
que_thr_t*
row_purge_step(
/*===========*/
@@ -198,21 +197,7 @@ public:
}
/** Start processing an undo log record. */
- void start()
- {
- ut_ad(in_progress);
- DBUG_ASSERT(common.type == QUE_NODE_PURGE);
-
- row= nullptr;
- ref= nullptr;
- index= nullptr;
- update= nullptr;
- found_clust= FALSE;
- rec_type= ULINT_UNDEFINED;
- cmpl_info= ULINT_UNDEFINED;
- if (!purge_thd)
- purge_thd= current_thd;
- }
+ inline void start();
/** Close the existing table and release the MDL for it. */
@@ -226,7 +211,7 @@ public:
}
innobase_reset_background_thd(purge_thd);
- dict_table_close(table, false, false, purge_thd, mdl_ticket);
+ dict_table_close(table, false, purge_thd, mdl_ticket);
table= nullptr;
mdl_ticket= nullptr;
}
@@ -253,16 +238,7 @@ public:
/** Reset the state at end
@return the query graph parent */
- que_node_t* end()
- {
- DBUG_ASSERT(common.type == QUE_NODE_PURGE);
- close_table();
- ut_ad(undo_recs.empty());
- ut_d(in_progress= false);
- purge_thd= nullptr;
- mem_heap_empty(heap);
- return common.parent;
- }
+ inline que_node_t *end();
};
#endif
diff --git a/storage/innobase/include/row0row.h b/storage/innobase/include/row0row.h
index 1e0fdc65238..a1350740e2a 100644
--- a/storage/innobase/include/row0row.h
+++ b/storage/innobase/include/row0row.h
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2016, 2020, MariaDB Corporation.
+Copyright (c) 2016, 2022, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -303,13 +303,13 @@ row_build_row_ref_fast(
/***************************************************************//**
Searches the clustered index record for a row, if we have the row
reference.
-@return TRUE if found */
-ibool
+@return true if found */
+bool
row_search_on_row_ref(
/*==================*/
btr_pcur_t* pcur, /*!< out: persistent cursor, which must
be closed by the caller */
- ulint mode, /*!< in: BTR_MODIFY_LEAF, ... */
+ btr_latch_mode mode, /*!< in: BTR_MODIFY_LEAF, ... */
const dict_table_t* table, /*!< in: table */
const dtuple_t* ref, /*!< in: row reference */
mtr_t* mtr) /*!< in/out: mtr */
@@ -321,7 +321,7 @@ on the secondary index record are preserved.
rec_t*
row_get_clust_rec(
/*==============*/
- ulint mode, /*!< in: BTR_MODIFY_LEAF, ... */
+ btr_latch_mode mode, /*!< in: BTR_MODIFY_LEAF, ... */
const rec_t* rec, /*!< in: record in a secondary index */
dict_index_t* index, /*!< in: secondary index */
dict_index_t** clust_index,/*!< out: clustered index */
@@ -363,9 +363,8 @@ Searches an index record.
enum row_search_result
row_search_index_entry(
/*===================*/
- dict_index_t* index, /*!< in: index */
const dtuple_t* entry, /*!< in: index entry */
- ulint mode, /*!< in: BTR_MODIFY_LEAF, ... */
+ btr_latch_mode mode, /*!< in: BTR_MODIFY_LEAF, ... */
btr_pcur_t* pcur, /*!< in/out: persistent cursor, which must
be closed by the caller */
mtr_t* mtr) /*!< in: mtr */
diff --git a/storage/innobase/include/row0sel.h b/storage/innobase/include/row0sel.h
index eb83a4bcad6..8134c60fe72 100644
--- a/storage/innobase/include/row0sel.h
+++ b/storage/innobase/include/row0sel.h
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 1997, 2017, Oracle and/or its affiliates.
-Copyright (c) 2017, MariaDB Corporation.
+Copyright (c) 2017, 2022, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -24,8 +24,7 @@ Select
Created 12/19/1997 Heikki Tuuri
*******************************************************/
-#ifndef row0sel_h
-#define row0sel_h
+#pragma once
#include "data0data.h"
#include "que0types.h"
@@ -58,15 +57,6 @@ void
sel_col_prefetch_buf_free(
/*======================*/
sel_buf_t* prefetch_buf); /*!< in, own: prefetch buffer */
-/*********************************************************************//**
-Gets the plan node for the nth table in a join.
-@return plan node */
-UNIV_INLINE
-plan_t*
-sel_node_get_nth_plan(
-/*==================*/
- sel_node_t* node, /*!< in: select node */
- ulint i); /*!< in: get ith plan node */
/**********************************************************************//**
Performs a select step. This is a high-level function used in SQL execution
graphs.
@@ -76,14 +66,6 @@ row_sel_step(
/*=========*/
que_thr_t* thr); /*!< in: query thread */
/**********************************************************************//**
-Performs an execution step of an open or close cursor statement node.
-@return query thread to run next or NULL */
-UNIV_INLINE
-que_thr_t*
-open_step(
-/*======*/
- que_thr_t* thr); /*!< in: query thread */
-/**********************************************************************//**
Performs a fetch for a cursor.
@return query thread to run next or NULL */
que_thr_t*
@@ -136,37 +118,7 @@ row_sel_convert_mysql_key_to_innobase(
ulint key_len); /*!< in: MySQL key value length */
-/** Searches for rows in the database. This is used in the interface to
-MySQL. This function opens a cursor, and also implements fetch next
-and fetch prev. NOTE that if we do a search with a full key value
-from a unique index (ROW_SEL_EXACT), then we will not store the cursor
-position and fetch next or fetch prev must not be tried to the cursor!
-
-@param[out] buf buffer for the fetched row in MySQL format
-@param[in] mode search mode PAGE_CUR_L
-@param[in,out] prebuilt prebuilt struct for the table handler;
- this contains the info to search_tuple,
- index; if search tuple contains 0 field then
- we position the cursor at start or the end of
- index, depending on 'mode'
-@param[in] match_mode 0 or ROW_SEL_EXACT or ROW_SEL_EXACT_PREFIX
-@param[in] direction 0 or ROW_SEL_NEXT or ROW_SEL_PREV;
- Note: if this is != 0, then prebuilt must has a
- pcur with stored position! In opening of a
- cursor 'direction' should be 0.
-@return DB_SUCCESS, DB_RECORD_NOT_FOUND, DB_END_OF_INDEX, DB_DEADLOCK,
-DB_LOCK_TABLE_FULL, DB_CORRUPTION, or DB_TOO_BIG_RECORD */
-UNIV_INLINE
-dberr_t
-row_search_for_mysql(
- byte* buf,
- page_cur_mode_t mode,
- row_prebuilt_t* prebuilt,
- ulint match_mode,
- ulint direction)
- MY_ATTRIBUTE((warn_unused_result));
-
-/** Searches for rows in the database using cursor.
+/** Search for rows in the database using cursor.
Function is mainly used for tables that are shared across connections and
so it employs technique that can help re-construct the rows that
transaction is suppose to see.
@@ -184,7 +136,8 @@ It also has optimization such as pre-caching the rows, using AHI, etc.
Note: if this is != 0, then prebuilt must has a
pcur with stored position! In opening of a
cursor 'direction' should be 0.
-@return DB_SUCCESS or error code */
+@return DB_SUCCESS, DB_RECORD_NOT_FOUND, DB_END_OF_INDEX, DB_DEADLOCK,
+DB_LOCK_TABLE_FULL, DB_CORRUPTION, or DB_TOO_BIG_RECORD */
dberr_t
row_search_mvcc(
byte* buf,
@@ -210,6 +163,21 @@ row_count_rtree_recs(
ulint* n_rows); /*!< out: number of entries
seen in the consistent read */
+/**
+Check the index records in CHECK TABLE.
+The index must contain entries in an ascending order,
+unique constraint must not be violated by duplicated keys,
+and the number of index entries is counted according to the
+current read view.
+
+@param prebuilt index and transaction
+@param n_rows number of records counted
+
+@return error code
+@retval DB_SUCCESS if no error was found */
+dberr_t row_check_index(row_prebuilt_t *prebuilt, ulint *n_rows)
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+
/** Read the max AUTOINC value from an index.
@param[in] index index starting with an AUTO_INCREMENT column
@return the largest AUTO_INCREMENT value
@@ -382,6 +350,17 @@ struct sel_node_t{
fetches */
};
+/**
+Get the plan node for a table in a join.
+@param node query graph node for SELECT
+@param i plan node element
+@return ith plan node */
+inline plan_t *sel_node_get_nth_plan(sel_node_t *node, ulint i)
+{
+ ut_ad(i < node->n_tables);
+ return &node->plans[i];
+}
+
/** Fetch statement node */
struct fetch_node_t{
que_common_t common; /*!< type: QUE_NODE_FETCH */
@@ -476,7 +455,3 @@ row_sel_field_store_in_mysql_format_func(
#endif /* UNIV_DEBUG */
const byte* data, /*!< in: data to store */
ulint len); /*!< in: length of the data */
-
-#include "row0sel.inl"
-
-#endif
diff --git a/storage/innobase/include/row0upd.h b/storage/innobase/include/row0upd.h
index 59ed14aeff6..f60fc3595dc 100644
--- a/storage/innobase/include/row0upd.h
+++ b/storage/innobase/include/row0upd.h
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 1996, 2018, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2017, 2020, MariaDB Corporation.
+Copyright (c) 2017, 2022, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -118,14 +118,6 @@ row_upd_changes_field_size_or_external(
dict_index_t* index, /*!< in: index */
const rec_offs* offsets,/*!< in: rec_get_offsets(rec, index) */
const upd_t* update);/*!< in: update vector */
-/***********************************************************//**
-Returns true if row update contains disowned external fields.
-@return true if the update contains disowned external fields. */
-bool
-row_upd_changes_disowned_external(
-/*==============================*/
- const upd_t* update) /*!< in: update vector */
- MY_ATTRIBUTE((nonnull, warn_unused_result));
/***************************************************************//**
Builds an update vector from those fields which in a secondary index entry
diff --git a/storage/innobase/include/row0vers.h b/storage/innobase/include/row0vers.h
index d54384f837c..60f310e1b0f 100644
--- a/storage/innobase/include/row0vers.h
+++ b/storage/innobase/include/row0vers.h
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 1997, 2016, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2017, 2019, MariaDB Corporation.
+Copyright (c) 2017, 2022, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -45,7 +45,7 @@ index record.
@param[in] index secondary index
@param[in] offsets rec_get_offsets(rec, index)
@return the active transaction; state must be rechecked after
-trx_mutex_enter(), and trx->release_reference() must be invoked
+acquiring trx->mutex, and trx->release_reference() must be invoked
@retval NULL if the record was committed */
trx_t*
row_vers_impl_x_locked(
@@ -55,7 +55,7 @@ row_vers_impl_x_locked(
const rec_offs* offsets);
/** Finds out if a version of the record, where the version >= the current
-purge view, should have ientry as its secondary index entry. We check
+purge_sys.view, should have ientry as its secondary index entry. We check
if there is any not delete marked version of the record where the trx
id >= purge view, and the secondary index entry == ientry; exactly in
this case we return TRUE.
@@ -85,7 +85,9 @@ row_vers_old_has_index_entry(
Constructs the version of a clustered index record which a consistent
read should see. We assume that the trx id stored in rec is such that
the consistent read should not see rec in its present version.
-@return DB_SUCCESS or DB_MISSING_HISTORY */
+@return error code
+@retval DB_SUCCESS if a previous version was fetched
+@retval DB_MISSING_HISTORY if the history is missing (a sign of corruption) */
dberr_t
row_vers_build_for_consistent_read(
/*===============================*/
diff --git a/storage/innobase/include/rw_lock.h b/storage/innobase/include/rw_lock.h
index f3d005ff764..4881f2f1d35 100644
--- a/storage/innobase/include/rw_lock.h
+++ b/storage/innobase/include/rw_lock.h
@@ -49,6 +49,18 @@ protected:
lock.fetch_or(WRITER_WAITING, std::memory_order_relaxed);
#endif
}
+ /** Start waiting for an exclusive lock.
+ @return current value of the lock word */
+ uint32_t write_lock_wait_start_read()
+ { return lock.fetch_or(WRITER_WAITING, std::memory_order_relaxed); }
+ /** Wait for an exclusive lock.
+ @param l the value of the lock word
+ @return whether the exclusive lock was acquired */
+ bool write_lock_wait_try(uint32_t &l)
+ {
+ return lock.compare_exchange_strong(l, WRITER, std::memory_order_acquire,
+ std::memory_order_relaxed);
+ }
/** Try to acquire a shared lock.
@param l the value of the lock word
@return whether the lock was acquired */
@@ -64,36 +76,46 @@ protected:
}
return true;
}
+
/** Wait for an exclusive lock.
@return whether the exclusive lock was acquired */
bool write_lock_poll()
{
auto l= WRITER_WAITING;
- if (lock.compare_exchange_strong(l, WRITER, std::memory_order_acquire,
- std::memory_order_relaxed))
+ if (write_lock_wait_try(l))
return true;
if (!(l & WRITER_WAITING))
/* write_lock() must have succeeded for another thread */
write_lock_wait_start();
return false;
}
+ /** @return the lock word value */
+ uint32_t value() const { return lock.load(std::memory_order_acquire); }
public:
/** Default constructor */
rw_lock() : lock(UNLOCKED) {}
- /** Release a shared lock */
- void read_unlock()
+ /** Release a shared lock.
+ @return whether any writers may have to be woken up */
+ bool read_unlock()
{
- IF_DBUG_ASSERT(auto l=,) lock.fetch_sub(1, std::memory_order_release);
- DBUG_ASSERT(l & ~WRITER_PENDING); /* at least one read lock */
+ auto l= lock.fetch_sub(1, std::memory_order_release);
DBUG_ASSERT(!(l & WRITER)); /* no write lock must have existed */
+ DBUG_ASSERT(~(WRITER_PENDING) & l); /* at least one read lock */
+ return (~WRITER_PENDING & l) == 1;
}
/** Release an exclusive lock */
void write_unlock()
{
+ /* Below, we use fetch_sub(WRITER) instead of fetch_and(~WRITER).
+ The reason is that on IA-32 and AMD64 it translates into the 80486
+ instruction LOCK XADD, while fetch_and() translates into a loop
+    around LOCK CMPXCHG. For other ISAs, either form should be fine. */
+ static_assert(WRITER == 1U << 31, "compatibility");
IF_DBUG_ASSERT(auto l=,) lock.fetch_sub(WRITER, std::memory_order_release);
- DBUG_ASSERT(l & WRITER); /* the write lock must have existed */
+ /* the write lock must have existed */
+ DBUG_ASSERT(l & WRITER);
}
/** Try to acquire a shared lock.
@return whether the lock was acquired */
@@ -108,15 +130,9 @@ public:
}
/** @return whether an exclusive lock is being held by any thread */
- bool is_write_locked() const
- { return !!(lock.load(std::memory_order_relaxed) & WRITER); }
- /** @return whether a shared lock is being held by any thread */
- bool is_read_locked() const
- {
- auto l= lock.load(std::memory_order_relaxed);
- return (l & ~WRITER_PENDING) && !(l & WRITER);
- }
+ bool is_write_locked() const { return !!(value() & WRITER); }
+ /** @return whether any lock is being held or waited for by any thread */
+ bool is_locked_or_waiting() const { return value() != 0; }
/** @return whether any lock is being held by any thread */
- bool is_locked() const
- { return (lock.load(std::memory_order_relaxed) & ~WRITER_WAITING) != 0; }
+ bool is_locked() const { return (value() & ~WRITER_WAITING) != 0; }
};
diff --git a/storage/innobase/include/small_vector.h b/storage/innobase/include/small_vector.h
new file mode 100644
index 00000000000..d28a36184b8
--- /dev/null
+++ b/storage/innobase/include/small_vector.h
@@ -0,0 +1,100 @@
+/*****************************************************************************
+
+Copyright (c) 2023, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+#pragma once
+/* A normally small vector, inspired by llvm::SmallVector */
+#include "my_global.h"
+#include <iterator>
+#include <memory>
+
+class small_vector_base
+{
+protected:
+ typedef uint32_t Size_T;
+ void *BeginX;
+ Size_T Size= 0, Capacity;
+ small_vector_base()= delete;
+ small_vector_base(void *small, size_t small_size)
+ : BeginX(small), Capacity(Size_T(small_size)) {}
+ ATTRIBUTE_COLD void grow_by_1(void *small, size_t element_size);
+public:
+ size_t size() const { return Size; }
+ size_t capacity() const { return Capacity; }
+ bool empty() const { return !Size; }
+ void clear() { Size= 0; }
+protected:
+ void set_size(size_t N) { Size= Size_T(N); }
+};
+
+template <typename T, unsigned N>
+class small_vector : public small_vector_base
+{
+ /** The fixed storage allocation */
+ T small[N];
+
+ using small_vector_base::set_size;
+
+ void grow_if_needed()
+ {
+ if (unlikely(size() >= capacity()))
+ grow_by_1(small, sizeof *small);
+ }
+
+public:
+ small_vector() : small_vector_base(small, N)
+ {
+ TRASH_ALLOC(small, sizeof small);
+ }
+ ~small_vector()
+ {
+ if (small != begin())
+ my_free(begin());
+ MEM_MAKE_ADDRESSABLE(small, sizeof small);
+ }
+
+ using iterator= T *;
+ using const_iterator= const T *;
+ using reverse_iterator= std::reverse_iterator<iterator>;
+ using reference= T &;
+ using const_reference= const T&;
+
+ iterator begin() { return static_cast<iterator>(BeginX); }
+ const_iterator begin() const { return static_cast<const_iterator>(BeginX); }
+ iterator end() { return begin() + size(); }
+ const_iterator end() const { return begin() + size(); }
+
+ reverse_iterator rbegin() { return reverse_iterator(end()); }
+ reverse_iterator rend() { return reverse_iterator(begin()); }
+
+ reference operator[](size_t i) { assert(i < size()); return begin()[i]; }
+ const_reference operator[](size_t i) const
+ { return const_cast<small_vector&>(*this)[i]; }
+
+ void erase(const_iterator S, const_iterator E)
+ {
+ set_size(std::move(const_cast<iterator>(E), end(),
+ const_cast<iterator>(S)) - begin());
+ }
+
+ void emplace_back(T &&arg)
+ {
+ grow_if_needed();
+ ::new (end()) T(arg);
+ set_size(size() + 1);
+ }
+};
diff --git a/storage/innobase/include/srv0mon.h b/storage/innobase/include/srv0mon.h
index e65d31bfa04..971f6363bdb 100644
--- a/storage/innobase/include/srv0mon.h
+++ b/storage/innobase/include/srv0mon.h
@@ -36,7 +36,7 @@ Created 12/15/2009 Jimmy Yang
#define __STDC_LIMIT_MACROS
#endif /* __STDC_LIMIT_MACROS */
-#include <stdint.h>
+#include <cstdint>
#include "my_atomic.h"
#include "my_atomic_wrapper.h"
@@ -136,8 +136,6 @@ enum monitor_id_t {
/* Start of Metadata counter */
MONITOR_MODULE_METADATA,
MONITOR_TABLE_OPEN,
- MONITOR_TABLE_CLOSE,
- MONITOR_TABLE_REFERENCE,
/* Lock manager related counters */
MONITOR_MODULE_LOCK,
@@ -218,11 +216,7 @@ enum monitor_id_t {
MONITOR_LRU_BATCH_SCANNED_NUM_CALL,
MONITOR_LRU_BATCH_SCANNED_PER_CALL,
MONITOR_LRU_BATCH_FLUSH_TOTAL_PAGE,
- MONITOR_LRU_BATCH_FLUSH_COUNT,
- MONITOR_LRU_BATCH_FLUSH_PAGES,
MONITOR_LRU_BATCH_EVICT_TOTAL_PAGE,
- MONITOR_LRU_BATCH_EVICT_COUNT,
- MONITOR_LRU_BATCH_EVICT_PAGES,
MONITOR_LRU_SINGLE_FLUSH_FAILURE_COUNT,
MONITOR_LRU_GET_FREE_SEARCH,
MONITOR_LRU_SEARCH_SCANNED,
@@ -287,7 +281,6 @@ enum monitor_id_t {
MONITOR_TRX_COMMIT_UNDO,
MONITOR_TRX_ROLLBACK,
MONITOR_TRX_ROLLBACK_SAVEPOINT,
- MONITOR_TRX_ACTIVE,
MONITOR_RSEG_HISTORY_LEN,
MONITOR_NUM_UNDO_SLOT_USED,
MONITOR_NUM_UNDO_SLOT_CACHED,
@@ -350,9 +343,7 @@ enum monitor_id_t {
/* Adaptive Hash Index related counters */
MONITOR_MODULE_ADAPTIVE_HASH,
MONITOR_OVLD_ADAPTIVE_HASH_SEARCH,
-#endif /* BTR_CUR_HASH_ADAPT */
MONITOR_OVLD_ADAPTIVE_HASH_SEARCH_BTREE,
-#ifdef BTR_CUR_HASH_ADAPT
MONITOR_ADAPTIVE_HASH_PAGE_ADDED,
MONITOR_ADAPTIVE_HASH_PAGE_REMOVED,
MONITOR_ADAPTIVE_HASH_ROW_ADDED,
@@ -382,7 +373,6 @@ enum monitor_id_t {
MONITOR_OVLD_SERVER_ACTIVITY,
MONITOR_MASTER_ACTIVE_LOOPS,
MONITOR_MASTER_IDLE_LOOPS,
- MONITOR_SRV_BACKGROUND_DROP_TABLE_MICROSECOND,
MONITOR_SRV_LOG_FLUSH_MICROSECOND,
MONITOR_SRV_DICT_LRU_MICROSECOND,
MONITOR_SRV_DICT_LRU_EVICT_COUNT_ACTIVE,
@@ -390,15 +380,6 @@ enum monitor_id_t {
MONITOR_OVLD_SRV_DBLWR_WRITES,
MONITOR_OVLD_SRV_DBLWR_PAGES_WRITTEN,
MONITOR_OVLD_SRV_PAGE_SIZE,
- MONITOR_OVLD_RWLOCK_S_SPIN_WAITS,
- MONITOR_OVLD_RWLOCK_X_SPIN_WAITS,
- MONITOR_OVLD_RWLOCK_SX_SPIN_WAITS,
- MONITOR_OVLD_RWLOCK_S_SPIN_ROUNDS,
- MONITOR_OVLD_RWLOCK_X_SPIN_ROUNDS,
- MONITOR_OVLD_RWLOCK_SX_SPIN_ROUNDS,
- MONITOR_OVLD_RWLOCK_S_OS_WAITS,
- MONITOR_OVLD_RWLOCK_X_OS_WAITS,
- MONITOR_OVLD_RWLOCK_SX_OS_WAITS,
/* Data DML related counters */
MONITOR_MODULE_DML_STATS,
@@ -414,7 +395,6 @@ enum monitor_id_t {
/* Data DDL related counters */
MONITOR_MODULE_DDL_STATS,
MONITOR_BACKGROUND_DROP_INDEX,
- MONITOR_BACKGROUND_DROP_TABLE,
MONITOR_ONLINE_CREATE_INDEX,
MONITOR_PENDING_ALTER_TABLE,
MONITOR_ALTER_TABLE_SORT_FILES,
@@ -426,10 +406,6 @@ enum monitor_id_t {
MONITOR_ICP_OUT_OF_RANGE,
MONITOR_ICP_MATCH,
- /* Mutex/RW-Lock related counters */
- MONITOR_MODULE_LATCHES,
- MONITOR_LATCHES,
-
/* This is used only for control system to turn
on/off and reset all monitor counters */
MONITOR_ALL_COUNTER,
diff --git a/storage/innobase/include/srv0srv.h b/storage/innobase/include/srv0srv.h
index 75718a92a10..96cfe886c02 100644
--- a/storage/innobase/include/srv0srv.h
+++ b/storage/innobase/include/srv0srv.h
@@ -3,7 +3,7 @@
Copyright (c) 1995, 2017, Oracle and/or its affiliates. All rights reserved.
Copyright (c) 2008, 2009, Google Inc.
Copyright (c) 2009, Percona Inc.
-Copyright (c) 2013, 2022, MariaDB Corporation.
+Copyright (c) 2013, 2023, MariaDB Corporation.
Portions of this file contain modifications contributed and copyrighted by
Google, Inc. Those modifications are gratefully acknowledged and are described
@@ -45,16 +45,40 @@ Created 10/10/1995 Heikki Tuuri
#include "que0types.h"
#include "trx0types.h"
#include "fil0fil.h"
+#include "ut0counter.h"
#include "mysql/psi/mysql_stage.h"
#include "mysql/psi/psi.h"
#include <tpool.h>
#include <memory>
+/** Simple non-atomic counter
+@tparam Type the integer type of the counter */
+template <typename Type>
+struct alignas(CPU_LEVEL1_DCACHE_LINESIZE) simple_counter
+{
+ /** Increment the counter */
+ Type inc() { return add(1); }
+ /** Decrement the counter */
+ Type dec() { return add(Type(~0)); }
+
+ /** Add to the counter
+ @param i amount to be added
+ @return the value of the counter after adding */
+ Type add(Type i) { return m_counter += i; }
+
+ /** @return the value of the counter */
+ operator Type() const { return m_counter; }
+
+private:
+ /** The counter */
+ Type m_counter;
+};
+
/** Global counters used inside InnoDB. */
struct srv_stats_t
{
- typedef ib_counter_t<ulint, 64> ulint_ctr_64_t;
+ typedef ib_counter_t<ulint> ulint_ctr_n_t;
typedef simple_counter<lsn_t> lsn_ctr_1_t;
typedef simple_counter<ulint> ulint_ctr_1_t;
typedef simple_counter<int64_t> int64_ctr_1_t;
@@ -84,91 +108,74 @@ struct srv_stats_t
/** Store the number of write requests issued */
ulint_ctr_1_t buf_pool_write_requests;
- /** Number of buffer pool reads that led to the reading of
- a disk page */
- ulint_ctr_1_t buf_pool_reads;
-
/** Number of bytes saved by page compression */
- ulint_ctr_64_t page_compression_saved;
+ ulint_ctr_n_t page_compression_saved;
/* Number of pages compressed with page compression */
- ulint_ctr_64_t pages_page_compressed;
+ ulint_ctr_n_t pages_page_compressed;
/* Number of TRIM operations induced by page compression */
- ulint_ctr_64_t page_compressed_trim_op;
+ ulint_ctr_n_t page_compressed_trim_op;
/* Number of pages decompressed with page compression */
- ulint_ctr_64_t pages_page_decompressed;
+ ulint_ctr_n_t pages_page_decompressed;
/* Number of page compression errors */
- ulint_ctr_64_t pages_page_compression_error;
+ ulint_ctr_n_t pages_page_compression_error;
/* Number of pages encrypted */
- ulint_ctr_64_t pages_encrypted;
+ ulint_ctr_n_t pages_encrypted;
/* Number of pages decrypted */
- ulint_ctr_64_t pages_decrypted;
+ ulint_ctr_n_t pages_decrypted;
/* Number of merge blocks encrypted */
- ulint_ctr_64_t n_merge_blocks_encrypted;
+ ulint_ctr_n_t n_merge_blocks_encrypted;
/* Number of merge blocks decrypted */
- ulint_ctr_64_t n_merge_blocks_decrypted;
+ ulint_ctr_n_t n_merge_blocks_decrypted;
/* Number of row log blocks encrypted */
- ulint_ctr_64_t n_rowlog_blocks_encrypted;
+ ulint_ctr_n_t n_rowlog_blocks_encrypted;
/* Number of row log blocks decrypted */
- ulint_ctr_64_t n_rowlog_blocks_decrypted;
+ ulint_ctr_n_t n_rowlog_blocks_decrypted;
/** Number of data read in total (in bytes) */
ulint_ctr_1_t data_read;
- /** Wait time of database locks */
- int64_ctr_1_t n_lock_wait_time;
-
- /** Number of database lock waits */
- ulint_ctr_1_t n_lock_wait_count;
-
- /** Number of threads currently waiting on database locks */
- MY_ALIGNED(CACHE_LINE_SIZE) Atomic_counter<ulint>
- n_lock_wait_current_count;
-
/** Number of rows read. */
- ulint_ctr_64_t n_rows_read;
+ ulint_ctr_n_t n_rows_read;
/** Number of rows updated */
- ulint_ctr_64_t n_rows_updated;
+ ulint_ctr_n_t n_rows_updated;
/** Number of rows deleted */
- ulint_ctr_64_t n_rows_deleted;
+ ulint_ctr_n_t n_rows_deleted;
/** Number of rows inserted */
- ulint_ctr_64_t n_rows_inserted;
+ ulint_ctr_n_t n_rows_inserted;
/** Number of system rows read. */
- ulint_ctr_64_t n_system_rows_read;
+ ulint_ctr_n_t n_system_rows_read;
/** Number of system rows updated */
- ulint_ctr_64_t n_system_rows_updated;
+ ulint_ctr_n_t n_system_rows_updated;
/** Number of system rows deleted */
- ulint_ctr_64_t n_system_rows_deleted;
+ ulint_ctr_n_t n_system_rows_deleted;
/** Number of system rows inserted */
- ulint_ctr_64_t n_system_rows_inserted;
+ ulint_ctr_n_t n_system_rows_inserted;
/** Number of times secondary index lookup triggered cluster lookup */
- ulint_ctr_64_t n_sec_rec_cluster_reads;
+ ulint_ctr_n_t n_sec_rec_cluster_reads;
/** Number of times prefix optimization avoided triggering cluster lookup */
- ulint_ctr_64_t n_sec_rec_cluster_reads_avoided;
+ ulint_ctr_n_t n_sec_rec_cluster_reads_avoided;
/** Number of encryption_get_latest_key_version calls */
- ulint_ctr_64_t n_key_requests;
+ ulint_ctr_n_t n_key_requests;
/** Number of temporary tablespace blocks encrypted */
- ulint_ctr_64_t n_temp_blocks_encrypted;
+ ulint_ctr_n_t n_temp_blocks_encrypted;
/** Number of temporary tablespace blocks decrypted */
- ulint_ctr_64_t n_temp_blocks_decrypted;
-
- /** Number of lock deadlocks */
- ulint_ctr_1_t lock_deadlock_count;
+ ulint_ctr_n_t n_temp_blocks_decrypted;
};
/** We are prepared for a situation that we have this many threads waiting for
-a semaphore inside InnoDB. srv_start() sets the value. */
+a transactional lock inside InnoDB. srv_start() sets the value. */
extern ulint srv_max_n_threads;
extern const char* srv_main_thread_op_info;
@@ -193,15 +200,13 @@ at a time */
#define SRV_AUTO_EXTEND_INCREMENT (srv_sys_space.get_autoextend_increment())
/** Mutex protecting page_zip_stat_per_index */
-extern ib_mutex_t page_zip_stat_per_index_mutex;
-/* Mutex for locking srv_monitor_file. Not created if srv_read_only_mode */
-extern ib_mutex_t srv_monitor_file_mutex;
+extern mysql_mutex_t page_zip_stat_per_index_mutex;
+/** Mutex for locking srv_monitor_file */
+extern mysql_mutex_t srv_monitor_file_mutex;
/* Temporary file for innodb monitor output */
extern FILE* srv_monitor_file;
-/* Mutex for locking srv_misc_tmpfile. Only created if !srv_read_only_mode.
-This mutex has a very low rank; threads reserving it should not
-acquire any further latches or sleep before releasing this one. */
-extern ib_mutex_t srv_misc_tmpfile_mutex;
+/** Mutex for locking srv_misc_tmpfile */
+extern mysql_mutex_t srv_misc_tmpfile_mutex;
/* Temporary file for miscellanous diagnostic output */
extern FILE* srv_misc_tmpfile;
@@ -284,11 +289,6 @@ extern ulong srv_log_write_ahead_size;
extern my_bool srv_adaptive_flushing;
extern my_bool srv_flush_sync;
-/* If this flag is TRUE, then we will load the indexes' (and tables') metadata
-even if they are marked as "corrupted". Mostly it is for DBA to process
-corrupted index and table */
-extern my_bool srv_load_corrupted;
-
/** Requested size in bytes */
extern ulint srv_buf_pool_size;
/** Requested buffer pool chunk size. Each buffer pool instance consists
@@ -313,6 +313,8 @@ extern ulong srv_buf_pool_load_pages_abort;
/** Lock table size in bytes */
extern ulint srv_lock_table_size;
+/** the value of innodb_checksum_algorithm */
+extern ulong srv_checksum_algorithm;
extern my_bool srv_random_read_ahead;
extern ulong srv_read_ahead_threshold;
extern uint srv_n_read_io_threads;
@@ -397,12 +399,18 @@ enum srv_operation_mode {
/** Mariabackup restoring the incremental part of a backup */
SRV_OPERATION_RESTORE_DELTA,
/** Mariabackup restoring a backup for subsequent --export */
- SRV_OPERATION_RESTORE_EXPORT
+ SRV_OPERATION_RESTORE_EXPORT,
+	/** Mariabackup taking a backup without deferring
+	any tablespace */
+ SRV_OPERATION_BACKUP_NO_DEFER
};
/** Current mode of operation */
extern enum srv_operation_mode srv_operation;
+/** whether this is the server's first start after mariabackup --prepare */
+extern bool srv_start_after_restore;
+
extern my_bool srv_print_innodb_monitor;
extern my_bool srv_print_innodb_lock_monitor;
extern ibool srv_print_verbose_log;
@@ -425,7 +433,6 @@ extern ulint srv_log_writes_and_flush;
#ifdef UNIV_DEBUG
extern my_bool innodb_evict_tables_on_commit_debug;
-extern my_bool srv_sync_debug;
extern my_bool srv_purge_view_update_only_debug;
/** InnoDB system tablespace to set during recovery */
@@ -442,9 +449,6 @@ extern uint srv_n_purge_threads;
/* the number of pages to purge in one batch */
extern ulong srv_purge_batch_size;
-/* the number of sync wait arrays */
-extern ulong srv_sync_array_size;
-
/* print all user-level transactions deadlocks to mysqld stderr */
extern my_bool srv_print_all_deadlocks;
@@ -532,11 +536,9 @@ enum {
SRV_FORCE_NO_BACKGROUND = 2, /*!< prevent the main thread from
running: if a crash would occur
in purge, this prevents it */
- SRV_FORCE_NO_TRX_UNDO = 3, /*!< do not run trx rollback after
+ SRV_FORCE_NO_TRX_UNDO = 3, /*!< do not run DML rollback after
recovery */
- SRV_FORCE_NO_IBUF_MERGE = 4, /*!< prevent also ibuf operations:
- if they would cause a crash, better
- not do them */
+ SRV_FORCE_NO_DDL_UNDO = 4, /*!< prevent also DDL rollback */
SRV_FORCE_NO_UNDO_LOG_SCAN = 5, /*!< do not look at undo logs when
starting the database: InnoDB will
treat even incomplete transactions
@@ -580,8 +582,7 @@ ibool
srv_printf_innodb_monitor(
/*======================*/
FILE* file, /*!< in: output stream */
- ibool nowait, /*!< in: whether to wait for the
- lock_sys_t::mutex */
+ ibool nowait, /*!< in: whether to wait for lock_sys.latch */
ulint* trx_start, /*!< out: file position of the start of
the list of active transactions */
ulint* trx_end); /*!< out: file position of the end of
@@ -659,29 +660,21 @@ void srv_init_purge_tasks();
/** Status variables to be passed to MySQL */
struct export_var_t{
+#ifdef BTR_CUR_HASH_ADAPT
+ ulint innodb_ahi_hit;
+ ulint innodb_ahi_miss;
+#endif /* BTR_CUR_HASH_ADAPT */
char innodb_buffer_pool_dump_status[OS_FILE_MAX_PATH + 128];/*!< Buf pool dump status */
char innodb_buffer_pool_load_status[OS_FILE_MAX_PATH + 128];/*!< Buf pool load status */
char innodb_buffer_pool_resize_status[512];/*!< Buf pool resize status */
my_bool innodb_buffer_pool_load_incomplete;/*!< Buf pool load incomplete */
ulint innodb_buffer_pool_pages_total; /*!< Buffer pool size */
- ulint innodb_buffer_pool_pages_data; /*!< Data pages */
ulint innodb_buffer_pool_bytes_data; /*!< File bytes used */
- ulint innodb_buffer_pool_pages_dirty; /*!< Dirty data pages */
- ulint innodb_buffer_pool_bytes_dirty; /*!< File bytes modified */
ulint innodb_buffer_pool_pages_misc; /*!< Miscellanous pages */
- ulint innodb_buffer_pool_pages_free; /*!< Free pages */
#ifdef UNIV_DEBUG
ulint innodb_buffer_pool_pages_latched; /*!< Latched pages */
#endif /* UNIV_DEBUG */
- ulint innodb_buffer_pool_pages_made_not_young;
- ulint innodb_buffer_pool_pages_made_young;
- ulint innodb_buffer_pool_pages_old;
- ulint innodb_buffer_pool_read_requests; /*!< buf_pool.stat.n_page_gets */
- ulint innodb_buffer_pool_reads; /*!< srv_buf_pool_reads */
ulint innodb_buffer_pool_write_requests;/*!< srv_stats.buf_pool_write_requests */
- ulint innodb_buffer_pool_read_ahead_rnd;/*!< srv_read_ahead_rnd */
- ulint innodb_buffer_pool_read_ahead; /*!< srv_read_ahead */
- ulint innodb_buffer_pool_read_ahead_evicted;/*!< srv_read_ahead evicted*/
ulint innodb_checkpoint_age;
ulint innodb_checkpoint_max_age;
ulint innodb_data_pending_reads; /*!< Pending reads */
@@ -791,30 +784,6 @@ struct export_var_t{
int64_t innodb_encryption_key_requests;
};
-/** Thread slot in the thread table. */
-struct srv_slot_t{
- ibool in_use; /*!< TRUE if this slot
- is in use */
- /** time(NULL) when the thread was suspended.
- FIXME: Use my_interval_timer() or similar, to avoid bogus
- timeouts in lock_wait_check_and_cancel() or lock_wait_suspend_thread()
- when the system time is adjusted to the past!
-
- FIXME: This is duplicating trx_lock_t::wait_started,
- which is being used for diagnostic purposes only. */
- time_t suspend_time;
- ulong wait_timeout; /*!< wait time that if exceeded
- the thread will be timed out.
- Initialized by
- lock_wait_table_reserve_slot()
- for lock wait */
- os_event_t event; /*!< event used in suspending
- the thread when it has nothing
- to do */
- que_thr_t* thr; /*!< suspended query thread
- (only used for user threads) */
-};
-
extern tpool::thread_pool *srv_thread_pool;
extern std::unique_ptr<tpool::timer> srv_master_timer;
extern std::unique_ptr<tpool::timer> srv_monitor_timer;
diff --git a/storage/innobase/include/srv0start.h b/storage/innobase/include/srv0start.h
index 58488df4be6..44b19aa666b 100644
--- a/storage/innobase/include/srv0start.h
+++ b/storage/innobase/include/srv0start.h
@@ -93,8 +93,6 @@ extern lsn_t srv_shutdown_lsn;
/** TRUE if the server is being started */
extern bool srv_is_being_started;
-/** TRUE if SYS_TABLESPACES is available for lookups */
-extern bool srv_sys_tablespaces_open;
/** TRUE if the server is being started, before rolling back any
incomplete transactions */
extern bool srv_startup_is_before_trx_rollback_phase;
diff --git a/storage/innobase/include/srw_lock.h b/storage/innobase/include/srw_lock.h
new file mode 100644
index 00000000000..1dca0cc1054
--- /dev/null
+++ b/storage/innobase/include/srw_lock.h
@@ -0,0 +1,554 @@
+/*****************************************************************************
+
+Copyright (c) 2020, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+#pragma once
+#include "univ.i"
+#include "rw_lock.h"
+
+#if defined __linux__
+/* futex(2): FUTEX_WAIT_PRIVATE, FUTEX_WAKE_PRIVATE */
+#elif defined __OpenBSD__ || defined __FreeBSD__ || defined __DragonFly__
+/* system calls similar to Linux futex(2) */
+#elif defined _WIN32
+/* SRWLOCK as well as WaitOnAddress(), WakeByAddressSingle() */
+#else
+# define SUX_LOCK_GENERIC /* fall back to generic synchronization primitives */
+#endif
+
+#if !defined SUX_LOCK_GENERIC && 0 /* defined SAFE_MUTEX */
+# define SUX_LOCK_GENERIC /* Use dummy implementation for debugging purposes */
+#endif
+
+#ifdef SUX_LOCK_GENERIC
+/** An exclusive-only variant of srw_lock */
+template<bool spinloop>
+class pthread_mutex_wrapper final
+{
+ pthread_mutex_t lock;
+public:
+ void init()
+ {
+ if (spinloop)
+ pthread_mutex_init(&lock, MY_MUTEX_INIT_FAST);
+ else
+ pthread_mutex_init(&lock, nullptr);
+ }
+ void destroy() { pthread_mutex_destroy(&lock); }
+# ifdef PTHREAD_ADAPTIVE_MUTEX_INITIALIZER_NP
+ void wr_lock() { pthread_mutex_lock(&lock); }
+# else
+private:
+ void wr_wait();
+public:
+ inline void wr_lock();
+# endif
+ void wr_unlock() { pthread_mutex_unlock(&lock); }
+ bool wr_lock_try() { return !pthread_mutex_trylock(&lock); }
+};
+
+# ifndef PTHREAD_ADAPTIVE_MUTEX_INITIALIZER_NP
+template<> void pthread_mutex_wrapper<true>::wr_wait();
+template<>
+inline void pthread_mutex_wrapper<false>::wr_lock()
+{ pthread_mutex_lock(&lock); }
+template<>
+inline void pthread_mutex_wrapper<true>::wr_lock()
+{ if (!wr_lock_try()) wr_wait(); }
+# endif
+#endif
+
+/** Futex-based mutex */
+template<bool spinloop>
+class srw_mutex_impl final
+{
+ /** The lock word, containing HOLDER + 1 if the lock is being held,
+ plus the number of waiters */
+ std::atomic<uint32_t> lock;
+ /** Identifies that the lock is being held */
+ static constexpr uint32_t HOLDER= 1U << 31;
+
+#ifdef SUX_LOCK_GENERIC
+public:
+ /** The mutex for the condition variables. */
+ pthread_mutex_t mutex;
+private:
+ /** Condition variable for the lock word. Used with mutex. */
+ pthread_cond_t cond;
+#endif
+
+ /** Wait until the mutex has been acquired */
+ void wait_and_lock();
+ /** Wait for lock!=lk */
+ inline void wait(uint32_t lk);
+ /** Wake up one wait() thread */
+ void wake();
+public:
+ /** @return whether the mutex is being held or waited for */
+ bool is_locked_or_waiting() const
+ { return lock.load(std::memory_order_acquire) != 0; }
+ /** @return whether the mutex is being held by any thread */
+ bool is_locked() const
+ { return (lock.load(std::memory_order_acquire) & HOLDER) != 0; }
+
+ void init()
+ {
+ DBUG_ASSERT(!is_locked_or_waiting());
+#ifdef SUX_LOCK_GENERIC
+ pthread_mutex_init(&mutex, nullptr);
+ pthread_cond_init(&cond, nullptr);
+#endif
+ }
+ void destroy()
+ {
+ DBUG_ASSERT(!is_locked_or_waiting());
+#ifdef SUX_LOCK_GENERIC
+ pthread_mutex_destroy(&mutex);
+ pthread_cond_destroy(&cond);
+#endif
+ }
+
+ /** @return whether the mutex was acquired */
+ bool wr_lock_try()
+ {
+ uint32_t lk= 0;
+ return lock.compare_exchange_strong(lk, HOLDER + 1,
+ std::memory_order_acquire,
+ std::memory_order_relaxed);
+ }
+
+ void wr_lock() { if (!wr_lock_try()) wait_and_lock(); }
+ void wr_unlock()
+ {
+ const uint32_t lk= lock.fetch_sub(HOLDER + 1, std::memory_order_release);
+ if (lk != HOLDER + 1)
+ {
+ DBUG_ASSERT(lk & HOLDER);
+ wake();
+ }
+ }
+};
+
+#ifdef SUX_LOCK_GENERIC
+typedef pthread_mutex_wrapper<true> srw_spin_mutex;
+typedef pthread_mutex_wrapper<false> srw_mutex;
+#else
+typedef srw_mutex_impl<true> srw_spin_mutex;
+typedef srw_mutex_impl<false> srw_mutex;
+#endif
+
+template<bool spinloop> class srw_lock_impl;
+
+/** Slim shared-update-exclusive lock with no recursion */
+template<bool spinloop>
+class ssux_lock_impl final
+{
+#ifdef UNIV_PFS_RWLOCK
+ friend class ssux_lock;
+# ifdef SUX_LOCK_GENERIC
+# elif defined _WIN32
+# else
+ friend srw_lock_impl<spinloop>;
+# endif
+#endif
+ /** mutex for synchronization; held by U or X lock holders */
+ srw_mutex_impl<spinloop> writer;
+#ifdef SUX_LOCK_GENERIC
+ /** Condition variable for "readers"; used with writer.mutex. */
+ pthread_cond_t readers_cond;
+#endif
+ /** S or U holders, and WRITER flag for X holder or waiter */
+ std::atomic<uint32_t> readers;
+ /** indicates an X request; readers=WRITER indicates granted X lock */
+ static constexpr uint32_t WRITER= 1U << 31;
+
+ /** Wait for readers!=lk */
+ inline void wait(uint32_t lk);
+
+ /** Wait for readers!=lk|WRITER */
+ void wr_wait(uint32_t lk);
+ /** Wake up wait() on the last rd_unlock() */
+ void wake();
+ /** Acquire a read lock */
+ void rd_wait();
+public:
+ void init()
+ {
+ writer.init();
+ DBUG_ASSERT(is_vacant());
+#ifdef SUX_LOCK_GENERIC
+ pthread_cond_init(&readers_cond, nullptr);
+#endif
+ }
+ void destroy()
+ {
+ DBUG_ASSERT(is_vacant());
+ writer.destroy();
+#ifdef SUX_LOCK_GENERIC
+ pthread_cond_destroy(&readers_cond);
+#endif
+ }
+ /** @return whether any writer is waiting */
+ bool is_waiting() const
+ { return (readers.load(std::memory_order_relaxed) & WRITER) != 0; }
+#ifndef DBUG_OFF
+  /** @return whether the lock is not being held or waited for */
+ bool is_vacant() const { return !is_locked_or_waiting(); }
+#endif /* !DBUG_OFF */
+
+ bool rd_lock_try()
+ {
+ uint32_t lk= 0;
+ while (!readers.compare_exchange_weak(lk, lk + 1,
+ std::memory_order_acquire,
+ std::memory_order_relaxed))
+ if (lk & WRITER)
+ return false;
+ return true;
+ }
+
+ bool u_lock_try()
+ {
+ if (!writer.wr_lock_try())
+ return false;
+ IF_DBUG_ASSERT(uint32_t lk=,)
+ readers.fetch_add(1, std::memory_order_acquire);
+ DBUG_ASSERT(lk < WRITER - 1);
+ return true;
+ }
+
+ bool wr_lock_try()
+ {
+ if (!writer.wr_lock_try())
+ return false;
+ uint32_t lk= 0;
+ if (readers.compare_exchange_strong(lk, WRITER,
+ std::memory_order_acquire,
+ std::memory_order_relaxed))
+ return true;
+ writer.wr_unlock();
+ return false;
+ }
+
+ void rd_lock() { if (!rd_lock_try()) rd_wait(); }
+ void u_lock()
+ {
+ writer.wr_lock();
+ IF_DBUG_ASSERT(uint32_t lk=,)
+ readers.fetch_add(1, std::memory_order_acquire);
+ DBUG_ASSERT(lk < WRITER - 1);
+ }
+ void wr_lock()
+ {
+ writer.wr_lock();
+#if defined __i386__||defined __x86_64__||defined _M_IX86||defined _M_X64
+ /* On IA-32 and AMD64, this type of fetch_or() can only be implemented
+ as a loop around LOCK CMPXCHG. In this particular case, setting the
+ most significant bit using fetch_add() is equivalent, and is
+ translated into a simple LOCK XADD. */
+ static_assert(WRITER == 1U << 31, "compatibility");
+ if (uint32_t lk= readers.fetch_add(WRITER, std::memory_order_acquire))
+ wr_wait(lk);
+#else
+ if (uint32_t lk= readers.fetch_or(WRITER, std::memory_order_acquire))
+ wr_wait(lk);
+#endif
+ }
+
+ void u_wr_upgrade()
+ {
+ DBUG_ASSERT(writer.is_locked());
+ uint32_t lk= readers.fetch_add(WRITER - 1, std::memory_order_acquire);
+ if (lk != 1)
+ wr_wait(lk - 1);
+ }
+ void wr_u_downgrade()
+ {
+ DBUG_ASSERT(writer.is_locked());
+ DBUG_ASSERT(is_write_locked());
+ readers.store(1, std::memory_order_release);
+ /* Note: Any pending rd_lock() will not be woken up until u_unlock() */
+ }
+
+ void rd_unlock()
+ {
+ uint32_t lk= readers.fetch_sub(1, std::memory_order_release);
+ ut_ad(~WRITER & lk);
+ if (lk == WRITER + 1)
+ wake();
+ }
+ void u_unlock()
+ {
+ IF_DBUG_ASSERT(uint32_t lk=,)
+ readers.fetch_sub(1, std::memory_order_release);
+ DBUG_ASSERT(lk);
+ DBUG_ASSERT(lk < WRITER);
+ writer.wr_unlock();
+ }
+ void wr_unlock()
+ {
+ DBUG_ASSERT(is_write_locked());
+ readers.store(0, std::memory_order_release);
+ writer.wr_unlock();
+ }
+ /** @return whether an exclusive lock may be held by any thread */
+ bool is_write_locked() const noexcept
+ { return readers.load(std::memory_order_acquire) == WRITER; }
+ /** @return whether any lock may be held by any thread */
+ bool is_locked() const noexcept
+ { return readers.load(std::memory_order_acquire) != 0; }
+  /** @return whether any lock may be held or waited for by any thread */
+ bool is_locked_or_waiting() const noexcept
+ { return is_locked() || writer.is_locked_or_waiting(); }
+
+ void lock_shared() { rd_lock(); }
+ void unlock_shared() { rd_unlock(); }
+ void lock() { wr_lock(); }
+ void unlock() { wr_unlock(); }
+};
+
+#if defined _WIN32 || defined SUX_LOCK_GENERIC
+/** Slim read-write lock */
+template<bool spinloop>
+class srw_lock_
+{
+# ifdef UNIV_PFS_RWLOCK
+ friend srw_lock_impl<spinloop>;
+# endif
+# ifdef _WIN32
+ SRWLOCK lk;
+# else
+ rw_lock_t lk;
+# endif
+
+ void rd_wait();
+ void wr_wait();
+public:
+ void init() { IF_WIN(,my_rwlock_init(&lk, nullptr)); }
+ void destroy() { IF_WIN(,rwlock_destroy(&lk)); }
+ inline void rd_lock();
+ inline void wr_lock();
+ bool rd_lock_try()
+ { return IF_WIN(TryAcquireSRWLockShared(&lk), !rw_tryrdlock(&lk)); }
+ void rd_unlock()
+ { IF_WIN(ReleaseSRWLockShared(&lk), rw_unlock(&lk)); }
+ bool wr_lock_try()
+ { return IF_WIN(TryAcquireSRWLockExclusive(&lk), !rw_trywrlock(&lk)); }
+ void wr_unlock()
+ { IF_WIN(ReleaseSRWLockExclusive(&lk), rw_unlock(&lk)); }
+#ifdef _WIN32
+ /** @return whether any lock may be held by any thread */
+ bool is_locked_or_waiting() const noexcept { return (size_t&)(lk) != 0; }
+ /** @return whether any lock may be held by any thread */
+ bool is_locked() const noexcept { return is_locked_or_waiting(); }
+ /** @return whether an exclusive lock may be held by any thread */
+ bool is_write_locked() const noexcept
+ {
+ // FIXME: this returns false positives for shared locks
+ return is_locked();
+ }
+
+ void lock_shared() { rd_lock(); }
+ void unlock_shared() { rd_unlock(); }
+ void lock() { wr_lock(); }
+ void unlock() { wr_unlock(); }
+#endif
+};
+
+template<> void srw_lock_<true>::rd_wait();
+template<> void srw_lock_<true>::wr_wait();
+
+template<>
+inline void srw_lock_<false>::rd_lock()
+{ IF_WIN(AcquireSRWLockShared(&lk), rw_rdlock(&lk)); }
+template<>
+inline void srw_lock_<false>::wr_lock()
+{ IF_WIN(AcquireSRWLockExclusive(&lk), rw_wrlock(&lk)); }
+
+template<>
+inline void srw_lock_<true>::rd_lock() { if (!rd_lock_try()) rd_wait(); }
+template<>
+inline void srw_lock_<true>::wr_lock() { if (!wr_lock_try()) wr_wait(); }
+
+typedef srw_lock_<false> srw_lock_low;
+typedef srw_lock_<true> srw_spin_lock_low;
+#else
+typedef ssux_lock_impl<false> srw_lock_low;
+typedef ssux_lock_impl<true> srw_spin_lock_low;
+#endif
+
+#ifndef UNIV_PFS_RWLOCK
+# define SRW_LOCK_INIT(key) init()
+# define SRW_LOCK_ARGS(file, line) /* nothing */
+# define SRW_LOCK_CALL /* nothing */
+typedef srw_lock_low srw_lock;
+typedef srw_spin_lock_low srw_spin_lock;
+#else
+# define SRW_LOCK_INIT(key) init(key)
+# define SRW_LOCK_ARGS(file, line) file, line
+# define SRW_LOCK_CALL __FILE__, __LINE__
+
+/** Slim shared-update-exclusive lock with PERFORMANCE_SCHEMA instrumentation */
+class ssux_lock
+{
+ PSI_rwlock *pfs_psi;
+ ssux_lock_impl<false> lock;
+
+ ATTRIBUTE_NOINLINE void psi_rd_lock(const char *file, unsigned line);
+ ATTRIBUTE_NOINLINE void psi_wr_lock(const char *file, unsigned line);
+ ATTRIBUTE_NOINLINE void psi_u_lock(const char *file, unsigned line);
+ ATTRIBUTE_NOINLINE void psi_u_wr_upgrade(const char *file, unsigned line);
+public:
+ void init(mysql_pfs_key_t key)
+ {
+ pfs_psi= PSI_RWLOCK_CALL(init_rwlock)(key, this);
+ lock.init();
+ }
+ void destroy()
+ {
+ if (psi_likely(pfs_psi != nullptr))
+ {
+ PSI_RWLOCK_CALL(destroy_rwlock)(pfs_psi);
+ pfs_psi= nullptr;
+ }
+ lock.destroy();
+ }
+ void rd_lock(const char *file, unsigned line)
+ {
+ if (psi_likely(pfs_psi != nullptr))
+ psi_rd_lock(file, line);
+ else
+ lock.rd_lock();
+ }
+ void rd_unlock()
+ {
+ if (psi_likely(pfs_psi != nullptr))
+ PSI_RWLOCK_CALL(unlock_rwlock)(pfs_psi);
+ lock.rd_unlock();
+ }
+ void u_lock(const char *file, unsigned line)
+ {
+ if (psi_likely(pfs_psi != nullptr))
+ psi_u_lock(file, line);
+ else
+ lock.u_lock();
+ }
+ void u_unlock()
+ {
+ if (psi_likely(pfs_psi != nullptr))
+ PSI_RWLOCK_CALL(unlock_rwlock)(pfs_psi);
+ lock.u_unlock();
+ }
+ void wr_lock(const char *file, unsigned line)
+ {
+ if (psi_likely(pfs_psi != nullptr))
+ psi_wr_lock(file, line);
+ else
+ lock.wr_lock();
+ }
+ void wr_unlock()
+ {
+ if (psi_likely(pfs_psi != nullptr))
+ PSI_RWLOCK_CALL(unlock_rwlock)(pfs_psi);
+ lock.wr_unlock();
+ }
+ void u_wr_upgrade(const char *file, unsigned line)
+ {
+ if (psi_likely(pfs_psi != nullptr))
+ psi_u_wr_upgrade(file, line);
+ else
+ lock.u_wr_upgrade();
+ }
+ bool rd_lock_try() { return lock.rd_lock_try(); }
+ bool u_lock_try() { return lock.u_lock_try(); }
+ bool wr_lock_try() { return lock.wr_lock_try(); }
+ bool is_waiting() const { return lock.is_waiting(); }
+};
+
+/** Slim reader-writer lock with PERFORMANCE_SCHEMA instrumentation */
+template<bool spinloop>
+class srw_lock_impl
+{
+ PSI_rwlock *pfs_psi;
+# if defined _WIN32 || defined SUX_LOCK_GENERIC
+ srw_lock_<spinloop> lock;
+# else
+ ssux_lock_impl<spinloop> lock;
+# endif
+
+ ATTRIBUTE_NOINLINE void psi_rd_lock(const char *file, unsigned line);
+ ATTRIBUTE_NOINLINE void psi_wr_lock(const char *file, unsigned line);
+public:
+ void init(mysql_pfs_key_t key)
+ {
+ pfs_psi= PSI_RWLOCK_CALL(init_rwlock)(key, this);
+ lock.init();
+ }
+ void destroy()
+ {
+ if (psi_likely(pfs_psi != nullptr))
+ {
+ PSI_RWLOCK_CALL(destroy_rwlock)(pfs_psi);
+ pfs_psi= nullptr;
+ }
+ lock.destroy();
+ }
+ void rd_lock(const char *file, unsigned line)
+ {
+ if (psi_likely(pfs_psi != nullptr))
+ psi_rd_lock(file, line);
+ else
+ lock.rd_lock();
+ }
+ void rd_unlock()
+ {
+ if (psi_likely(pfs_psi != nullptr))
+ PSI_RWLOCK_CALL(unlock_rwlock)(pfs_psi);
+ lock.rd_unlock();
+ }
+ void wr_lock(const char *file, unsigned line)
+ {
+ if (psi_likely(pfs_psi != nullptr))
+ psi_wr_lock(file, line);
+ else
+ lock.wr_lock();
+ }
+ void wr_unlock()
+ {
+ if (psi_likely(pfs_psi != nullptr))
+ PSI_RWLOCK_CALL(unlock_rwlock)(pfs_psi);
+ lock.wr_unlock();
+ }
+ bool rd_lock_try() { return lock.rd_lock_try(); }
+ bool wr_lock_try() { return lock.wr_lock_try(); }
+ void lock_shared() { return rd_lock(SRW_LOCK_CALL); }
+ void unlock_shared() { return rd_unlock(); }
+#ifndef SUX_LOCK_GENERIC
+  /** @return whether any lock may be held or waited for by any thread */
+ bool is_locked_or_waiting() const noexcept
+ { return lock.is_locked_or_waiting(); }
+  /** @return whether any lock may be held by any thread */
+ bool is_locked() const noexcept { return lock.is_locked(); }
+ /** @return whether an exclusive lock may be held by any thread */
+ bool is_write_locked() const noexcept { return lock.is_write_locked(); }
+#endif
+};
+
+typedef srw_lock_impl<false> srw_lock;
+typedef srw_lock_impl<true> srw_spin_lock;
+
+#endif
diff --git a/storage/innobase/include/sux_lock.h b/storage/innobase/include/sux_lock.h
new file mode 100644
index 00000000000..2c0167ac651
--- /dev/null
+++ b/storage/innobase/include/sux_lock.h
@@ -0,0 +1,472 @@
+/*****************************************************************************
+
+Copyright (c) 2020, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+#pragma once
+#include "srw_lock.h"
+#include "my_atomic_wrapper.h"
+#ifdef UNIV_DEBUG
+# include <unordered_set>
+#endif
+
+/** A "fat" rw-lock that supports
+S (shared), U (update, or shared-exclusive), and X (exclusive) modes
+as well as recursive U and X latch acquisition
+@tparam ssux ssux_lock_impl or ssux_lock */
+template<typename ssux>
+class sux_lock final
+{
+ /** The underlying non-recursive lock */
+ ssux lock;
+ /** Numbers of U and X locks. Protected by lock. */
+ uint32_t recursive;
+ /** The owner of the U or X lock (0 if none); protected by lock */
+ std::atomic<pthread_t> writer;
+ /** Special writer!=0 value to indicate that the lock is non-recursive
+ and will be released by an I/O thread */
+#if defined __linux__ || defined _WIN32
+ static constexpr pthread_t FOR_IO= pthread_t(~0UL);
+#else
+# define FOR_IO ((pthread_t) ~0UL) /* it could be a pointer */
+#endif
+#ifdef UNIV_DEBUG
+ /** Protects readers */
+ mutable srw_mutex readers_lock;
+ /** Threads that hold the lock in shared mode */
+ std::atomic<std::unordered_multiset<pthread_t>*> readers;
+#endif
+
+ /** The multiplier in recursive for X locks */
+ static constexpr uint32_t RECURSIVE_X= 1U;
+ /** The multiplier in recursive for U locks */
+ static constexpr uint32_t RECURSIVE_U= 1U << 16;
+ /** The maximum allowed level of recursion */
+ static constexpr uint32_t RECURSIVE_MAX= RECURSIVE_U - 1;
+
+public:
+#ifdef UNIV_PFS_RWLOCK
+ inline void init();
+#endif
+ void SRW_LOCK_INIT(mysql_pfs_key_t key)
+ {
+ lock.SRW_LOCK_INIT(key);
+ ut_ad(!writer.load(std::memory_order_relaxed));
+ ut_ad(!recursive);
+ ut_d(readers_lock.init());
+#ifdef UNIV_DEBUG
+ if (auto r= readers.load(std::memory_order_relaxed))
+ ut_ad(r->empty());
+#endif
+ }
+
+ /** Free the rw-lock after init() */
+ void free()
+ {
+ ut_ad(!writer.load(std::memory_order_relaxed));
+ ut_ad(!recursive);
+#ifdef UNIV_DEBUG
+ readers_lock.destroy();
+ if (auto r= readers.load(std::memory_order_relaxed))
+ {
+ ut_ad(r->empty());
+ delete r;
+ readers.store(nullptr, std::memory_order_relaxed);
+ }
+#endif
+ lock.destroy();
+ }
+
+ /** needed for dict_index_t::clone() */
+ inline void operator=(const sux_lock&);
+
+#ifdef UNIV_DEBUG
+ /** @return whether no recursive locks are being held */
+ bool not_recursive() const
+ {
+ ut_ad(recursive);
+ return recursive == RECURSIVE_X || recursive == RECURSIVE_U;
+ }
+
+ /** @return the number of X locks being held (by any thread) */
+ unsigned x_lock_count() const { return recursive & RECURSIVE_MAX; }
+#endif
+
+ /** Acquire a recursive lock */
+ template<bool allow_readers> void writer_recurse()
+ {
+ ut_ad(writer == pthread_self());
+ ut_d(auto rec= (recursive / (allow_readers ? RECURSIVE_U : RECURSIVE_X)) &
+ RECURSIVE_MAX);
+ ut_ad(allow_readers ? recursive : rec);
+ ut_ad(rec < RECURSIVE_MAX);
+ recursive+= allow_readers ? RECURSIVE_U : RECURSIVE_X;
+ }
+
+private:
+ /** Transfer the ownership of a write lock to another thread
+ @param id the new owner of the U or X lock */
+ void set_new_owner(pthread_t id)
+ {
+ IF_DBUG(DBUG_ASSERT(writer.exchange(id, std::memory_order_relaxed)),
+ writer.store(id, std::memory_order_relaxed));
+ }
+ /** Assign the ownership of a write lock to a thread
+ @param id the owner of the U or X lock */
+ void set_first_owner(pthread_t id)
+ {
+ IF_DBUG(DBUG_ASSERT(!writer.exchange(id, std::memory_order_relaxed)),
+ writer.store(id, std::memory_order_relaxed));
+ }
+#ifdef UNIV_DEBUG
+ /** Register the current thread as a holder of a shared lock */
+ void s_lock_register()
+ {
+ const pthread_t id= pthread_self();
+ readers_lock.wr_lock();
+ auto r= readers.load(std::memory_order_relaxed);
+ if (!r)
+ {
+ r= new std::unordered_multiset<pthread_t>();
+ readers.store(r, std::memory_order_relaxed);
+ }
+ r->emplace(id);
+ readers_lock.wr_unlock();
+ }
+#endif
+
+public:
+ /** In crash recovery or the change buffer, claim the ownership
+ of the exclusive block lock to the current thread */
+ void claim_ownership() { set_new_owner(pthread_self()); }
+
+ /** @return whether the current thread is holding X or U latch */
+ bool have_u_or_x() const
+ {
+ if (pthread_self() != writer.load(std::memory_order_relaxed))
+ return false;
+ ut_ad(recursive);
+ return true;
+ }
+ /** @return whether the current thread is holding U but not X latch */
+ bool have_u_not_x() const
+ { return have_u_or_x() && !((recursive / RECURSIVE_X) & RECURSIVE_MAX); }
+ /** @return whether the current thread is holding X latch */
+ bool have_x() const
+ { return have_u_or_x() && ((recursive / RECURSIVE_X) & RECURSIVE_MAX); }
+#ifdef UNIV_DEBUG
+ /** @return whether the current thread is holding S latch */
+ bool have_s() const
+ {
+ if (auto r= readers.load(std::memory_order_relaxed))
+ {
+ readers_lock.wr_lock();
+ bool found= r->find(pthread_self()) != r->end();
+ readers_lock.wr_unlock();
+ return found;
+ }
+ return false;
+ }
+ /** @return whether the current thread is holding the latch */
+ bool have_any() const { return have_u_or_x() || have_s(); }
+#endif
+
+ /** Acquire a shared lock */
+ inline void s_lock();
+ inline void s_lock(const char *file, unsigned line);
+ /** Acquire an update lock */
+ inline void u_lock();
+ inline void u_lock(const char *file, unsigned line);
+ /** Acquire an exclusive lock */
+ inline void x_lock(bool for_io= false);
+ inline void x_lock(const char *file, unsigned line);
+ /** Acquire a recursive exclusive lock */
+ void x_lock_recursive() { writer_recurse<false>(); }
+ /** Upgrade an update lock */
+ inline void u_x_upgrade();
+ inline void u_x_upgrade(const char *file, unsigned line);
+ /** Downgrade a single exclusive lock to an update lock */
+ void x_u_downgrade()
+ {
+ ut_ad(have_u_or_x());
+ ut_ad(recursive <= RECURSIVE_MAX);
+ recursive*= RECURSIVE_U;
+ lock.wr_u_downgrade();
+ }
+
+ /** Acquire an exclusive lock or upgrade an update lock
+ @return whether U locks were upgraded to X */
+ inline bool x_lock_upgraded();
+
+ /** @return whether a shared lock was acquired */
+ bool s_lock_try()
+ {
+ bool acquired= lock.rd_lock_try();
+ ut_d(if (acquired) s_lock_register());
+ return acquired;
+ }
+
+ /** Try to acquire an update lock
+ @param for_io whether the lock will be released by another thread
+ @return whether the update lock was acquired */
+ inline bool u_lock_try(bool for_io);
+
+ /** Try to acquire an exclusive lock
+ @return whether an exclusive lock was acquired */
+ inline bool x_lock_try();
+
+ /** Release a shared lock */
+ void s_unlock()
+ {
+#ifdef UNIV_DEBUG
+ const pthread_t id= pthread_self();
+ auto r= readers.load(std::memory_order_relaxed);
+ ut_ad(r);
+ readers_lock.wr_lock();
+ auto i= r->find(id);
+ ut_ad(i != r->end());
+ r->erase(i);
+ readers_lock.wr_unlock();
+#endif
+ lock.rd_unlock();
+ }
+ /** Release an update or exclusive lock
+ @param allow_readers whether we are releasing a U lock
+ @param claim_ownership whether the lock was acquired by another thread */
+ void u_or_x_unlock(bool allow_readers, bool claim_ownership= false)
+ {
+ ut_d(auto owner= writer.load(std::memory_order_relaxed));
+ ut_ad(owner == pthread_self() ||
+ (owner == FOR_IO && claim_ownership &&
+ recursive == (allow_readers ? RECURSIVE_U : RECURSIVE_X)));
+ ut_d(auto rec= (recursive / (allow_readers ? RECURSIVE_U : RECURSIVE_X)) &
+ RECURSIVE_MAX);
+ ut_ad(rec);
+ if (!(recursive-= allow_readers ? RECURSIVE_U : RECURSIVE_X))
+ {
+ set_new_owner(0);
+ if (allow_readers)
+ lock.u_unlock();
+ else
+ lock.wr_unlock();
+ }
+ }
+ /** Release an update lock */
+ void u_unlock(bool claim_ownership= false)
+ { u_or_x_unlock(true, claim_ownership); }
+ /** Release an exclusive lock */
+ void x_unlock(bool claim_ownership= false)
+ { u_or_x_unlock(false, claim_ownership); }
+
+ /** @return whether any writer is waiting */
+ bool is_waiting() const { return lock.is_waiting(); }
+
+ bool is_write_locked() const { return lock.is_write_locked(); }
+
+ bool is_locked_or_waiting() const { return lock.is_locked_or_waiting(); }
+
+ inline void lock_shared();
+ inline void unlock_shared();
+};
+
+typedef sux_lock<ssux_lock_impl<true>> block_lock;
+
+#ifndef UNIV_PFS_RWLOCK
+typedef sux_lock<ssux_lock_impl<false>> index_lock;
+#else
+typedef sux_lock<ssux_lock> index_lock;
+
+template<> inline void sux_lock<ssux_lock_impl<true>>::init()
+{
+ lock.init();
+ ut_ad(!writer.load(std::memory_order_relaxed));
+ ut_ad(!recursive);
+ ut_d(readers_lock.init());
+#ifdef UNIV_DEBUG
+ if (auto r= readers.load(std::memory_order_relaxed))
+ ut_ad(r->empty());
+#endif
+}
+
+template<>
+inline void sux_lock<ssux_lock>::s_lock(const char *file, unsigned line)
+{
+ ut_ad(!have_x());
+ ut_ad(!have_s());
+ lock.rd_lock(file, line);
+ ut_d(s_lock_register());
+}
+
+template<>
+inline void sux_lock<ssux_lock>::u_lock(const char *file, unsigned line)
+{
+ pthread_t id= pthread_self();
+ if (writer.load(std::memory_order_relaxed) == id)
+ writer_recurse<true>();
+ else
+ {
+ lock.u_lock(file, line);
+ ut_ad(!recursive);
+ recursive= RECURSIVE_U;
+ set_first_owner(id);
+ }
+}
+
+template<>
+inline void sux_lock<ssux_lock>::x_lock(const char *file, unsigned line)
+{
+ pthread_t id= pthread_self();
+ if (writer.load(std::memory_order_relaxed) == id)
+ writer_recurse<false>();
+ else
+ {
+ lock.wr_lock(file, line);
+ ut_ad(!recursive);
+ recursive= RECURSIVE_X;
+ set_first_owner(id);
+ }
+}
+
+template<>
+inline void sux_lock<ssux_lock>::u_x_upgrade(const char *file, unsigned line)
+{
+ ut_ad(have_u_not_x());
+ lock.u_wr_upgrade(file, line);
+ recursive/= RECURSIVE_U;
+}
+#endif
+
+/** needed for dict_index_t::clone() */
+template<> inline void index_lock::operator=(const sux_lock&)
+{
+ memset((void*) this, 0, sizeof *this);
+}
+
+template<typename ssux> inline void sux_lock<ssux>::s_lock()
+{
+ ut_ad(!have_x());
+ ut_ad(!have_s());
+ lock.rd_lock();
+ ut_d(s_lock_register());
+}
+
+template<typename ssux>
+inline void sux_lock<ssux>::lock_shared() { s_lock(); }
+template<typename ssux>
+inline void sux_lock<ssux>::unlock_shared() { s_unlock(); }
+
+template<typename ssux> inline void sux_lock<ssux>::u_lock()
+{
+ pthread_t id= pthread_self();
+ if (writer.load(std::memory_order_relaxed) == id)
+ writer_recurse<true>();
+ else
+ {
+ lock.u_lock();
+ ut_ad(!recursive);
+ recursive= RECURSIVE_U;
+ set_first_owner(id);
+ }
+}
+
+template<typename ssux> inline void sux_lock<ssux>::x_lock(bool for_io)
+{
+ pthread_t id= pthread_self();
+ if (writer.load(std::memory_order_relaxed) == id)
+ {
+ ut_ad(!for_io);
+ writer_recurse<false>();
+ }
+ else
+ {
+ lock.wr_lock();
+ ut_ad(!recursive);
+ recursive= RECURSIVE_X;
+ set_first_owner(for_io ? FOR_IO : id);
+ }
+}
+
+template<typename ssux> inline void sux_lock<ssux>::u_x_upgrade()
+{
+ ut_ad(have_u_not_x());
+ lock.u_wr_upgrade();
+ recursive/= RECURSIVE_U;
+}
+
+template<typename ssux> inline bool sux_lock<ssux>::x_lock_upgraded()
+{
+ pthread_t id= pthread_self();
+ if (writer.load(std::memory_order_relaxed) == id)
+ {
+ ut_ad(recursive);
+ static_assert(RECURSIVE_X == 1, "compatibility");
+ if (recursive & RECURSIVE_MAX)
+ {
+ writer_recurse<false>();
+ return false;
+ }
+ /* Upgrade the lock. */
+ lock.u_wr_upgrade();
+ recursive/= RECURSIVE_U;
+ return true;
+ }
+ else
+ {
+ lock.wr_lock();
+ ut_ad(!recursive);
+ recursive= RECURSIVE_X;
+ set_first_owner(id);
+ return false;
+ }
+}
+
+template<typename ssux> inline bool sux_lock<ssux>::u_lock_try(bool for_io)
+{
+ pthread_t id= pthread_self();
+ if (writer.load(std::memory_order_relaxed) == id)
+ {
+ if (for_io)
+ return false;
+ writer_recurse<true>();
+ return true;
+ }
+ if (lock.u_lock_try())
+ {
+ ut_ad(!recursive);
+ recursive= RECURSIVE_U;
+ set_first_owner(for_io ? FOR_IO : id);
+ return true;
+ }
+ return false;
+}
+
+template<typename ssux> inline bool sux_lock<ssux>::x_lock_try()
+{
+ pthread_t id= pthread_self();
+ if (writer.load(std::memory_order_relaxed) == id)
+ {
+ writer_recurse<false>();
+ return true;
+ }
+ if (lock.wr_lock_try())
+ {
+ ut_ad(!recursive);
+ recursive= RECURSIVE_X;
+ set_first_owner(id);
+ return true;
+ }
+ return false;
+}
diff --git a/storage/innobase/include/transactional_lock_guard.h b/storage/innobase/include/transactional_lock_guard.h
new file mode 100644
index 00000000000..168a68977a7
--- /dev/null
+++ b/storage/innobase/include/transactional_lock_guard.h
@@ -0,0 +1,174 @@
+/*****************************************************************************
+
+Copyright (c) 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+#pragma once
+
+#if defined __powerpc64__
+#elif defined __s390__
+#elif defined _MSC_VER && (defined _M_IX86 || defined _M_X64) && !defined(__clang__)
+#elif defined __GNUC__ && (defined __i386__ || defined __x86_64__)
+# if __GNUC__ >= 8
+# elif defined __clang_major__ && __clang_major__ > 6
+# else
+# define NO_ELISION
+# endif
+#else /* Transactional memory has not been implemented for this ISA */
+# define NO_ELISION
+#endif
+
+#ifdef NO_ELISION
+constexpr bool have_transactional_memory= false;
+# ifdef UNIV_DEBUG
+static inline bool xtest() { return false; }
+# endif
+# define TRANSACTIONAL_TARGET /* nothing */
+# define TRANSACTIONAL_INLINE /* nothing */
+#else
+# if defined __i386__||defined __x86_64__||defined _M_IX86||defined _M_X64
+extern bool have_transactional_memory;
+bool transactional_lock_enabled();
+
+# include <immintrin.h>
+# if defined __GNUC__ && !defined __INTEL_COMPILER
+# define TRANSACTIONAL_TARGET __attribute__((target("rtm"),hot))
+# define TRANSACTIONAL_INLINE __attribute__((target("rtm"),hot,always_inline))
+# else
+# define TRANSACTIONAL_TARGET /* nothing */
+# define TRANSACTIONAL_INLINE /* nothing */
+# endif
+
+TRANSACTIONAL_INLINE static inline bool xbegin()
+{
+ return have_transactional_memory && _xbegin() == _XBEGIN_STARTED;
+}
+
+# ifdef UNIV_DEBUG
+# ifdef __GNUC__
+/** @return whether a memory transaction is active */
+bool xtest();
+# else
+static inline bool xtest() { return have_transactional_memory && _xtest(); }
+# endif
+# endif
+
+TRANSACTIONAL_INLINE static inline void xabort() { _xabort(0); }
+
+TRANSACTIONAL_INLINE static inline void xend() { _xend(); }
+# elif defined __powerpc64__ || defined __s390__
+extern bool have_transactional_memory;
+bool transactional_lock_enabled();
+# define TRANSACTIONAL_TARGET __attribute__((hot))
+# define TRANSACTIONAL_INLINE __attribute__((hot,always_inline))
+
+/**
+ Newer gcc compilers only provide __builtin_{htm}
+ functions when the -mhtm CFLAG is actually provided. So
+ we've got the option of including it globally, or
+ pushing down the inclusion of htmxlintrin.h to one
+ file with -mhtm enabled and removing the inline
+ optimization.
+
+ Per FIXME in s390x's htmxlintrin.h, the __TM_simple_begin
+ isn't always_inline resulting in duplicate definitions if
+ it where included more than once. While xabort and xend
+ could be implemented here, we keep the implementation the
+ same as ppc64.
+ */
+TRANSACTIONAL_TARGET bool xbegin();
+TRANSACTIONAL_TARGET void xabort();
+TRANSACTIONAL_TARGET void xend();
+# ifdef UNIV_DEBUG
+bool xtest();
+# endif
+
+# endif
+#endif
+
+template<class mutex>
+class transactional_lock_guard
+{
+ mutex &m;
+
+public:
+ TRANSACTIONAL_INLINE transactional_lock_guard(mutex &m) : m(m)
+ {
+#ifndef NO_ELISION
+ if (xbegin())
+ {
+ if (was_elided())
+ return;
+ xabort();
+ }
+#endif
+ m.lock();
+ }
+ transactional_lock_guard(const transactional_lock_guard &)= delete;
+ TRANSACTIONAL_INLINE ~transactional_lock_guard()
+ {
+#ifndef NO_ELISION
+ if (was_elided()) xend(); else
+#endif
+ m.unlock();
+ }
+
+#ifndef NO_ELISION
+ bool was_elided() const noexcept { return !m.is_locked_or_waiting(); }
+#else
+ bool was_elided() const noexcept { return false; }
+#endif
+};
+
+template<class mutex>
+class transactional_shared_lock_guard
+{
+ mutex &m;
+#ifndef NO_ELISION
+ bool elided;
+#else
+ static constexpr bool elided= false;
+#endif
+
+public:
+ TRANSACTIONAL_INLINE transactional_shared_lock_guard(mutex &m) : m(m)
+ {
+#ifndef NO_ELISION
+ if (xbegin())
+ {
+ if (!m.is_write_locked())
+ {
+ elided= true;
+ return;
+ }
+ xabort();
+ }
+ elided= false;
+#endif
+ m.lock_shared();
+ }
+ transactional_shared_lock_guard(const transactional_shared_lock_guard &)=
+ delete;
+ TRANSACTIONAL_INLINE ~transactional_shared_lock_guard()
+ {
+#ifndef NO_ELISION
+ if (was_elided()) xend(); else
+#endif
+ m.unlock_shared();
+ }
+
+ bool was_elided() const noexcept { return elided; }
+};
diff --git a/storage/innobase/include/trx0i_s.h b/storage/innobase/include/trx0i_s.h
index 40160ce4362..caacfa0972a 100644
--- a/storage/innobase/include/trx0i_s.h
+++ b/storage/innobase/include/trx0i_s.h
@@ -114,8 +114,7 @@ struct i_s_locks_row_t {
/** This structure represents INFORMATION_SCHEMA.innodb_trx row */
struct i_s_trx_row_t {
trx_id_t trx_id; /*!< transaction identifier */
- const char* trx_state; /*!< transaction state from
- trx_get_que_state_str() */
+ const char* trx_state;
time_t trx_started; /*!< trx_t::start_time */
const i_s_locks_row_t* requested_lock_row;
/*!< pointer to a row
@@ -138,7 +137,7 @@ struct i_s_trx_row_t {
ulint trx_lock_memory_bytes;
/*!< mem_heap_get_size(
trx->lock_heap) */
- ulint trx_rows_locked;/*!< lock_number_of_rows_locked() */
+ ulint trx_rows_locked;/*!< trx_lock_t::n_rec_locks */
uintmax_t trx_rows_modified;/*!< trx_t::undo_no */
uint trx_isolation_level;
/*!< trx_t::isolation_level */
diff --git a/storage/innobase/include/trx0purge.h b/storage/innobase/include/trx0purge.h
index 14cf6a2958b..ac39d3ec45b 100644
--- a/storage/innobase/include/trx0purge.h
+++ b/storage/innobase/include/trx0purge.h
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2017, 2021, MariaDB Corporation.
+Copyright (c) 2017, 2022, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -24,18 +24,14 @@ Purge old versions
Created 3/26/1996 Heikki Tuuri
*******************************************************/
-#ifndef trx0purge_h
-#define trx0purge_h
+#pragma once
-#include "trx0rseg.h"
+#include "trx0sys.h"
#include "que0types.h"
+#include "srw_lock.h"
#include <queue>
-/** A dummy undo record used as a return value when we have a whole undo log
-which needs no purge */
-extern trx_undo_rec_t trx_purge_dummy_rec;
-
/** Prepend the history list with an undo log.
Remove the undo log segment from the rseg slot if it is too big for reuse.
@param[in] trx transaction
@@ -123,17 +119,26 @@ private:
class purge_sys_t
{
public:
- /** latch protecting view, m_enabled */
- MY_ALIGNED(CACHE_LINE_SIZE)
- mutable rw_lock_t latch;
+ /** latch protecting view, m_enabled */
+ alignas(CPU_LEVEL1_DCACHE_LINESIZE) mutable srw_spin_lock latch;
private:
- /** The purge will not remove undo logs which are >= this view */
- MY_ALIGNED(CACHE_LINE_SIZE)
- ReadViewBase view;
- /** whether purge is enabled; protected by latch and std::atomic */
- std::atomic<bool> m_enabled;
- /** number of pending stop() calls without resume() */
- Atomic_counter<int32_t> m_paused;
+ /** Read view at the start of a purge batch. Any encountered index records
+ that are older than view will be removed. */
+ ReadViewBase view;
+ /** whether purge is enabled; protected by latch and std::atomic */
+ std::atomic<bool> m_enabled;
+ /** number of pending stop() calls without resume() */
+ Atomic_counter<uint32_t> m_paused;
+ /** number of stop_SYS() calls without resume_SYS() */
+ Atomic_counter<uint32_t> m_SYS_paused;
+ /** number of stop_FTS() calls without resume_FTS() */
+ Atomic_counter<uint32_t> m_FTS_paused;
+
+ /** latch protecting end_view */
+ alignas(CPU_LEVEL1_DCACHE_LINESIZE) srw_spin_lock_low end_latch;
+ /** Read view at the end of a purge batch (copied from view). Any undo pages
+ containing records older than end_view may be freed. */
+ ReadViewBase end_view;
public:
que_t* query; /*!< The query graph which will do the
parallelized purge operation */
@@ -184,7 +189,7 @@ public:
purge_pq_t purge_queue; /*!< Binary min-heap, ordered on
TrxUndoRsegs::trx_no. It is protected
by the pq_mutex */
- PQMutex pq_mutex; /*!< Mutex protecting purge_queue */
+ mysql_mutex_t pq_mutex; /*!< Mutex protecting purge_queue */
/** Undo tablespace file truncation (only accessed by the
srv_purge_coordinator_thread) */
@@ -235,34 +240,108 @@ public:
/** @return whether the purge tasks are active */
bool running() const;
- /** Stop purge during FLUSH TABLES FOR EXPORT */
+ /** Stop purge during FLUSH TABLES FOR EXPORT. */
void stop();
/** Resume purge at UNLOCK TABLES after FLUSH TABLES FOR EXPORT */
void resume();
- /** A wrapper around ReadView::changes_visible(). */
- bool changes_visible(trx_id_t id, const table_name_t &name) const
- {
- ut_ad(rw_lock_own(&latch, RW_LOCK_S));
- return view.changes_visible(id, name);
- }
+
+private:
+ void wait_SYS();
+ void wait_FTS();
+public:
+ /** Suspend purge in data dictionary tables */
+ void stop_SYS();
+ /** Resume purge in data dictionary tables */
+ static void resume_SYS(void *);
+ /** @return whether stop_SYS() is in effect */
+ bool must_wait_SYS() const { return m_SYS_paused; }
+ /** check stop_SYS() */
+ void check_stop_SYS() { if (must_wait_SYS()) wait_SYS(); }
+
+ /** Pause purge during a DDL operation that could drop FTS_ tables. */
+ void stop_FTS() { m_FTS_paused++; }
+ /** Resume purge after stop_FTS(). */
+ void resume_FTS() { ut_d(const auto p=) m_FTS_paused--; ut_ad(p); }
+ /** @return whether stop_SYS() is in effect */
+ bool must_wait_FTS() const { return m_FTS_paused; }
+ /** check stop_SYS() */
+ void check_stop_FTS() { if (must_wait_FTS()) wait_FTS(); }
+
+ /** Determine if the history of a transaction is purgeable.
+ @param trx_id transaction identifier
+ @return whether the history is purgeable */
+ TRANSACTIONAL_TARGET bool is_purgeable(trx_id_t trx_id) const;
+
/** A wrapper around ReadView::low_limit_no(). */
trx_id_t low_limit_no() const
{
-#if 0 /* Unfortunately we don't hold this assertion, see MDEV-22718. */
- ut_ad(rw_lock_own(&latch, RW_LOCK_S));
-#endif
+ /* This function may only be called by purge_coordinator_callback().
+
+ The purge coordinator task may call this without holding any latch,
+ because it is the only thread that may modify purge_sys.view.
+
+ Any other threads that access purge_sys.view must hold purge_sys.latch,
+ typically via purge_sys_t::view_guard. */
return view.low_limit_no();
}
/** A wrapper around trx_sys_t::clone_oldest_view(). */
+ template<bool also_end_view= false>
void clone_oldest_view()
{
- rw_lock_x_lock(&latch);
+ latch.wr_lock(SRW_LOCK_CALL);
trx_sys.clone_oldest_view(&view);
- rw_lock_x_unlock(&latch);
+ if (also_end_view)
+ (end_view= view).
+ clamp_low_limit_id(head.trx_no ? head.trx_no : tail.trx_no);
+ latch.wr_unlock();
}
+
+ /** Update end_view at the end of a purge batch. */
+ inline void clone_end_view();
+
+ struct view_guard
+ {
+ inline view_guard();
+ inline ~view_guard();
+
+ /** @return purge_sys.view */
+ inline const ReadViewBase &view() const;
+ };
+
+ struct end_view_guard
+ {
+ inline end_view_guard();
+ inline ~end_view_guard();
+
+ /** @return purge_sys.end_view */
+ inline const ReadViewBase &view() const;
+ };
+
+ /** Stop the purge thread and check n_ref_count of all auxiliary
+ and common table associated with the fts table.
+ @param table parent FTS table
+ @param already_stopped True indicates purge threads were
+ already stopped */
+ void stop_FTS(const dict_table_t &table, bool already_stopped=false);
};
/** The global data structure coordinating a purge */
extern purge_sys_t purge_sys;
-#endif /* trx0purge_h */
+purge_sys_t::view_guard::view_guard()
+{ purge_sys.latch.rd_lock(SRW_LOCK_CALL); }
+
+purge_sys_t::view_guard::~view_guard()
+{ purge_sys.latch.rd_unlock(); }
+
+const ReadViewBase &purge_sys_t::view_guard::view() const
+{ return purge_sys.view; }
+
+purge_sys_t::end_view_guard::end_view_guard()
+{ purge_sys.end_latch.rd_lock(); }
+
+purge_sys_t::end_view_guard::~end_view_guard()
+{ purge_sys.end_latch.rd_unlock(); }
+
+const ReadViewBase &purge_sys_t::end_view_guard::view() const
+{ return purge_sys.end_view; }
diff --git a/storage/innobase/include/trx0rec.h b/storage/innobase/include/trx0rec.h
index 66b2220a457..58ec5ab1707 100644
--- a/storage/innobase/include/trx0rec.h
+++ b/storage/innobase/include/trx0rec.h
@@ -24,8 +24,7 @@ Transaction undo log record
Created 3/26/1996 Heikki Tuuri
*******************************************************/
-#ifndef trx0rec_h
-#define trx0rec_h
+#pragma once
#include "trx0types.h"
#include "row0types.h"
@@ -37,29 +36,31 @@ Created 3/26/1996 Heikki Tuuri
/***********************************************************************//**
Copies the undo record to the heap.
-@return own: copy of undo log record */
-UNIV_INLINE
-trx_undo_rec_t*
-trx_undo_rec_copy(
-/*==============*/
- const trx_undo_rec_t* undo_rec, /*!< in: undo log record */
- mem_heap_t* heap); /*!< in: heap where copied */
-/**********************************************************************//**
-Reads the undo log record type.
-@return record type */
-UNIV_INLINE
-ulint
-trx_undo_rec_get_type(
-/*==================*/
- const trx_undo_rec_t* undo_rec); /*!< in: undo log record */
+@param undo_rec record in an undo log page
+@param heap memory heap
+@return copy of undo_rec
+@retval nullptr if the undo log record is corrupted */
+inline trx_undo_rec_t* trx_undo_rec_copy(const trx_undo_rec_t *undo_rec,
+ mem_heap_t *heap)
+{
+ const size_t offset= ut_align_offset(undo_rec, srv_page_size);
+ const size_t end= mach_read_from_2(undo_rec);
+ if (end <= offset || end >= srv_page_size - FIL_PAGE_DATA_END)
+ return nullptr;
+ const size_t len= end - offset;
+ trx_undo_rec_t *rec= static_cast<trx_undo_rec_t*>
+ (mem_heap_dup(heap, undo_rec, len));
+ mach_write_to_2(rec, len);
+ return rec;
+}
+
/**********************************************************************//**
Reads the undo log record number.
@return undo no */
-UNIV_INLINE
-undo_no_t
-trx_undo_rec_get_undo_no(
-/*=====================*/
- const trx_undo_rec_t* undo_rec); /*!< in: undo log record */
+inline undo_no_t trx_undo_rec_get_undo_no(const trx_undo_rec_t *undo_rec)
+{
+ return mach_u64_read_much_compressed(undo_rec + 3);
+}
/**********************************************************************//**
Returns the start of the undo record data area. */
@@ -69,10 +70,10 @@ Returns the start of the undo record data area. */
/**********************************************************************//**
Reads from an undo log record the general parameters.
@return remaining part of undo log record after reading these values */
-byte*
+const byte*
trx_undo_rec_get_pars(
/*==================*/
- trx_undo_rec_t* undo_rec, /*!< in: undo log record */
+ const trx_undo_rec_t* undo_rec, /*!< in: undo log record */
ulint* type, /*!< out: undo record type:
TRX_UNDO_INSERT_REC, ... */
ulint* cmpl_info, /*!< out: compiler info, relevant only
@@ -82,13 +83,14 @@ trx_undo_rec_get_pars(
undo_no_t* undo_no, /*!< out: undo log record number */
table_id_t* table_id) /*!< out: table id */
MY_ATTRIBUTE((nonnull));
+
/*******************************************************************//**
Builds a row reference from an undo log record.
@return pointer to remaining part of undo record */
-byte*
+const byte*
trx_undo_rec_get_row_ref(
/*=====================*/
- byte* ptr, /*!< in: remaining part of a copy of an undo log
+ const byte* ptr, /*!< in: remaining part of a copy of an undo log
record, at the start of the row reference;
NOTE that this copy of the undo log record must
be preserved as long as the row reference is
@@ -96,8 +98,9 @@ trx_undo_rec_get_row_ref(
record! */
dict_index_t* index, /*!< in: clustered index */
const dtuple_t**ref, /*!< out, own: row reference */
- mem_heap_t* heap); /*!< in: memory heap from which the memory
+ mem_heap_t* heap) /*!< in: memory heap from which the memory
needed is allocated */
+ MY_ATTRIBUTE((nonnull));
/**********************************************************************//**
Reads from an undo log update record the system field values of the old
version.
@@ -178,53 +181,59 @@ trx_undo_report_row_operation(
is being called purge view and we would like to get the purge record
even it is in the purge view (in normal case, it will return without
fetching the purge record */
-#define TRX_UNDO_PREV_IN_PURGE 0x1
+static constexpr ulint TRX_UNDO_PREV_IN_PURGE = 1;
/** This tells trx_undo_prev_version_build() to fetch the old value in
the undo log (which is the after image for an update) */
-#define TRX_UNDO_GET_OLD_V_VALUE 0x2
+static constexpr ulint TRX_UNDO_GET_OLD_V_VALUE = 2;
-/*******************************************************************//**
-Build a previous version of a clustered index record. The caller must
-hold a latch on the index page of the clustered index record.
-@retval true if previous version was built, or if it was an insert
-or the table has been rebuilt
-@retval false if the previous version is earlier than purge_view,
-which means that it may have been removed */
-bool
+/** indicate a call from row_vers_old_has_index_entry() */
+static constexpr ulint TRX_UNDO_CHECK_PURGEABILITY = 4;
+
+/** Build a previous version of a clustered index record. The caller
+must hold a latch on the index page of the clustered index record.
+@param rec version of a clustered index record
+@param index clustered index
+@param offsets rec_get_offsets(rec, index)
+@param heap memory heap from which the memory needed is
+ allocated
+@param old_vers previous version or NULL if rec is the
+ first inserted version, or if history data
+ has been deleted (an error), or if the purge
+ could have removed the version
+ though it has not yet done so
+@param v_heap memory heap used to create vrow
+ dtuple if it is not yet created. This heap
+ diffs from "heap" above in that it could be
+ prebuilt->old_vers_heap for selection
+@param vrow virtual column info, if any
+@param v_status status determine if it is going into this
+ function by purge thread or not.
+ And if we read "after image" of undo log
+@return error code
+@retval DB_SUCCESS if previous version was successfully built,
+or if it was an insert or the undo record refers to the table before rebuild
+@retval DB_MISSING_HISTORY if the history is missing */
+dberr_t
trx_undo_prev_version_build(
-/*========================*/
- const rec_t* index_rec,/*!< in: clustered index record in the
- index tree */
- mtr_t* index_mtr,/*!< in: mtr which contains the latch to
- index_rec page and purge_view */
- const rec_t* rec, /*!< in: version of a clustered index record */
- dict_index_t* index, /*!< in: clustered index */
- rec_offs* offsets,/*!< in/out: rec_get_offsets(rec, index) */
- mem_heap_t* heap, /*!< in: memory heap from which the memory
- needed is allocated */
- rec_t** old_vers,/*!< out, own: previous version, or NULL if
- rec is the first inserted version, or if
- history data has been deleted */
- mem_heap_t* v_heap, /* !< in: memory heap used to create vrow
- dtuple if it is not yet created. This heap
- diffs from "heap" above in that it could be
- prebuilt->old_vers_heap for selection */
- dtuple_t** vrow, /*!< out: virtual column info, if any */
+ const rec_t *rec,
+ dict_index_t *index,
+ rec_offs *offsets,
+ mem_heap_t *heap,
+ rec_t **old_vers,
+ mem_heap_t *v_heap,
+ dtuple_t **vrow,
ulint v_status);
- /*!< in: status determine if it is going
- into this function by purge thread or not.
- And if we read "after image" of undo log */
/** Read from an undo log record a non-virtual column value.
-@param[in,out] ptr pointer to remaining part of the undo record
-@param[in,out] field stored field
-@param[in,out] len length of the field, or UNIV_SQL_NULL
-@param[in,out] orig_len original length of the locally stored part
+@param ptr pointer to remaining part of the undo record
+@param field stored field
+@param len length of the field, or UNIV_SQL_NULL
+@param orig_len original length of the locally stored part
of an externally stored column, or 0
@return remaining part of undo log record after reading these values */
-byte *trx_undo_rec_get_col_val(const byte *ptr, const byte **field,
- uint32_t *len, uint32_t *orig_len);
+const byte *trx_undo_rec_get_col_val(const byte *ptr, const byte **field,
+ uint32_t *len, uint32_t *orig_len);
/** Read virtual column value from undo log
@param[in] table the table
@@ -261,9 +270,22 @@ trx_undo_read_v_idx(
compilation info multiplied by 16 is ORed to this value in an undo log
record */
-#define TRX_UNDO_RENAME_TABLE 9 /*!< RENAME TABLE */
-#define TRX_UNDO_INSERT_METADATA 10 /*!< insert a metadata
- pseudo-record for instant ALTER */
+/** Undo log records for DDL operations
+
+Note: special rollback and purge triggers exist for SYS_INDEXES records:
+@see dict_drop_index_tree() */
+enum trx_undo_ddl_type
+{
+ /** RENAME TABLE (logging the old table name).
+
+ Because SYS_TABLES has PRIMARY KEY(NAME), the row-level undo log records
+ for SYS_TABLES cannot be distinguished from DROP TABLE, CREATE TABLE. */
+ TRX_UNDO_RENAME_TABLE= 9,
+ /** insert a metadata pseudo-record for instant ALTER TABLE */
+ TRX_UNDO_INSERT_METADATA= 10
+};
+
+/* DML operations */
#define TRX_UNDO_INSERT_REC 11 /* fresh insert into clustered index */
#define TRX_UNDO_UPD_EXIST_REC 12 /* update of a non-delete-marked
record */
@@ -272,6 +294,13 @@ record */
fields of the record can change */
#define TRX_UNDO_DEL_MARK_REC 14 /* delete marking of a record; fields
do not change */
+/** Bulk insert operation. It is written only when the table is
+under exclusive lock and the clustered index root page latch is being held,
+and the clustered index is empty. Rollback will empty the table and
+free the leaf segment of all indexes, re-create the new
+leaf segment and re-initialize the root page alone. */
+#define TRX_UNDO_EMPTY 15
+
#define TRX_UNDO_CMPL_INFO_MULT 16U /* compilation info is multiplied by
this and ORed to the type above */
#define TRX_UNDO_UPD_EXTERN 128U /* This bit can be ORed to type_cmpl
@@ -291,7 +320,3 @@ inline table_id_t trx_undo_rec_get_table_id(const trx_undo_rec_t *rec)
mach_read_next_much_compressed(&rec);
return mach_read_next_much_compressed(&rec);
}
-
-#include "trx0rec.inl"
-
-#endif /* trx0rec_h */
diff --git a/storage/innobase/include/trx0roll.h b/storage/innobase/include/trx0roll.h
index 6a562dcb425..9ef9ebe93b2 100644
--- a/storage/innobase/include/trx0roll.h
+++ b/storage/innobase/include/trx0roll.h
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2015, 2020, MariaDB Corporation.
+Copyright (c) 2015, 2021, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -34,14 +34,6 @@ Created 3/26/1996 Heikki Tuuri
extern bool trx_rollback_is_active;
extern const trx_t* trx_roll_crash_recv_trx;
-/*******************************************************************//**
-Returns a transaction savepoint taken at this point in time.
-@return savepoint */
-trx_savept_t
-trx_savept_take(
-/*============*/
- trx_t* trx); /*!< in: transaction */
-
/** Report progress when rolling back a row of a recovered transaction. */
void trx_roll_report_progress();
/*******************************************************************//**
@@ -58,11 +50,8 @@ Rollback or clean up any incomplete transactions which were
encountered in crash recovery. If the transaction already was
committed, then we clean up a possible insert undo log. If the
transaction was not yet committed, then we roll it back.
-Note: this is done in a background thread.
-@return a dummy parameter */
-extern "C"
-os_thread_ret_t
-DECLARE_THREAD(trx_rollback_all_recovered)(void*);
+Note: this is done in a background thread. */
+void trx_rollback_all_recovered(void*);
/*********************************************************************//**
Creates a rollback command node struct.
@return own: rollback node struct */
@@ -141,15 +130,7 @@ trx_release_savepoint_for_mysql(
trx_t* trx, /*!< in: transaction handle */
const char* savepoint_name) /*!< in: savepoint name */
MY_ATTRIBUTE((nonnull, warn_unused_result));
-/*******************************************************************//**
-Frees savepoint structs starting from savep. */
-void
-trx_roll_savepoints_free(
-/*=====================*/
- trx_t* trx, /*!< in: transaction handle */
- trx_named_savept_t* savep); /*!< in: free all savepoints > this one;
- if this is NULL, free all savepoints
- of trx */
+
/** Rollback node states */
enum roll_node_state {
ROLL_NODE_NONE = 0, /*!< Unknown state */
diff --git a/storage/innobase/include/trx0rseg.h b/storage/innobase/include/trx0rseg.h
index 96655c7020f..1d95b7d2e7a 100644
--- a/storage/innobase/include/trx0rseg.h
+++ b/storage/innobase/include/trx0rseg.h
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2017, 2021, MariaDB Corporation.
+Copyright (c) 2017, 2022, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -24,67 +24,28 @@ Rollback segment
Created 3/26/1996 Heikki Tuuri
*******************************************************/
-#ifndef trx0rseg_h
-#define trx0rseg_h
-
-#include "trx0sys.h"
+#pragma once
+#include "trx0types.h"
#include "fut0lst.h"
-/** Gets a rollback segment header.
-@param[in] space space where placed
-@param[in] page_no page number of the header
-@param[in,out] mtr mini-transaction
-@return rollback segment header, page x-latched */
-UNIV_INLINE
-buf_block_t*
-trx_rsegf_get(fil_space_t* space, uint32_t page_no, mtr_t* mtr);
-
-/** Gets a newly created rollback segment header.
-@param[in] space space where placed
-@param[in] page_no page number of the header
-@param[in,out] mtr mini-transaction
-@return rollback segment header, page x-latched */
-UNIV_INLINE
-buf_block_t*
-trx_rsegf_get_new(
- ulint space,
- uint32_t page_no,
- mtr_t* mtr);
-
/** Create a rollback segment header.
-@param[in,out] space system, undo, or temporary tablespace
-@param[in] rseg_id rollback segment identifier
-@param[in] max_trx_id new value of TRX_RSEG_MAX_TRX_ID
-@param[in,out] sys_header the TRX_SYS page (NULL for temporary rseg)
-@param[in,out] mtr mini-transaction
+@param[in,out] space system, undo, or temporary tablespace
+@param[in] rseg_id rollback segment identifier
+@param[in] max_trx_id new value of TRX_RSEG_MAX_TRX_ID
+@param[in,out] mtr mini-transaction
+@param[out] err error code
@return the created rollback segment
-@retval NULL on failure */
-buf_block_t*
-trx_rseg_header_create(
- fil_space_t* space,
- ulint rseg_id,
- trx_id_t max_trx_id,
- buf_block_t* sys_header,
- mtr_t* mtr);
+@retval nullptr on failure */
+buf_block_t *trx_rseg_header_create(fil_space_t *space, ulint rseg_id,
+ trx_id_t max_trx_id, mtr_t *mtr,
+ dberr_t *err)
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
/** Initialize or recover the rollback segments at startup. */
dberr_t trx_rseg_array_init();
-/** Free a rollback segment in memory. */
-void
-trx_rseg_mem_free(trx_rseg_t* rseg);
-
-/** Create a persistent rollback segment.
-@param[in] space_id system or undo tablespace id
-@return pointer to new rollback segment
-@retval NULL on failure */
-trx_rseg_t*
-trx_rseg_create(ulint space_id)
- MY_ATTRIBUTE((warn_unused_result));
-
/** Create the temporary rollback segments. */
-void
-trx_temp_rseg_create();
+dberr_t trx_temp_rseg_create(mtr_t *mtr);
/* Number of undo log slots in a rollback segment file copy */
#define TRX_RSEG_N_SLOTS (srv_page_size / 16)
@@ -93,34 +54,117 @@ trx_temp_rseg_create();
#define TRX_RSEG_MAX_N_TRXS (TRX_RSEG_N_SLOTS / 2)
/** The rollback segment memory object */
-struct trx_rseg_t {
- /*--------------------------------------------------------*/
- /** rollback segment id == the index of its slot in the trx
- system file copy */
- ulint id;
+struct alignas(CPU_LEVEL1_DCACHE_LINESIZE) trx_rseg_t
+{
+ /** tablespace containing the rollback segment; constant after init() */
+ fil_space_t *space;
+ /** latch protecting everything except page_no, space */
+ srw_spin_lock latch;
+ /** rollback segment header page number; constant after init() */
+ uint32_t page_no;
+ /** length of the TRX_RSEG_HISTORY list (number of transactions) */
+ uint32_t history_size;
- /** mutex protecting the fields in this struct except id,space,page_no
- which are constant */
- RsegMutex mutex;
+ /** Last known transaction that has not been purged yet,
+ or 0 if everything has been purged. */
+ trx_id_t needs_purge;
- /** space where the rollback segment header is placed */
- fil_space_t* space;
+private:
+ /** Reference counter to track is_persistent() transactions,
+ with SKIP flag. */
+ std::atomic<uint32_t> ref;
- /** page number of the rollback segment header */
- uint32_t page_no;
+ /** Whether undo tablespace truncation is pending */
+ static constexpr uint32_t SKIP= 1;
+ /** Transaction reference count multiplier */
+ static constexpr uint32_t REF= 2;
- /** current size in pages */
- uint32_t curr_size;
+ uint32_t ref_load() const { return ref.load(std::memory_order_relaxed); }
- /*--------------------------------------------------------*/
- /* Fields for undo logs */
- /** List of undo logs */
- UT_LIST_BASE_NODE_T(trx_undo_t) undo_list;
+ /** Set the SKIP bit */
+ void ref_set_skip()
+ {
+ static_assert(SKIP == 1U, "compatibility");
+#if defined __GNUC__ && (defined __i386__ || defined __x86_64__)
+ __asm__ __volatile__("lock btsl $0, %0" : "+m" (ref));
+#elif defined _MSC_VER && (defined _M_IX86 || defined _M_X64)
+ _interlockedbittestandset(reinterpret_cast<volatile long*>(&ref), 0);
+#else
+ ref.fetch_or(SKIP, std::memory_order_relaxed);
+#endif
+ }
+ /** Clear a bit in ref */
+ void ref_reset_skip()
+ {
+ static_assert(SKIP == 1U, "compatibility");
+#if defined __GNUC__ && (defined __i386__ || defined __x86_64__)
+ __asm__ __volatile__("lock btrl $0, %0" : "+m" (ref));
+#elif defined _MSC_VER && (defined _M_IX86 || defined _M_X64)
+ _interlockedbittestandreset(reinterpret_cast<volatile long*>(&ref), 0);
+#else
+ ref.fetch_and(~SKIP, std::memory_order_relaxed);
+#endif
+ }
+
+public:
- /** List of undo log segments cached for fast reuse */
- UT_LIST_BASE_NODE_T(trx_undo_t) undo_cached;
+ /** Initialize the fields that are not zero-initialized. */
+ void init(fil_space_t *space, uint32_t page);
+ /** Reinitialize the fields on undo tablespace truncation. */
+ void reinit(uint32_t page);
+ /** Clean up. */
+ void destroy();
- /*--------------------------------------------------------*/
+ /** Note that undo tablespace truncation was started. */
+ void set_skip_allocation() { ut_ad(is_persistent()); ref_set_skip(); }
+ /** Note that undo tablespace truncation was completed. */
+ void clear_skip_allocation()
+ {
+ ut_ad(is_persistent());
+#if defined DBUG_OFF
+ ref_reset_skip();
+#else
+ ut_d(auto r=) ref.fetch_and(~SKIP, std::memory_order_relaxed);
+ ut_ad(r == SKIP);
+#endif
+ }
+ /** @return whether the segment is marked for undo truncation */
+ bool skip_allocation() const
+ { return ref.load(std::memory_order_acquire) & SKIP; }
+ /** Increment the reference count */
+ void acquire()
+ { ut_d(auto r=) ref.fetch_add(REF); ut_ad(!(r & SKIP)); }
+ /** Increment the reference count if possible
+ @retval true if the reference count was incremented
+ @retval false if skip_allocation() holds */
+ bool acquire_if_available()
+ {
+ uint32_t r= 0;
+ while (!ref.compare_exchange_weak(r, r + REF,
+ std::memory_order_relaxed,
+ std::memory_order_relaxed))
+ if (r & SKIP)
+ return false;
+ return true;
+ }
+
+ /** Decrement the reference count */
+ void release()
+ {
+ ut_d(const auto r=)
+ ref.fetch_sub(REF, std::memory_order_relaxed);
+ ut_ad(r >= REF);
+ }
+ /** @return whether references exist */
+ bool is_referenced() const { return ref_load() >= REF; }
+
+ /** current size in pages */
+ uint32_t curr_size;
+
+ /** List of undo logs (transactions) */
+ UT_LIST_BASE_NODE_T(trx_undo_t) undo_list;
+ /** List of undo log segments cached for fast reuse */
+ UT_LIST_BASE_NODE_T(trx_undo_t) undo_cached;
/** Last not yet purged undo log header; FIL_NULL if all purged */
uint32_t last_page_no;
@@ -128,20 +172,6 @@ struct trx_rseg_t {
/** trx_t::no | last_offset << 48 */
uint64_t last_commit_and_offset;
- /** Last known transaction that has not been purged yet,
- or 0 if everything has been purged. */
- trx_id_t needs_purge;
-
- /** Number of active (non-committed) transactions associated with a
- an is_persistent() rollback segment. Needed for protecting
- trx->rsegs.m_redo.rseg assignments
- before trx->rsegs.m_redo.undo has been assigned. */
- ulint trx_ref_count;
-
- /** whether undo log truncation was initiated, and transactions
- cannot be allocated in this is_persistent() rollback segment */
- bool skip_allocation;
-
/** @return the commit ID of the last committed transaction */
trx_id_t last_trx_no() const
{ return last_commit_and_offset & ((1ULL << 48) - 1); }
@@ -154,24 +184,27 @@ struct trx_rseg_t {
last_commit_and_offset= static_cast<uint64_t>(last_offset) << 48 | trx_no;
}
- /** @return whether the rollback segment is persistent */
- bool is_persistent() const
- {
- ut_ad(space == fil_system.temp_space
- || space == fil_system.sys_space
- || (srv_undo_space_id_start > 0
- && space->id >= srv_undo_space_id_start
- && space->id <= srv_undo_space_id_start
- + TRX_SYS_MAX_UNDO_SPACES));
- ut_ad(space == fil_system.temp_space
- || space == fil_system.sys_space
- || (srv_undo_space_id_start > 0
- && space->id >= srv_undo_space_id_start
- && space->id <= srv_undo_space_id_start
- + srv_undo_tablespaces_open)
- || !srv_was_started);
- return(space->id != SRV_TMP_SPACE_ID);
- }
+ /** @return the page identifier */
+ page_id_t page_id() const { return page_id_t{space->id, page_no}; }
+
+ /** @return the rollback segment header page, exclusively latched */
+ buf_block_t *get(mtr_t *mtr, dberr_t *err) const;
+
+ /** @return whether the rollback segment is persistent */
+ bool is_persistent() const
+ {
+ ut_ad(space == fil_system.temp_space || space == fil_system.sys_space ||
+ (srv_undo_space_id_start > 0 &&
+ space->id >= srv_undo_space_id_start &&
+ space->id <= srv_undo_space_id_start + TRX_SYS_MAX_UNDO_SPACES));
+ ut_ad(space == fil_system.temp_space || space == fil_system.sys_space ||
+ !srv_was_started ||
+ (srv_undo_space_id_start > 0 &&
+ space->id >= srv_undo_space_id_start
+ && space->id <= srv_undo_space_id_start +
+ srv_undo_tablespaces_open));
+ return space->id != SRV_TMP_SPACE_ID;
+ }
};
/* Undo log segment slot in a rollback segment header */
@@ -212,32 +245,8 @@ If no binlog information is present, the first byte is NUL. */
#define TRX_RSEG_BINLOG_NAME_LEN 512
#ifdef WITH_WSREP
-/** The offset to WSREP XID headers */
-#define TRX_RSEG_WSREP_XID_INFO TRX_RSEG_MAX_TRX_ID + 16 + 512
-
-/** WSREP XID format (1 if present and valid, 0 if not present) */
-#define TRX_RSEG_WSREP_XID_FORMAT TRX_RSEG_WSREP_XID_INFO
-/** WSREP XID GTRID length */
-#define TRX_RSEG_WSREP_XID_GTRID_LEN TRX_RSEG_WSREP_XID_INFO + 4
-/** WSREP XID bqual length */
-#define TRX_RSEG_WSREP_XID_BQUAL_LEN TRX_RSEG_WSREP_XID_INFO + 8
-/** WSREP XID data (XIDDATASIZE bytes) */
-#define TRX_RSEG_WSREP_XID_DATA TRX_RSEG_WSREP_XID_INFO + 12
-#endif /* WITH_WSREP*/
-
-/*-------------------------------------------------------------*/
+# include "trx0xa.h"
-/** Read the page number of an undo log slot.
-@param[in] rseg_header rollback segment header
-@param[in] n slot number */
-inline uint32_t trx_rsegf_get_nth_undo(const buf_block_t *rseg_header, ulint n)
-{
- ut_ad(n < TRX_RSEG_N_SLOTS);
- return mach_read_from_4(TRX_RSEG + TRX_RSEG_UNDO_SLOTS +
- n * TRX_RSEG_SLOT_SIZE + rseg_header->frame);
-}
-
-#ifdef WITH_WSREP
/** Update the WSREP XID information in rollback segment header.
@param[in,out] rseg_header rollback segment header
@param[in] xid WSREP XID
@@ -263,6 +272,16 @@ void trx_rseg_update_wsrep_checkpoint(const XID* xid);
bool trx_rseg_read_wsrep_checkpoint(XID& xid);
#endif /* WITH_WSREP */
+/** Read the page number of an undo log slot.
+@param[in] rseg_header rollback segment header
+@param[in] n slot number */
+inline uint32_t trx_rsegf_get_nth_undo(const buf_block_t *rseg_header, ulint n)
+{
+ ut_ad(n < TRX_RSEG_N_SLOTS);
+ return mach_read_from_4(TRX_RSEG + TRX_RSEG_UNDO_SLOTS +
+ n * TRX_RSEG_SLOT_SIZE + rseg_header->page.frame);
+}
+
/** Upgrade a rollback segment header page to MariaDB 10.3 format.
@param[in,out] rseg_header rollback segment header page
@param[in,out] mtr mini-transaction */
@@ -277,7 +296,3 @@ up to which replication has proceeded.
@param[in,out] mtr mini-transaction */
void trx_rseg_update_binlog_offset(buf_block_t *rseg_header, const trx_t *trx,
mtr_t *mtr);
-
-#include "trx0rseg.inl"
-
-#endif
diff --git a/storage/innobase/include/trx0sys.h b/storage/innobase/include/trx0sys.h
index e033a3e1fe4..245b981974b 100644
--- a/storage/innobase/include/trx0sys.h
+++ b/storage/innobase/include/trx0sys.h
@@ -24,24 +24,23 @@ Transaction system
Created 3/26/1996 Heikki Tuuri
*******************************************************/
-#ifndef trx0sys_h
-#define trx0sys_h
-
+#pragma once
#include "buf0buf.h"
#include "fil0fil.h"
-#include "trx0types.h"
+#include "trx0rseg.h"
#include "mem0mem.h"
#include "mtr0mtr.h"
#include "ut0byte.h"
#include "ut0lst.h"
#include "read0types.h"
#include "page0types.h"
-#include "ut0mutex.h"
#include "trx0trx.h"
-#ifdef WITH_WSREP
-#include "trx0xa.h"
-#endif /* WITH_WSREP */
#include "ilist.h"
+#include "my_cpu.h"
+
+#ifdef UNIV_PFS_MUTEX
+extern mysql_pfs_key_t trx_sys_mutex_key;
+#endif
/** Checks if a page address is the trx sys header page.
@param[in] page_id page id
@@ -53,9 +52,8 @@ inline bool trx_sys_hdr_page(const page_id_t page_id)
/*****************************************************************//**
Creates and initializes the transaction system at the database creation. */
-void
-trx_sys_create_sys_pages(void);
-/*==========================*/
+dberr_t trx_sys_create_sys_pages(mtr_t *mtr);
+
/** Find an available rollback segment.
@param[in] sys_header
@return an unallocated rollback segment slot in the TRX_SYS header
@@ -68,10 +66,8 @@ trx_sys_rseg_find_free(const buf_block_t* sys_header);
@retval NULL if the page cannot be read */
inline buf_block_t *trx_sysf_get(mtr_t* mtr, bool rw= true)
{
- buf_block_t* block = buf_page_get(page_id_t(TRX_SYS_SPACE, TRX_SYS_PAGE_NO),
- 0, rw ? RW_X_LATCH : RW_S_LATCH, mtr);
- ut_d(if (block) buf_block_dbg_add_level(block, SYNC_TRX_SYS_HEADER);)
- return block;
+ return buf_page_get(page_id_t(TRX_SYS_SPACE, TRX_SYS_PAGE_NO),
+ 0, rw ? RW_X_LATCH : RW_S_LATCH, mtr);
}
#ifdef UNIV_DEBUG
@@ -134,9 +130,6 @@ trx_sys_print_mysql_binlog_offset();
bool
trx_sys_create_rsegs();
-/** The automatically created system rollback segment has this id */
-#define TRX_SYS_SYSTEM_RSEG_ID 0
-
/** The offset of the transaction system header on the page */
#define TRX_SYS FSEG_PAGE_DATA
@@ -156,13 +149,6 @@ from older MySQL or MariaDB versions. */
/*!< the start of the array of
rollback segment specification
slots */
-/*------------------------------------------------------------- @} */
-
-/** The number of rollback segments; rollback segment id must fit in
-the 7 bits reserved for it in DB_ROLL_PTR. */
-#define TRX_SYS_N_RSEGS 128
-/** Maximum number of undo tablespaces (not counting the system tablespace) */
-#define TRX_SYS_MAX_UNDO_SPACES (TRX_SYS_N_RSEGS - 1)
/* Rollback segment specification slot offsets */
@@ -185,7 +171,7 @@ trx_sysf_rseg_get_space(const buf_block_t* sys_header, ulint rseg_id)
ut_ad(rseg_id < TRX_SYS_N_RSEGS);
return mach_read_from_4(TRX_SYS + TRX_SYS_RSEGS + TRX_SYS_RSEG_SPACE
+ rseg_id * TRX_SYS_RSEG_SLOT_SIZE
- + sys_header->frame);
+ + sys_header->page.frame);
}
/** Read the page number of a rollback segment slot.
@@ -198,7 +184,7 @@ trx_sysf_rseg_get_page_no(const buf_block_t *sys_header, ulint rseg_id)
ut_ad(rseg_id < TRX_SYS_N_RSEGS);
return mach_read_from_4(TRX_SYS + TRX_SYS_RSEGS + TRX_SYS_RSEG_PAGE_NO +
rseg_id * TRX_SYS_RSEG_SLOT_SIZE +
- sys_header->frame);
+ sys_header->page.frame);
}
/** Maximum length of MySQL binlog file name, in bytes.
@@ -344,16 +330,14 @@ trx_t* current_trx();
struct rw_trx_hash_element_t
{
- rw_trx_hash_element_t(): trx(0)
+ rw_trx_hash_element_t()
{
- mutex_create(LATCH_ID_RW_TRX_HASH_ELEMENT, &mutex);
+ memset(reinterpret_cast<void*>(this), 0, sizeof *this);
+ mutex.init();
}
- ~rw_trx_hash_element_t()
- {
- mutex_free(&mutex);
- }
+ ~rw_trx_hash_element_t() { mutex.destroy(); }
trx_id_t id; /* lf_hash_init() relies on this to be first in the struct */
@@ -366,7 +350,7 @@ struct rw_trx_hash_element_t
*/
Atomic_counter<trx_id_t> no;
trx_t *trx;
- ib_mutex_t mutex;
+ srw_mutex mutex;
};
@@ -515,12 +499,12 @@ class rw_trx_hash_t
ut_ad(!trx->read_only || !trx->rsegs.m_redo.rseg);
ut_ad(!trx->is_autocommit_non_locking());
/* trx->state can be anything except TRX_STATE_NOT_STARTED */
- mutex_enter(&trx->mutex);
+ ut_d(trx->mutex_lock());
ut_ad(trx_state_eq(trx, TRX_STATE_ACTIVE) ||
trx_state_eq(trx, TRX_STATE_COMMITTED_IN_MEMORY) ||
trx_state_eq(trx, TRX_STATE_PREPARED_RECOVERED) ||
trx_state_eq(trx, TRX_STATE_PREPARED));
- mutex_exit(&trx->mutex);
+ ut_d(trx->mutex_unlock());
}
@@ -535,10 +519,11 @@ class rw_trx_hash_t
static my_bool debug_iterator(rw_trx_hash_element_t *element,
debug_iterator_arg<T> *arg)
{
- mutex_enter(&element->mutex);
+ element->mutex.wr_lock();
if (element->trx)
validate_element(element->trx);
- mutex_exit(&element->mutex);
+ element->mutex.wr_unlock();
+ ut_ad(element->id < element->no);
return arg->action(element, arg->argument);
}
#endif
@@ -591,10 +576,10 @@ public:
the transaction may get committed before this method returns.
With do_ref_count == false the caller may dereference returned trx pointer
- only if lock_sys.mutex was acquired before calling find().
+ only if lock_sys.latch was acquired before calling find().
With do_ref_count == true caller may dereference trx even if it is not
- holding lock_sys.mutex. Caller is responsible for calling
+ holding lock_sys.latch. Caller is responsible for calling
trx->release_reference() when it is done playing with trx.
Ideally this method should get caller rw_trx_hash_pins along with trx
@@ -640,7 +625,7 @@ public:
sizeof(trx_id_t)));
if (element)
{
- mutex_enter(&element->mutex);
+ element->mutex.wr_lock();
lf_hash_search_unpin(pins);
if ((trx= element->trx)) {
DBUG_ASSERT(trx_id == trx->id);
@@ -655,16 +640,13 @@ public:
trx->mutex is released, and it will have to be rechecked
by the caller after reacquiring the mutex.
*/
- trx_mutex_enter(trx);
- const trx_state_t state= trx->state;
- trx_mutex_exit(trx);
- if (state == TRX_STATE_COMMITTED_IN_MEMORY)
- trx= NULL;
+ if (trx->state == TRX_STATE_COMMITTED_IN_MEMORY)
+ trx= nullptr;
else
trx->reference();
}
}
- mutex_exit(&element->mutex);
+ element->mutex.wr_unlock();
}
if (!caller_trx)
lf_hash_put_pins(pins);
@@ -698,9 +680,9 @@ public:
void erase(trx_t *trx)
{
ut_d(validate_element(trx));
- mutex_enter(&trx->rw_trx_hash_element->mutex);
- trx->rw_trx_hash_element->trx= 0;
- mutex_exit(&trx->rw_trx_hash_element->mutex);
+ trx->rw_trx_hash_element->mutex.wr_lock();
+ trx->rw_trx_hash_element->trx= nullptr;
+ trx->rw_trx_hash_element->mutex.wr_unlock();
int res= lf_hash_delete(&hash, get_pins(trx),
reinterpret_cast<const void*>(&trx->id),
sizeof(trx_id_t));
@@ -734,12 +716,12 @@ public:
May return element with committed transaction. If caller doesn't like to
see committed transactions, it has to skip those under element mutex:
- mutex_enter(&element->mutex);
+ element->mutex.wr_lock();
if (trx_t trx= element->trx)
{
// trx is protected against commit in this branch
}
- mutex_exit(&element->mutex);
+ element->mutex.wr_unlock();
May miss concurrently inserted transactions.
@@ -800,53 +782,53 @@ public:
class thread_safe_trx_ilist_t
{
public:
- void create() { mutex_create(LATCH_ID_TRX_SYS, &mutex); }
- void close() { mutex_free(&mutex); }
+ void create() { mysql_mutex_init(trx_sys_mutex_key, &mutex, nullptr); }
+ void close() { mysql_mutex_destroy(&mutex); }
bool empty() const
{
- mutex_enter(&mutex);
+ mysql_mutex_lock(&mutex);
auto result= trx_list.empty();
- mutex_exit(&mutex);
+ mysql_mutex_unlock(&mutex);
return result;
}
void push_front(trx_t &trx)
{
- mutex_enter(&mutex);
+ mysql_mutex_lock(&mutex);
trx_list.push_front(trx);
- mutex_exit(&mutex);
+ mysql_mutex_unlock(&mutex);
}
void remove(trx_t &trx)
{
- mutex_enter(&mutex);
+ mysql_mutex_lock(&mutex);
trx_list.remove(trx);
- mutex_exit(&mutex);
+ mysql_mutex_unlock(&mutex);
}
template <typename Callable> void for_each(Callable &&callback) const
{
- mutex_enter(&mutex);
+ mysql_mutex_lock(&mutex);
for (const auto &trx : trx_list)
callback(trx);
- mutex_exit(&mutex);
+ mysql_mutex_unlock(&mutex);
}
template <typename Callable> void for_each(Callable &&callback)
{
- mutex_enter(&mutex);
+ mysql_mutex_lock(&mutex);
for (auto &trx : trx_list)
callback(trx);
- mutex_exit(&mutex);
+ mysql_mutex_unlock(&mutex);
}
- void freeze() const { mutex_enter(&mutex); }
- void unfreeze() const { mutex_exit(&mutex); }
+ void freeze() const { mysql_mutex_lock(&mutex); }
+ void unfreeze() const { mysql_mutex_unlock(&mutex); }
private:
- alignas(CACHE_LINE_SIZE) mutable TrxSysMutex mutex;
- alignas(CACHE_LINE_SIZE) ilist<trx_t> trx_list;
+ alignas(CPU_LEVEL1_DCACHE_LINESIZE) mutable mysql_mutex_t mutex;
+ alignas(CPU_LEVEL1_DCACHE_LINESIZE) ilist<trx_t> trx_list;
};
/** The transaction system central memory data structure. */
@@ -856,7 +838,7 @@ class trx_sys_t
The smallest number not yet assigned as a transaction id or transaction
number. Accessed and updated with atomic operations.
*/
- MY_ALIGNED(CACHE_LINE_SIZE) Atomic_counter<trx_id_t> m_max_trx_id;
+ alignas(CPU_LEVEL1_DCACHE_LINESIZE) Atomic_counter<trx_id_t> m_max_trx_id;
/**
@@ -867,39 +849,28 @@ class trx_sys_t
@sa assign_new_trx_no()
@sa snapshot_ids()
*/
- MY_ALIGNED(CACHE_LINE_SIZE) std::atomic<trx_id_t> m_rw_trx_hash_version;
+ alignas(CPU_LEVEL1_DCACHE_LINESIZE)
+ std::atomic<trx_id_t> m_rw_trx_hash_version;
bool m_initialised;
public:
- /**
- TRX_RSEG_HISTORY list length (number of committed transactions to purge)
- */
- MY_ALIGNED(CACHE_LINE_SIZE) Atomic_counter<size_t> rseg_history_len;
-
/** List of all transactions. */
thread_safe_trx_ilist_t trx_list;
- MY_ALIGNED(CACHE_LINE_SIZE)
- /** Temporary rollback segments */
- trx_rseg_t* temp_rsegs[TRX_SYS_N_RSEGS];
+ /** Temporary rollback segments */
+ trx_rseg_t temp_rsegs[TRX_SYS_N_RSEGS];
- MY_ALIGNED(CACHE_LINE_SIZE)
- trx_rseg_t* rseg_array[TRX_SYS_N_RSEGS];
- /*!< Pointer array to rollback
- segments; NULL if slot not in use;
- created and destroyed in
- single-threaded mode; not protected
- by any mutex, because it is read-only
- during multi-threaded operation */
+ /** Persistent rollback segments; space==nullptr if slot not in use */
+ trx_rseg_t rseg_array[TRX_SYS_N_RSEGS];
/**
Lock-free hash of in memory read-write transactions.
Works faster when it is on it's own cache line (tested).
*/
- MY_ALIGNED(CACHE_LINE_SIZE) rw_trx_hash_t rw_trx_hash;
+ alignas(CPU_LEVEL1_DCACHE_LINESIZE) rw_trx_hash_t rw_trx_hash;
#ifdef WITH_WSREP
@@ -925,20 +896,47 @@ public:
/**
- Returns the minimum trx id in rw trx list.
+ @return TRX_RSEG_HISTORY length (number of committed transactions to purge)
+ */
+ size_t history_size();
+
+
+ /**
+ Check whether history_size() exceeds a specified number.
+ @param threshold number of committed transactions
+ @return whether TRX_RSEG_HISTORY length exceeds the threshold
+ */
+ bool history_exceeds(size_t threshold);
- This is the smallest id for which the trx can possibly be active. (But, you
- must look at the trx->state to find out if the minimum trx id transaction
- itself is active, or already committed.)
- @return the minimum trx id, or m_max_trx_id if the trx list is empty
+ /**
+ @return approximate history_size(), without latch protection
*/
+ TPOOL_SUPPRESS_TSAN size_t history_size_approx() const;
- trx_id_t get_min_trx_id()
+
+ /**
+ @return whether history_size() is nonzero (with some race condition)
+ */
+ TPOOL_SUPPRESS_TSAN bool history_exists();
+
+
+ /**
+ Determine if the specified transaction or any older one might be active.
+
+ @param trx current transaction
+ @param id transaction identifier
+ @return whether any transaction not newer than id might be active
+ */
+
+ bool find_same_or_older(trx_t *trx, trx_id_t id)
{
- trx_id_t id= get_max_trx_id();
- rw_trx_hash.iterate(get_min_trx_id_callback, &id);
- return id;
+ if (trx->max_inactive_id >= id)
+ return false;
+ bool found= rw_trx_hash.iterate(trx, find_same_or_older_callback, &id);
+ if (!found)
+ trx->max_inactive_id= id;
+ return found;
}
@@ -1045,7 +1043,7 @@ public:
}
- bool is_initialised() { return m_initialised; }
+ bool is_initialised() const { return m_initialised; }
/** Initialise the transaction subsystem. */
@@ -1059,6 +1057,22 @@ public:
/**
+ Determine the rollback segment identifier.
+
+ @param rseg rollback segment
+ @param persistent whether the rollback segment is persistent
+ @return the rollback segment identifier
+ */
+ unsigned rseg_id(const trx_rseg_t *rseg, bool persistent) const
+ {
+ const trx_rseg_t *array= persistent ? rseg_array : temp_rsegs;
+ ut_ad(rseg >= array);
+ ut_ad(rseg < &array[TRX_SYS_N_RSEGS]);
+ return static_cast<unsigned>(rseg - array);
+ }
+
+
+ /**
Registers read-write transaction.
Transaction becomes visible to MVCC.
@@ -1157,18 +1171,10 @@ public:
}
private:
- static my_bool get_min_trx_id_callback(rw_trx_hash_element_t *element,
- trx_id_t *id)
+ static my_bool find_same_or_older_callback(rw_trx_hash_element_t *element,
+ trx_id_t *id)
{
- if (element->id < *id)
- {
- mutex_enter(&element->mutex);
- /* We don't care about read-only transactions here. */
- if (element->trx && element->trx->rsegs.m_redo.rseg)
- *id= element->id;
- mutex_exit(&element->mutex);
- }
- return 0;
+ return element->id <= *id;
}
@@ -1231,5 +1237,3 @@ private:
/** The transaction system */
extern trx_sys_t trx_sys;
-
-#endif
diff --git a/storage/innobase/include/trx0trx.h b/storage/innobase/include/trx0trx.h
index ce3eca7593f..5b2b2264a46 100644
--- a/storage/innobase/include/trx0trx.h
+++ b/storage/innobase/include/trx0trx.h
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2015, 2021, MariaDB Corporation.
+Copyright (c) 2015, 2022, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -38,7 +38,6 @@ Created 3/26/1996 Heikki Tuuri
#include "ilist.h"
#include <vector>
-#include <set>
// Forward declaration
struct mtr_t;
@@ -96,18 +95,11 @@ trx_start_if_not_started_low(
trx_t* trx, /*!< in/out: transaction */
bool read_write); /*!< in: true if read write transaction */
-/*************************************************************//**
-Starts a transaction for internal processing. */
-void
-trx_start_internal_low(
-/*===================*/
- trx_t* trx); /*!< in/out: transaction */
-
-/** Starts a read-only transaction for internal processing.
-@param[in,out] trx transaction to be started */
-void
-trx_start_internal_read_only_low(
- trx_t* trx);
+/**
+Start a transaction for internal processing.
+@param trx transaction
+@param read_write whether writes may be performed */
+void trx_start_internal_low(trx_t *trx, bool read_write);
#ifdef UNIV_DEBUG
#define trx_start_if_not_started_xa(t, rw) \
@@ -128,48 +120,39 @@ trx_start_internal_read_only_low(
do { \
(t)->start_line = __LINE__; \
(t)->start_file = __FILE__; \
- trx_start_internal_low((t)); \
+ trx_start_internal_low(t, true); \
} while (false)
-
#define trx_start_internal_read_only(t) \
do { \
(t)->start_line = __LINE__; \
(t)->start_file = __FILE__; \
- trx_start_internal_read_only_low(t); \
+ trx_start_internal_low(t, false); \
} while (false)
#else
#define trx_start_if_not_started(t, rw) \
trx_start_if_not_started_low((t), rw)
-#define trx_start_internal(t) \
- trx_start_internal_low((t))
-
-#define trx_start_internal_read_only(t) \
- trx_start_internal_read_only_low(t)
+#define trx_start_internal(t) trx_start_internal_low(t, true)
+#define trx_start_internal_read_only(t) trx_start_internal_low(t, false)
#define trx_start_if_not_started_xa(t, rw) \
trx_start_if_not_started_xa_low((t), (rw))
#endif /* UNIV_DEBUG */
-/*************************************************************//**
-Starts the transaction for a DDL operation. */
-void
-trx_start_for_ddl_low(
-/*==================*/
- trx_t* trx, /*!< in/out: transaction */
- trx_dict_op_t op); /*!< in: dictionary operation type */
+/** Start a transaction for a DDL operation.
+@param trx transaction */
+void trx_start_for_ddl_low(trx_t *trx);
#ifdef UNIV_DEBUG
-#define trx_start_for_ddl(t, o) \
+# define trx_start_for_ddl(t) \
do { \
ut_ad((t)->start_file == 0); \
(t)->start_line = __LINE__; \
(t)->start_file = __FILE__; \
- trx_start_for_ddl_low((t), (o)); \
+ trx_start_for_ddl_low(t); \
} while (0)
#else
-#define trx_start_for_ddl(t, o) \
- trx_start_for_ddl_low((t), (o))
+# define trx_start_for_ddl(t) trx_start_for_ddl_low(t)
#endif /* UNIV_DEBUG */
/**********************************************************************//**
@@ -245,7 +228,7 @@ trx_print_low(
/*!< in: max query length to print,
or 0 to use the default max length */
ulint n_rec_locks,
- /*!< in: lock_number_of_rows_locked(&trx->lock) */
+ /*!< in: trx->lock.n_rec_locks */
ulint n_trx_locks,
/*!< in: length of trx->lock.trx_locks */
ulint heap_size);
@@ -264,7 +247,7 @@ trx_print_latched(
/**********************************************************************//**
Prints info about a transaction.
-Acquires and releases lock_sys.mutex. */
+Acquires and releases lock_sys.latch. */
void
trx_print(
/*======*/
@@ -274,25 +257,6 @@ trx_print(
or 0 to use the default max length */
/**********************************************************************//**
-Determine if a transaction is a dictionary operation.
-@return dictionary operation mode */
-UNIV_INLINE
-enum trx_dict_op_t
-trx_get_dict_operation(
-/*===================*/
- const trx_t* trx) /*!< in: transaction */
- MY_ATTRIBUTE((warn_unused_result));
-/**********************************************************************//**
-Flag a transaction a dictionary operation. */
-UNIV_INLINE
-void
-trx_set_dict_operation(
-/*===================*/
- trx_t* trx, /*!< in/out: transaction */
- enum trx_dict_op_t op); /*!< in: operation, not
- TRX_DICT_OP_NONE */
-
-/**********************************************************************//**
Determines if a transaction is in the given state.
The caller must hold trx->mutex, or it must be the thread
that is serving a running transaction.
@@ -328,43 +292,6 @@ is estimated as the number of altered rows + the number of locked rows.
@return transaction weight */
#define TRX_WEIGHT(t) ((t)->undo_no + UT_LIST_GET_LEN((t)->lock.trx_locks))
-/*******************************************************************//**
-Compares the "weight" (or size) of two transactions. Transactions that
-have edited non-transactional tables are considered heavier than ones
-that have not.
-@return true if weight(a) >= weight(b) */
-bool
-trx_weight_ge(
-/*==========*/
- const trx_t* a, /*!< in: the transaction to be compared */
- const trx_t* b); /*!< in: the transaction to be compared */
-/* Maximum length of a string that can be returned by
-trx_get_que_state_str(). */
-#define TRX_QUE_STATE_STR_MAX_LEN 12 /* "ROLLING BACK" */
-
-/*******************************************************************//**
-Retrieves transaction's que state in a human readable string. The string
-should not be free()'d or modified.
-@return string in the data segment */
-UNIV_INLINE
-const char*
-trx_get_que_state_str(
-/*==================*/
- const trx_t* trx); /*!< in: transaction */
-
-/** Retreieves the transaction ID.
-In a given point in time it is guaranteed that IDs of the running
-transactions are unique. The values returned by this function for readonly
-transactions may be reused, so a subsequent RO transaction may get the same ID
-as a RO transaction that existed in the past. The values returned by this
-function should be used for printing purposes only.
-@param[in] trx transaction whose id to retrieve
-@return transaction id */
-UNIV_INLINE
-trx_id_t
-trx_get_id_for_print(
- const trx_t* trx);
-
/** Create the trx_t pool */
void
trx_pool_init();
@@ -395,95 +322,82 @@ from innodb_lock_wait_timeout via trx_t::mysql_thd.
typedef std::vector<ib_lock_t*, ut_allocator<ib_lock_t*> > lock_list;
-/*******************************************************************//**
-Latching protocol for trx_lock_t::que_state. trx_lock_t::que_state
-captures the state of the query thread during the execution of a query.
-This is different from a transaction state. The query state of a transaction
-can be updated asynchronously by other threads. The other threads can be
-system threads, like the timeout monitor thread or user threads executing
-other queries. Another thing to be mindful of is that there is a delay between
-when a query thread is put into LOCK_WAIT state and before it actually starts
-waiting. Between these two events it is possible that the query thread is
-granted the lock it was waiting for, which implies that the state can be changed
-asynchronously.
-
-All these operations take place within the context of locking. Therefore state
-changes within the locking code must acquire both the lock mutex and the
-trx->mutex when changing trx->lock.que_state to TRX_QUE_LOCK_WAIT or
-trx->lock.wait_lock to non-NULL but when the lock wait ends it is sufficient
-to only acquire the trx->mutex.
-To query the state either of the mutexes is sufficient within the locking
-code and no mutex is required when the query thread is no longer waiting. */
-
/** The locks and state of an active transaction. Protected by
-lock_sys.mutex, trx->mutex or both. */
-struct trx_lock_t {
-#ifdef UNIV_DEBUG
- /** number of active query threads; at most 1, except for the
- dummy transaction in trx_purge() */
- ulint n_active_thrs;
-#endif
- trx_que_t que_state; /*!< valid when trx->state
- == TRX_STATE_ACTIVE: TRX_QUE_RUNNING,
- TRX_QUE_LOCK_WAIT, ... */
-
- lock_t* wait_lock; /*!< if trx execution state is
- TRX_QUE_LOCK_WAIT, this points to
- the lock request, otherwise this is
- NULL; set to non-NULL when holding
- both trx->mutex and lock_sys.mutex;
- set to NULL when holding
- lock_sys.mutex; readers should
- hold lock_sys.mutex, except when
- they are holding trx->mutex and
- wait_lock==NULL */
- ib_uint64_t deadlock_mark; /*!< A mark field that is initialized
- to and checked against lock_mark_counter
- by lock_deadlock_recursive(). */
- bool was_chosen_as_deadlock_victim;
- /*!< when the transaction decides to
- wait for a lock, it sets this to false;
- if another transaction chooses this
- transaction as a victim in deadlock
- resolution, it sets this to true.
- Protected by trx->mutex. */
- time_t wait_started; /*!< lock wait started at this time,
- protected only by lock_sys.mutex */
+lock_sys.latch, trx->mutex or both. */
+struct trx_lock_t
+{
+ /** Lock request being waited for.
+ Set to nonnull when holding lock_sys.latch, lock_sys.wait_mutex and
+ trx->mutex, by the thread that is executing the transaction.
+ Set to nullptr when holding lock_sys.wait_mutex. */
+ Atomic_relaxed<lock_t*> wait_lock;
+ /** Transaction being waited for; protected by lock_sys.wait_mutex */
+ trx_t *wait_trx;
+ /** condition variable for !wait_lock; used with lock_sys.wait_mutex */
+ pthread_cond_t cond;
+ /** lock wait start time */
+ Atomic_relaxed<my_hrtime_t> suspend_time;
+
+#if defined(UNIV_DEBUG) || !defined(DBUG_OFF)
+ /** 2=high priority WSREP thread has marked this trx to abort;
+ 1=another transaction chose this as a victim in deadlock resolution. */
+ Atomic_relaxed<byte> was_chosen_as_deadlock_victim;
+
+ /** Flag the lock owner as a victim in Galera conflict resolution. */
+ void set_wsrep_victim()
+ {
+# if defined __GNUC__ && (defined __i386__ || defined __x86_64__)
+ /* There is no 8-bit version of the 80386 BTS instruction.
+ Technically, this is the wrong addressing mode (16-bit), but
+ there are other data members stored after the byte. */
+ __asm__ __volatile__("lock btsw $1, %0"
+ : "+m" (was_chosen_as_deadlock_victim));
+# else
+ was_chosen_as_deadlock_victim.fetch_or(2);
+# endif
+ }
+#else /* defined(UNIV_DEBUG) || !defined(DBUG_OFF) */
+
+ /** High priority WSREP thread has marked this trx to abort or
+ another transaction chose this as a victim in deadlock resolution. */
+ Atomic_relaxed<bool> was_chosen_as_deadlock_victim;
+
+ /** Flag the lock owner as a victim in Galera conflict resolution. */
+ void set_wsrep_victim() {
+ was_chosen_as_deadlock_victim= true;
+ }
+#endif /* defined(UNIV_DEBUG) || !defined(DBUG_OFF) */
+
+ /** Next available rec_pool[] entry */
+ byte rec_cached;
+ /** Next available table_pool[] entry */
+ byte table_cached;
que_thr_t* wait_thr; /*!< query thread belonging to this
- trx that is in QUE_THR_LOCK_WAIT
+ trx that is in waiting
state. For threads suspended in a
lock wait, this is protected by
- lock_sys.mutex. Otherwise, this may
+ lock_sys.latch. Otherwise, this may
only be modified by the thread that is
serving the running transaction. */
-#ifdef WITH_WSREP
- bool was_chosen_as_wsrep_victim;
- /*!< high priority wsrep thread has
- marked this trx to abort */
-#endif /* WITH_WSREP */
-
- /** Pre-allocated record locks */
- struct {
- ib_lock_t lock; byte pad[256];
- } rec_pool[8];
- /** Pre-allocated table locks */
- ib_lock_t table_pool[8];
+ /** Pre-allocated record locks */
+ struct {
+ alignas(CPU_LEVEL1_DCACHE_LINESIZE) ib_lock_t lock;
+ } rec_pool[8];
- /** Next available rec_pool[] entry */
- unsigned rec_cached;
+ /** Pre-allocated table locks */
+ ib_lock_t table_pool[8];
- /** Next available table_pool[] entry */
- unsigned table_cached;
+ /** Memory heap for trx_locks. Protected by lock_sys.assert_locked()
+ and lock_sys.is_writer() || trx->mutex_is_owner(). */
+ mem_heap_t *lock_heap;
- mem_heap_t* lock_heap; /*!< memory heap for trx_locks;
- protected by lock_sys.mutex */
-
- trx_lock_list_t trx_locks; /*!< locks requested by the transaction;
- insertions are protected by trx->mutex
- and lock_sys.mutex; removals are
- protected by lock_sys.mutex */
+ /** Locks held by the transaction. Protected by lock_sys.assert_locked()
+ and lock_sys.is_writer() || trx->mutex_is_owner().
+ (If lock_sys.latch is only held in shared mode, then the modification
+ must be protected by trx->mutex.) */
+ trx_lock_list_t trx_locks;
lock_list table_locks; /*!< All table locks requested by this
transaction, including AUTOINC locks */
@@ -491,75 +405,94 @@ struct trx_lock_t {
/** List of pending trx_t::evict_table() */
UT_LIST_BASE_NODE_T(dict_table_t) evicted_tables;
- bool cancel; /*!< true if the transaction is being
- rolled back either via deadlock
- detection or due to lock timeout. The
- caller has to acquire the trx_t::mutex
- in order to cancel the locks. In
- lock_trx_table_locks_remove() we
- check for this cancel of a transaction's
- locks and avoid reacquiring the trx
- mutex to prevent recursive deadlocks.
- Protected by both the lock sys mutex
- and the trx_t::mutex. */
- ulint n_rec_locks; /*!< number of rec locks in this trx */
+ /** number of record locks; protected by lock_sys.assert_locked(page_id) */
+ ulint n_rec_locks;
};
/** Logical first modification time of a table in a transaction */
class trx_mod_table_time_t
{
- /** First modification of the table */
- undo_no_t first;
- /** First modification of a system versioned column */
- undo_no_t first_versioned;
-
- /** Magic value signifying that a system versioned column of a
- table was never modified in a transaction. */
- static const undo_no_t UNVERSIONED = IB_ID_MAX;
-
+ /** Impossible value for trx_t::undo_no */
+ static constexpr undo_no_t NONE= ~undo_no_t{0};
+ /** Theoretical maximum value for trx_t::undo_no.
+ DB_ROLL_PTR is only 7 bytes, so it cannot point to more than
+ this many undo log records. */
+ static constexpr undo_no_t LIMIT= (undo_no_t{1} << (7 * 8)) - 1;
+
+ /** Flag in 'first' to indicate that subsequent operations are
+ covered by a TRX_UNDO_EMPTY record (for the first statement to
+ insert into an empty table) */
+ static constexpr undo_no_t BULK= 1ULL << 63;
+
+ /** First modification of the table, possibly ORed with BULK */
+ undo_no_t first;
+ /** First modification of a system versioned column
+ (NONE= no versioning, BULK= the table was dropped) */
+ undo_no_t first_versioned= NONE;
+#ifdef UNIV_DEBUG
+ /** Whether the modified table is a FTS auxiliary table */
+ bool fts_aux_table= false;
+#endif /* UNIV_DEBUG */
public:
- /** Constructor
- @param[in] rows number of modified rows so far */
- trx_mod_table_time_t(undo_no_t rows)
- : first(rows), first_versioned(UNVERSIONED) {}
+ /** Constructor
+ @param rows number of modified rows so far */
+ trx_mod_table_time_t(undo_no_t rows) : first(rows) { ut_ad(rows < LIMIT); }
#ifdef UNIV_DEBUG
- /** Validation
- @param[in] rows number of modified rows so far
- @return whether the object is valid */
- bool valid(undo_no_t rows = UNVERSIONED) const
- {
- return first <= first_versioned && first <= rows;
- }
+ /** Validation
+ @param rows number of modified rows so far
+ @return whether the object is valid */
+ bool valid(undo_no_t rows= NONE) const
+ { auto f= first & LIMIT; return f <= first_versioned && f <= rows; }
#endif /* UNIV_DEBUG */
- /** @return if versioned columns were modified */
- bool is_versioned() const { return first_versioned != UNVERSIONED; }
+ /** @return if versioned columns were modified */
+ bool is_versioned() const { return (~first_versioned & LIMIT) != 0; }
+ /** @return if the table was dropped */
+ bool is_dropped() const { return first_versioned == BULK; }
+
+ /** After writing an undo log record, set is_versioned() if needed
+ @param rows number of modified rows so far */
+ void set_versioned(undo_no_t rows)
+ {
+ ut_ad(first_versioned == NONE);
+ first_versioned= rows;
+ ut_ad(valid(rows));
+ }
- /** After writing an undo log record, set is_versioned() if needed
- @param[in] rows number of modified rows so far */
- void set_versioned(undo_no_t rows)
- {
- ut_ad(!is_versioned());
- first_versioned = rows;
- ut_ad(valid());
- }
+ /** After writing an undo log record, note that the table will be dropped */
+ void set_dropped()
+ {
+ ut_ad(first_versioned == NONE);
+ first_versioned= BULK;
+ }
- /** Invoked after partial rollback
- @param[in] limit number of surviving modified rows
- @return whether this should be erased from trx_t::mod_tables */
- bool rollback(undo_no_t limit)
- {
- ut_ad(valid());
- if (first >= limit) {
- return true;
- }
+ /** Notify the start of a bulk insert operation */
+ void start_bulk_insert() { first|= BULK; }
- if (first_versioned < limit && is_versioned()) {
- first_versioned = UNVERSIONED;
- }
+ /** Notify the end of a bulk insert operation */
+ void end_bulk_insert() { first&= ~BULK; }
- return false;
- }
+ /** @return whether an insert is covered by TRX_UNDO_EMPTY record */
+ bool is_bulk_insert() const { return first & BULK; }
+
+ /** Invoked after partial rollback
+ @param limit number of surviving modified rows (trx_t::undo_no)
+ @return whether this should be erased from trx_t::mod_tables */
+ bool rollback(undo_no_t limit)
+ {
+ ut_ad(valid());
+ if ((LIMIT & first) >= limit)
+ return true;
+ if (first_versioned < limit)
+ first_versioned= NONE;
+ return false;
+ }
+
+#ifdef UNIV_DEBUG
+ void set_aux_table() { fts_aux_table= true; }
+
+ bool is_aux_table() const { return fts_aux_table; }
+#endif /* UNIV_DEBUG */
};
/** Collection of persistent tables and their first modification
@@ -593,7 +526,7 @@ no longer be associated with a session when the server is restarted.
A session may be served by at most one thread at a time. The serving
thread of a session might change in some MySQL implementations.
-Therefore we do not have os_thread_get_curr_id() assertions in the code.
+Therefore we do not have pthread_self() assertions in the code.
Normally, only the thread that is currently associated with a running
transaction may access (read and modify) the trx object, and it may do
@@ -604,7 +537,7 @@ transactions (state == TRX_STATE_ACTIVE && is_recovered)
while the system is already processing new user transactions (!is_recovered).
* trx_print_low() may access transactions not associated with the current
-thread. The caller must be holding lock_sys.mutex.
+thread. The caller must be holding lock_sys.latch.
* When a transaction handle is in the trx_sys.trx_list, some of its fields
must not be modified without holding trx->mutex.
@@ -612,7 +545,7 @@ must not be modified without holding trx->mutex.
* The locking code (in particular, lock_deadlock_recursive() and
lock_rec_convert_impl_to_expl()) will access transactions associated
to other connections. The locks of transactions are protected by
-lock_sys.mutex (insertions also by trx->mutex). */
+lock_sys.latch (insertions also by trx->mutex). */
/** Represents an instance of rollback segment along with its state variables.*/
struct trx_undo_ptr_t {
@@ -643,7 +576,8 @@ struct trx_rsegs_t {
trx_temp_undo_t m_noredo;
};
-struct trx_t : ilist_node<> {
+struct trx_t : ilist_node<>
+{
private:
/**
Least significant 31 bits is count of references.
@@ -658,96 +592,139 @@ private:
we don't want to get blocked on GAP locks taken for protecting
concurrent unique insert or replace operation.
*/
+ alignas(CPU_LEVEL1_DCACHE_LINESIZE)
Atomic_relaxed<uint32_t> skip_lock_inheritance_and_n_ref;
public:
- TrxMutex mutex; /*!< Mutex protecting the fields
- state and lock (except some fields
- of lock, which are protected by
- lock_sys.mutex) */
+ /** Transaction identifier (0 if no locks were acquired).
+ Set by trx_sys_t::register_rw() or trx_resurrect() before
+ the transaction is added to trx_sys.rw_trx_hash.
+ Cleared in commit_in_memory() after commit_state(),
+ trx_sys_t::deregister_rw(), release_locks(). */
+ trx_id_t id;
+ /** The largest encountered transaction identifier for which no
+ transaction was observed to be active. This is a cache to speed up
+ trx_sys_t::find_same_or_older(). */
+ trx_id_t max_inactive_id;
+
+private:
+ /** mutex protecting state and some of lock
+ (some are protected by lock_sys.latch) */
+ srw_spin_mutex mutex;
+#ifdef UNIV_DEBUG
+ /** The owner of mutex (0 if none); protected by mutex */
+ std::atomic<pthread_t> mutex_owner{0};
+#endif /* UNIV_DEBUG */
+public:
+ void mutex_init() { mutex.init(); }
+ void mutex_destroy() { mutex.destroy(); }
+
+ /** Acquire the mutex */
+ void mutex_lock()
+ {
+ ut_ad(!mutex_is_owner());
+ mutex.wr_lock();
+ ut_ad(!mutex_owner.exchange(pthread_self(),
+ std::memory_order_relaxed));
+ }
+ /** Release the mutex */
+ void mutex_unlock()
+ {
+ ut_ad(mutex_owner.exchange(0, std::memory_order_relaxed)
+ == pthread_self());
+ mutex.wr_unlock();
+ }
+#ifndef SUX_LOCK_GENERIC
+ bool mutex_is_locked() const noexcept { return mutex.is_locked(); }
+#endif
+#ifdef UNIV_DEBUG
+ /** @return whether the current thread holds the mutex */
+ bool mutex_is_owner() const
+ {
+ return mutex_owner.load(std::memory_order_relaxed) ==
+ pthread_self();
+ }
+#endif /* UNIV_DEBUG */
+
+ /** State of the trx from the point of view of concurrency control
+ and the valid state transitions.
- trx_id_t id; /*!< transaction id */
+ Possible states:
- /** State of the trx from the point of view of concurrency control
- and the valid state transitions.
+ TRX_STATE_NOT_STARTED
+ TRX_STATE_ACTIVE
+ TRX_STATE_PREPARED
+ TRX_STATE_PREPARED_RECOVERED (special case of TRX_STATE_PREPARED)
+ TRX_STATE_COMMITTED_IN_MEMORY (alias below COMMITTED)
- Possible states:
+ Valid state transitions are:
- TRX_STATE_NOT_STARTED
- TRX_STATE_ACTIVE
- TRX_STATE_PREPARED
- TRX_STATE_PREPARED_RECOVERED (special case of TRX_STATE_PREPARED)
- TRX_STATE_COMMITTED_IN_MEMORY (alias below COMMITTED)
+ Regular transactions:
+ * NOT_STARTED -> ACTIVE -> COMMITTED -> NOT_STARTED
- Valid state transitions are:
+ Auto-commit non-locking read-only:
+ * NOT_STARTED -> ACTIVE -> NOT_STARTED
- Regular transactions:
- * NOT_STARTED -> ACTIVE -> COMMITTED -> NOT_STARTED
+ XA (2PC):
+ * NOT_STARTED -> ACTIVE -> PREPARED -> COMMITTED -> NOT_STARTED
- Auto-commit non-locking read-only:
- * NOT_STARTED -> ACTIVE -> NOT_STARTED
+ Recovered XA:
+ * NOT_STARTED -> PREPARED -> COMMITTED -> (freed)
- XA (2PC):
- * NOT_STARTED -> ACTIVE -> PREPARED -> COMMITTED -> NOT_STARTED
+ Recovered XA followed by XA ROLLBACK:
+ * NOT_STARTED -> PREPARED -> ACTIVE -> COMMITTED -> (freed)
- Recovered XA:
- * NOT_STARTED -> PREPARED -> COMMITTED -> (freed)
+ XA (2PC) (shutdown or disconnect before ROLLBACK or COMMIT):
+ * NOT_STARTED -> PREPARED -> (freed)
- Recovered XA followed by XA ROLLBACK:
- * NOT_STARTED -> PREPARED -> ACTIVE -> COMMITTED -> (freed)
+ Disconnected XA PREPARE transaction can become recovered:
+ * ... -> ACTIVE -> PREPARED (connected) -> PREPARED (disconnected)
- XA (2PC) (shutdown or disconnect before ROLLBACK or COMMIT):
- * NOT_STARTED -> PREPARED -> (freed)
+ Latching and various transaction lists membership rules:
- Disconnected XA can become recovered:
- * ... -> ACTIVE -> PREPARED (connected) -> PREPARED (disconnected)
- Disconnected means from mysql e.g due to the mysql client disconnection.
- Latching and various transaction lists membership rules:
+ XA (2PC) transactions are always treated as non-autocommit.
- XA (2PC) transactions are always treated as non-autocommit.
+ Transitions to ACTIVE or NOT_STARTED occur when transaction
+ is not in rw_trx_hash.
- Transitions to ACTIVE or NOT_STARTED occur when transaction
- is not in rw_trx_hash.
+ Autocommit non-locking read-only transactions move between states
+ without holding any mutex. They are not in rw_trx_hash.
- Autocommit non-locking read-only transactions move between states
- without holding any mutex. They are not in rw_trx_hash.
+ All transactions, unless they are determined to be ac-nl-ro,
+ explicitly tagged as read-only or read-write, will first be put
+ on the read-only transaction list. Only when a !read-only transaction
+ in the read-only list tries to acquire an X or IX lock on a table
+ do we remove it from the read-only list and put it on the read-write
+ list. During this switch we assign it a rollback segment.
- All transactions, unless they are determined to be ac-nl-ro,
- explicitly tagged as read-only or read-write, will first be put
- on the read-only transaction list. Only when a !read-only transaction
- in the read-only list tries to acquire an X or IX lock on a table
- do we remove it from the read-only list and put it on the read-write
- list. During this switch we assign it a rollback segment.
+ When a transaction is NOT_STARTED, it can be in trx_list. It cannot be
+ in rw_trx_hash.
- When a transaction is NOT_STARTED, it can be in trx_list. It cannot be
- in rw_trx_hash.
+ ACTIVE->PREPARED->COMMITTED is only possible when trx is in rw_trx_hash.
+ The transition ACTIVE->PREPARED is protected by trx->mutex.
- ACTIVE->PREPARED->COMMITTED is only possible when trx is in rw_trx_hash.
- The transition ACTIVE->PREPARED is protected by trx->mutex.
+ ACTIVE->COMMITTED is possible when the transaction is in
+ rw_trx_hash.
- ACTIVE->COMMITTED is possible when the transaction is in
- rw_trx_hash.
+ Transitions to COMMITTED are protected by trx_t::mutex. */
+ Atomic_relaxed<trx_state_t> state;
+
+ /** The locks of the transaction. Protected by lock_sys.latch
+ (insertions also by trx_t::mutex). */
+ alignas(CPU_LEVEL1_DCACHE_LINESIZE) trx_lock_t lock;
- Transitions to COMMITTED are protected by trx_t::mutex. */
- trx_state_t state;
#ifdef WITH_WSREP
- /** whether wsrep_on(mysql_thd) held at the start of transaction */
- bool wsrep;
- bool is_wsrep() const { return UNIV_UNLIKELY(wsrep); }
- /** true, if BF thread is performing unique secondary index scanning */
- bool wsrep_UK_scan;
- bool is_wsrep_UK_scan() const { return UNIV_UNLIKELY(wsrep_UK_scan); }
+ /** whether wsrep_on(mysql_thd) held at the start of transaction */
+ byte wsrep;
+ bool is_wsrep() const { return UNIV_UNLIKELY(wsrep); }
+ bool is_wsrep_UK_scan() const { return UNIV_UNLIKELY(wsrep & 2); }
#else /* WITH_WSREP */
- bool is_wsrep() const { return false; }
+ bool is_wsrep() const { return false; }
#endif /* WITH_WSREP */
- ReadView read_view; /*!< consistent read view used in the
- transaction, or NULL if not yet set */
- trx_lock_t lock; /*!< Information about the transaction
- locks and state. Protected by
- lock_sys.mutex (insertions also
- by trx_t::mutex). */
+ /** Consistent read view of the transaction */
+ ReadView read_view;
/* These fields are not protected by any mutex. */
@@ -767,6 +744,8 @@ public:
wants to suppress foreign key checks,
(in table imports, for example) we
set this FALSE */
+ /** whether an insert into an empty table is active */
+ bool bulk_insert;
/*------------------------------*/
/* MySQL has a transaction coordinator to coordinate two phase
commit between multiple storage engines and the binary log. When
@@ -800,13 +779,15 @@ public:
flush the log in
trx_commit_complete_for_mysql() */
ulint duplicates; /*!< TRX_DUP_IGNORE | TRX_DUP_REPLACE */
- trx_dict_op_t dict_operation; /**< @see enum trx_dict_op_t */
-
- ib_uint32_t dict_operation_lock_mode;
- /*!< 0, RW_S_LATCH, or RW_X_LATCH:
- the latch mode trx currently holds
- on dict_sys.latch. Protected
- by dict_sys.latch. */
+ /** whether this modifies InnoDB dictionary tables */
+ bool dict_operation;
+#ifdef UNIV_DEBUG
+ /** copy of dict_operation during commit() */
+ bool was_dict_operation;
+#endif
+ /** whether dict_sys.latch is held exclusively; protected by
+ dict_sys.latch */
+ bool dict_operation_lock_mode;
/** wall-clock time of the latest transition to TRX_STATE_ACTIVE;
used for diagnostic purposes only */
@@ -814,8 +795,6 @@ public:
/** microsecond_interval_timer() of transaction start */
ulonglong start_time_micro;
lsn_t commit_lsn; /*!< lsn at the time of the commit */
- table_id_t table_id; /*!< Table to drop iff dict_operation
- == TRX_DICT_OP_TABLE, or 0. */
/*------------------------------*/
THD* mysql_thd; /*!< MySQL thread handle corresponding
to this trx, or NULL */
@@ -886,7 +865,7 @@ public:
also in the lock list trx_locks. This
vector needs to be freed explicitly
when the trx instance is destroyed.
- Protected by lock_sys.mutex. */
+ Protected by lock_sys.latch. */
/*------------------------------*/
bool read_only; /*!< true if transaction is flagged
as a READ-ONLY transaction.
@@ -899,6 +878,10 @@ public:
bool auto_commit; /*!< true if it is an autocommit */
bool will_lock; /*!< set to inform trx_start_low() that
the transaction may acquire locks */
+	/* true if the transaction has to read the undo log and
+	log the DML changes for an online DDL table */
+ bool apply_online_log = false;
+
/*------------------------------*/
fts_trx_t* fts_trx; /*!< FTS information, or NULL if
transaction hasn't modified tables
@@ -909,20 +892,12 @@ public:
count of tables being flushed. */
/*------------------------------*/
- bool ddl; /*!< true if it is an internal
- transaction for DDL */
- bool internal; /*!< true if it is a system/internal
- transaction background task. This
- includes DDL transactions too. Such
- transactions are always treated as
- read-write. */
- /*------------------------------*/
#ifdef UNIV_DEBUG
unsigned start_line; /*!< Track where it was started from */
const char* start_file; /*!< Filename where it was started */
#endif /* UNIV_DEBUG */
- XID* xid; /*!< X/Open XA transaction
+ XID xid; /*!< X/Open XA transaction
identification to identify a
transaction branch */
trx_mod_tables_t mod_tables; /*!< List of tables that were modified
@@ -964,8 +939,9 @@ public:
inline void release_locks();
/** Evict a table definition due to the rollback of ALTER TABLE.
- @param[in] table_id table identifier */
- void evict_table(table_id_t table_id);
+ @param table_id table identifier
+ @param reset_only whether to only reset dict_table_t::def_trx_id */
+ void evict_table(table_id_t table_id, bool reset_only= false);
/** Initiate rollback.
@param savept savepoint to which to roll back
@@ -979,8 +955,17 @@ public:
@retval false if the rollback was aborted by shutdown */
inline bool rollback_finish();
private:
- /** Mark a transaction committed in the main memory data structures. */
+ /** Apply any changes to tables for which online DDL is in progress. */
+ ATTRIBUTE_COLD void apply_log();
+ /** Process tables that were modified by the committing transaction. */
+ inline void commit_tables();
+ /** Mark a transaction committed in the main memory data structures.
+ @param mtr mini-transaction (if there are any persistent modifications) */
inline void commit_in_memory(const mtr_t *mtr);
+ /** Write log for committing the transaction. */
+ void commit_persist();
+ /** Clean up the transaction after commit_in_memory() */
+ void commit_cleanup();
/** Commit the transaction in a mini-transaction.
@param mtr mini-transaction (if there are any persistent modifications) */
void commit_low(mtr_t *mtr= nullptr);
@@ -988,11 +973,41 @@ public:
/** Commit the transaction. */
void commit();
+
+ /** Try to drop a persistent table.
+ @param table persistent table
+ @param fk whether to drop FOREIGN KEY metadata
+ @return error code */
+ dberr_t drop_table(const dict_table_t &table);
+ /** Try to drop the foreign key constraints for a persistent table.
+ @param name name of persistent table
+ @return error code */
+ dberr_t drop_table_foreign(const table_name_t &name);
+ /** Try to drop the statistics for a persistent table.
+ @param name name of persistent table
+ @return error code */
+ dberr_t drop_table_statistics(const table_name_t &name);
+ /** Commit the transaction, possibly after drop_table().
+ @param deleted handles of data files that were deleted */
+ void commit(std::vector<pfs_os_file_t> &deleted);
+
+
+ /** Discard all savepoints */
+ void savepoints_discard()
+ { savepoints_discard(UT_LIST_GET_FIRST(trx_savepoints)); }
+
+
+ /** Discard all savepoints starting from a particular savepoint.
+ @param savept first savepoint to discard */
+ void savepoints_discard(trx_named_savept_t *savept);
+
+
bool is_referenced() const
{
return (skip_lock_inheritance_and_n_ref & ~(1U << 31)) > 0;
}
+
void reference()
{
ut_d(auto old_n_ref =)
@@ -1032,7 +1047,7 @@ public:
}
/** @return whether the table has lock on
- mysql.innodb_table_stats and mysql.innodb_index_stats */
+ mysql.innodb_table_stats or mysql.innodb_index_stats */
bool has_stats_table_lock() const;
/** Free the memory to trx_pools */
@@ -1043,25 +1058,64 @@ public:
{
ut_ad(state == TRX_STATE_NOT_STARTED);
ut_ad(!id);
+ ut_ad(!mutex_is_owner());
ut_ad(!has_logged());
ut_ad(!is_referenced());
ut_ad(!is_wsrep());
-#ifdef WITH_WSREP
- ut_ad(!lock.was_chosen_as_wsrep_victim);
-#endif
+ ut_ad(!lock.was_chosen_as_deadlock_victim);
+ ut_ad(mod_tables.empty());
ut_ad(!read_view.is_open());
ut_ad(!lock.wait_thr);
+ ut_ad(!lock.wait_lock);
ut_ad(UT_LIST_GET_LEN(lock.trx_locks) == 0);
ut_ad(lock.table_locks.empty());
ut_ad(!autoinc_locks || ib_vector_is_empty(autoinc_locks));
ut_ad(UT_LIST_GET_LEN(lock.evicted_tables) == 0);
- ut_ad(dict_operation == TRX_DICT_OP_NONE);
+ ut_ad(!dict_operation);
+ ut_ad(!apply_online_log);
ut_ad(!is_not_inheriting_locks());
+ ut_ad(check_foreigns);
+ ut_ad(check_unique_secondary);
+ }
+
+ /** This has to be invoked on SAVEPOINT or at the end of a statement.
+ Even if a TRX_UNDO_EMPTY record was written for this table to cover an
+ insert into an empty table, subsequent operations will have to be covered
+ by row-level undo log records, so that ROLLBACK TO SAVEPOINT or a
+ rollback to the start of a statement will work.
+ @param table table on which any preceding bulk insert ended */
+ void end_bulk_insert(const dict_table_t &table)
+ {
+ auto it= mod_tables.find(const_cast<dict_table_t*>(&table));
+ if (it != mod_tables.end())
+ it->second.end_bulk_insert();
}
/** @return whether this is a non-locking autocommit transaction */
bool is_autocommit_non_locking() const { return auto_commit && !will_lock; }
+ /** This has to be invoked on SAVEPOINT or at the start of a statement.
+ Even if TRX_UNDO_EMPTY records were written for any table to cover an
+ insert into an empty table, subsequent operations will have to be covered
+ by row-level undo log records, so that ROLLBACK TO SAVEPOINT or a
+ rollback to the start of a statement will work. */
+ void end_bulk_insert()
+ {
+ for (auto& t : mod_tables)
+ t.second.end_bulk_insert();
+ }
+
+ /** @return whether a bulk insert into empty table is in progress */
+ bool is_bulk_insert() const
+ {
+ if (!bulk_insert || check_unique_secondary || check_foreigns)
+ return false;
+ for (const auto& t : mod_tables)
+ if (t.second.is_bulk_insert())
+ return true;
+ return false;
+ }
+
private:
/** Assign a rollback segment for modifying temporary tables.
@return the assigned rollback segment */
@@ -1134,19 +1188,6 @@ struct commit_node_t{
};
-/** Test if trx->mutex is owned. */
-#define trx_mutex_own(t) mutex_own(&t->mutex)
-
-/** Acquire the trx->mutex. */
-#define trx_mutex_enter(t) do { \
- mutex_enter(&t->mutex); \
-} while (0)
-
-/** Release the trx->mutex. */
-#define trx_mutex_exit(t) do { \
- mutex_exit(&t->mutex); \
-} while (0)
-
#include "trx0trx.inl"
#endif
diff --git a/storage/innobase/include/trx0trx.inl b/storage/innobase/include/trx0trx.inl
index 93c9591e0c2..b063c920e2f 100644
--- a/storage/innobase/include/trx0trx.inl
+++ b/storage/innobase/include/trx0trx.inl
@@ -84,123 +84,3 @@ trx_get_error_info(
{
return(trx->error_info);
}
-
-/*******************************************************************//**
-Retrieves transaction's que state in a human readable string. The string
-should not be free()'d or modified.
-@return string in the data segment */
-UNIV_INLINE
-const char*
-trx_get_que_state_str(
-/*==================*/
- const trx_t* trx) /*!< in: transaction */
-{
- /* be sure to adjust TRX_QUE_STATE_STR_MAX_LEN if you change this */
- switch (trx->lock.que_state) {
- case TRX_QUE_RUNNING:
- return("RUNNING");
- case TRX_QUE_LOCK_WAIT:
- return("LOCK WAIT");
- case TRX_QUE_ROLLING_BACK:
- return("ROLLING BACK");
- case TRX_QUE_COMMITTING:
- return("COMMITTING");
- default:
- return("UNKNOWN");
- }
-}
-
-/** Retreieves the transaction ID.
-In a given point in time it is guaranteed that IDs of the running
-transactions are unique. The values returned by this function for readonly
-transactions may be reused, so a subsequent RO transaction may get the same ID
-as a RO transaction that existed in the past. The values returned by this
-function should be used for printing purposes only.
-@param[in] trx transaction whose id to retrieve
-@return transaction id */
-UNIV_INLINE
-trx_id_t
-trx_get_id_for_print(
- const trx_t* trx)
-{
- /* Readonly and transactions whose intentions are unknown (whether
- they will eventually do a WRITE) don't have trx_t::id assigned (it is
- 0 for those transactions). Transaction IDs in
- innodb_trx.trx_id,
- innodb_locks.lock_id,
- innodb_locks.lock_trx_id,
- innodb_lock_waits.requesting_trx_id,
- innodb_lock_waits.blocking_trx_id should match because those tables
- could be used in an SQL JOIN on those columns. Also trx_t::id is
- printed by SHOW ENGINE INNODB STATUS, and in logs, so we must have the
- same value printed everywhere consistently. */
-
- /* DATA_TRX_ID_LEN is the storage size in bytes. */
- static const trx_id_t max_trx_id
- = (1ULL << (DATA_TRX_ID_LEN * CHAR_BIT)) - 1;
-
- ut_ad(trx->id <= max_trx_id);
-
- return(trx->id != 0
- ? trx->id
- : reinterpret_cast<trx_id_t>(trx) | (max_trx_id + 1));
-}
-
-/**********************************************************************//**
-Determine if a transaction is a dictionary operation.
-@return dictionary operation mode */
-UNIV_INLINE
-enum trx_dict_op_t
-trx_get_dict_operation(
-/*===================*/
- const trx_t* trx) /*!< in: transaction */
-{
- trx_dict_op_t op = static_cast<trx_dict_op_t>(trx->dict_operation);
-
-#ifdef UNIV_DEBUG
- switch (op) {
- case TRX_DICT_OP_NONE:
- case TRX_DICT_OP_TABLE:
- case TRX_DICT_OP_INDEX:
- return(op);
- }
- ut_error;
-#endif /* UNIV_DEBUG */
- return(op);
-}
-/**********************************************************************//**
-Flag a transaction a dictionary operation. */
-UNIV_INLINE
-void
-trx_set_dict_operation(
-/*===================*/
- trx_t* trx, /*!< in/out: transaction */
- enum trx_dict_op_t op) /*!< in: operation, not
- TRX_DICT_OP_NONE */
-{
-#ifdef UNIV_DEBUG
- enum trx_dict_op_t old_op = trx_get_dict_operation(trx);
-
- switch (op) {
- case TRX_DICT_OP_NONE:
- ut_error;
- break;
- case TRX_DICT_OP_TABLE:
- switch (old_op) {
- case TRX_DICT_OP_NONE:
- case TRX_DICT_OP_INDEX:
- case TRX_DICT_OP_TABLE:
- goto ok;
- }
- ut_error;
- break;
- case TRX_DICT_OP_INDEX:
- ut_ad(old_op == TRX_DICT_OP_NONE);
- break;
- }
-ok:
-#endif /* UNIV_DEBUG */
-
- trx->ddl = true;
- trx->dict_operation = op;
-}
diff --git a/storage/innobase/include/trx0types.h b/storage/innobase/include/trx0types.h
index 99a9c66c839..07c1c6a756b 100644
--- a/storage/innobase/include/trx0types.h
+++ b/storage/innobase/include/trx0types.h
@@ -24,11 +24,9 @@ Transaction system global type definitions
Created 3/26/1996 Heikki Tuuri
*******************************************************/
-#ifndef trx0types_h
-#define trx0types_h
-
-#include "ut0byte.h"
-#include "ut0mutex.h"
+#pragma once
+#include "univ.i"
+#include "ut0new.h"
#include <vector>
@@ -50,15 +48,6 @@ static const ulint TRX_MAGIC_N = 91118598;
constexpr uint innodb_purge_threads_MAX= 32;
-/** Transaction execution states when trx->state == TRX_STATE_ACTIVE */
-enum trx_que_t {
- TRX_QUE_RUNNING, /*!< transaction is running */
- TRX_QUE_LOCK_WAIT, /*!< transaction is waiting for
- a lock */
- TRX_QUE_ROLLING_BACK, /*!< transaction is rolling back */
- TRX_QUE_COMMITTING /*!< transaction is committing */
-};
-
/** Transaction states (trx_t::state) */
enum trx_state_t {
TRX_STATE_NOT_STARTED,
@@ -72,21 +61,6 @@ enum trx_state_t {
TRX_STATE_COMMITTED_IN_MEMORY
};
-/** Type of data dictionary operation */
-enum trx_dict_op_t {
- /** The transaction is not modifying the data dictionary. */
- TRX_DICT_OP_NONE = 0,
- /** The transaction is creating a table or an index, or
- dropping a table. The table must be dropped in crash
- recovery. This and TRX_DICT_OP_NONE are the only possible
- operation modes in crash recovery. */
- TRX_DICT_OP_TABLE = 1,
- /** The transaction is creating or dropping an index in an
- existing table. In crash recovery, the data dictionary
- must be locked, but the table must not be dropped. */
- TRX_DICT_OP_INDEX = 2
-};
-
/** Memory objects */
/* @{ */
/** Transaction */
@@ -133,10 +107,10 @@ typedef byte trx_undo_rec_t;
/* @} */
-typedef ib_mutex_t RsegMutex;
-typedef ib_mutex_t TrxMutex;
-typedef ib_mutex_t PQMutex;
-typedef ib_mutex_t TrxSysMutex;
-
typedef std::vector<trx_id_t, ut_allocator<trx_id_t> > trx_ids_t;
-#endif /* trx0types_h */
+
+/** The number of rollback segments; rollback segment id must fit in
+the 7 bits reserved for it in DB_ROLL_PTR. */
+static constexpr unsigned TRX_SYS_N_RSEGS= 128;
+/** Maximum number of undo tablespaces (not counting the system tablespace) */
+static constexpr unsigned TRX_SYS_MAX_UNDO_SPACES= TRX_SYS_N_RSEGS - 1;
diff --git a/storage/innobase/include/trx0undo.h b/storage/innobase/include/trx0undo.h
index a4578d61fe2..3474a903f6c 100644
--- a/storage/innobase/include/trx0undo.h
+++ b/storage/innobase/include/trx0undo.h
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2017, 2021, MariaDB Corporation.
+Copyright (c) 2017, 2022, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -96,22 +96,6 @@ inline roll_ptr_t trx_read_roll_ptr(const byte* ptr)
return mach_read_from_7(ptr);
}
-/** Gets an undo log page and x-latches it.
-@param[in] page_id page id
-@param[in,out] mtr mini-transaction
-@return pointer to page x-latched */
-UNIV_INLINE
-buf_block_t*
-trx_undo_page_get(const page_id_t page_id, mtr_t* mtr);
-
-/** Gets an undo log page and s-latches it.
-@param[in] page_id page id
-@param[in,out] mtr mini-transaction
-@return pointer to page s-latched */
-UNIV_INLINE
-buf_block_t*
-trx_undo_page_get_s_latched(const page_id_t page_id, mtr_t* mtr);
-
/** Get the next record in an undo log.
@param[in] undo_page undo log page
@param[in] rec undo record offset in the page
@@ -140,8 +124,8 @@ trx_undo_get_prev_rec(buf_block_t *&block, uint16_t rec, uint32_t page_no,
@param[in,out] mtr mini-transaction
@return undo log record, the page latched, NULL if none */
trx_undo_rec_t*
-trx_undo_get_next_rec(buf_block_t *&block, uint16_t rec, uint32_t page_no,
- uint16_t offset, mtr_t *mtr);
+trx_undo_get_next_rec(const buf_block_t *&block, uint16_t rec,
+ uint32_t page_no, uint16_t offset, mtr_t *mtr);
/** Get the first record in an undo log.
@param[in] space undo log header space
@@ -150,11 +134,13 @@ trx_undo_get_next_rec(buf_block_t *&block, uint16_t rec, uint32_t page_no,
@param[in] mode latching mode: RW_S_LATCH or RW_X_LATCH
@param[out] block undo log page
@param[in,out] mtr mini-transaction
-@return undo log record, the page latched, NULL if none */
+@param[out] err error code
+@return undo log record, the page latched
+@retval nullptr if none */
trx_undo_rec_t*
trx_undo_get_first_rec(const fil_space_t &space, uint32_t page_no,
- uint16_t offset, ulint mode, buf_block_t*& block,
- mtr_t *mtr);
+ uint16_t offset, ulint mode, const buf_block_t*& block,
+ mtr_t *mtr, dberr_t *err);
/** Initialize an undo log page.
NOTE: This corresponds to a redo log record and must not be changed!
@@ -165,24 +151,24 @@ void trx_undo_page_init(const buf_block_t &block);
/** Allocate an undo log page.
@param[in,out] undo undo log
@param[in,out] mtr mini-transaction that does not hold any page latch
+@param[out] err error code
@return X-latched block if success
-@retval NULL on failure */
-buf_block_t* trx_undo_add_page(trx_undo_t* undo, mtr_t* mtr)
- MY_ATTRIBUTE((nonnull, warn_unused_result));
+@retval nullptr on failure */
+buf_block_t *trx_undo_add_page(trx_undo_t *undo, mtr_t *mtr, dberr_t *err)
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
/** Free the last undo log page. The caller must hold the rseg mutex.
@param[in,out] undo undo log
@param[in,out] mtr mini-transaction that does not hold any undo log page
- or that has allocated the undo log page */
-void
-trx_undo_free_last_page(trx_undo_t* undo, mtr_t* mtr)
- MY_ATTRIBUTE((nonnull));
+ or that has allocated the undo log page
+@return error code */
+dberr_t trx_undo_free_last_page(trx_undo_t *undo, mtr_t *mtr)
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
-/** Truncate the tail of an undo log during rollback.
-@param[in,out] undo undo log
-@param[in] limit all undo logs after this limit will be discarded
-@param[in] is_temp whether this is temporary undo log */
-void trx_undo_truncate_end(trx_undo_t& undo, undo_no_t limit, bool is_temp);
+/** Try to truncate the undo logs.
+@param trx transaction
+@return error code */
+dberr_t trx_undo_try_truncate(const trx_t &trx);
/** Truncate the head of an undo log.
NOTE that only whole pages are freed; the header page is not
@@ -191,13 +177,15 @@ freed, but emptied, if all the records there are below the limit.
@param[in] hdr_page_no header page number
@param[in] hdr_offset header offset on the page
@param[in] limit first undo number to preserve
-(everything below the limit will be truncated) */
-void
+(everything below the limit will be truncated)
+@return error code */
+dberr_t
trx_undo_truncate_start(
trx_rseg_t* rseg,
uint32_t hdr_page_no,
uint16_t hdr_offset,
- undo_no_t limit);
+ undo_no_t limit)
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
/** Mark that an undo log header belongs to a data dictionary transaction.
@param[in] trx dictionary transaction
@param[in,out] undo undo log
@@ -292,9 +280,7 @@ struct trx_undo_t {
log */
XID xid; /*!< X/Open XA transaction
identification */
- ibool dict_operation; /*!< TRUE if a dict operation trx */
- table_id_t table_id; /*!< if a dict operation, then the table
- id */
+ bool dict_operation; /*!< TRUE if a dict operation trx */
trx_rseg_t* rseg; /*!< rseg where the undo log belongs */
/*-----------------------------*/
uint32_t hdr_page_no; /*!< page number of the header page in
@@ -326,6 +312,106 @@ struct trx_undo_t {
/*!< undo log objects in the rollback
segment are chained into lists */
};
+
+/** Cache a pointer to an undo record in a latched buffer pool page,
+parse the undo log record and store the record type, update vector
+and compiler information */
+class UndorecApplier
+{
+ /** Undo log block page id */
+ page_id_t page_id;
+ /** Undo log record pointer */
+ const trx_undo_rec_t *undo_rec;
+ /** Offset of the undo log record within the block */
+ uint16_t offset;
+ /** Transaction id of the undo log */
+ const trx_id_t trx_id;
+ /** Undo log record type */
+ ulint type;
+ /** compiler information */
+ ulint cmpl_info;
+ /** Update vector */
+ upd_t *update;
+ /** memory heap which can be used to build previous version of
+ the index record and its offsets */
+ mem_heap_t *heap;
+ /** mini-transaction for accessing B-tree pages */
+ mtr_t mtr;
+
+public:
+ UndorecApplier(page_id_t page_id, trx_id_t trx_id) :
+ page_id(page_id), trx_id(trx_id), heap(mem_heap_create(100))
+ {
+ }
+
+ /** Assign the next page id */
+ void assign_next(const page_id_t next_page_id)
+ {
+ page_id= next_page_id;
+ }
+
+ /** Assign the undo log record and offset */
+ inline void assign_rec(const buf_block_t &block, uint16_t offset);
+
+ uint16_t get_offset() const { return offset; }
+
+ page_id_t get_page_id() const { return page_id; }
+
+ /** Handle the DML undo log and apply it on online indexes */
+ inline void apply_undo_rec();
+
+ ~UndorecApplier()
+ {
+ mem_heap_free(heap);
+ }
+
+private:
+ /** Handle the insert undo log and apply it on online indexes
+ @param tuple row reference from undo log record
+ @param clust_index clustered index */
+ void log_insert(const dtuple_t &tuple, dict_index_t *clust_index);
+
+ /** Handle the update, delete undo log and apply it on online
+ indexes.
+ @param tuple row reference from undo log record
+ @param clust_index clustered index */
+ void log_update(const dtuple_t &tuple, dict_index_t *clust_index);
+
+ /** Check whether the given roll pointer is generated by
+ the current undo log record information stored.
+ @return true if roll pointer matches with current undo log info */
+ bool is_same(roll_ptr_t roll_ptr) const
+ {
+ uint16_t offset= static_cast<uint16_t>(roll_ptr);
+ uint32_t page_no= static_cast<uint32_t>(roll_ptr >> 16);
+ return page_no == page_id.page_no() && offset == this->offset;
+ }
+
+ /** Clear the undo log record information */
+ void clear_undo_rec()
+ {
+ undo_rec= nullptr;
+ cmpl_info= 0;
+ type= 0;
+ update= nullptr;
+ mem_heap_empty(heap);
+ }
+
+ /** Get the correct version of the clustered index record that
+ was modified by the current undo log record. Because there could
+ be the multiple successive updates of the same record within the
+ same transaction.
+ @param tuple tuple contains primary key value
+ @param index clustered index
+ @param[out] clust_rec current clustered index record
+ @param offsets offsets points to the record
+ @return clustered index record which was changed by
+ the undo log record or nullptr when there is no clustered
+ index record changed by undo log record */
+ const rec_t* get_old_rec(const dtuple_t &tuple, dict_index_t *index,
+ const rec_t **clust_rec, rec_offs **offsets);
+};
+
#endif /* !UNIV_INNOCHECKSUM */
/** The offset of the undo log page header on pages of the undo log */
diff --git a/storage/innobase/include/trx0undo.inl b/storage/innobase/include/trx0undo.inl
index 43af932708e..9f05989f634 100644
--- a/storage/innobase/include/trx0undo.inl
+++ b/storage/innobase/include/trx0undo.inl
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 1996, 2013, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2017, 2019, MariaDB Corporation.
+Copyright (c) 2017, 2022, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -95,35 +95,6 @@ trx_undo_trx_id_is_insert(
return bool(trx_id[DATA_TRX_ID_LEN] >> 7);
}
-/** Gets an undo log page and x-latches it.
-@param[in] page_id page id
-@param[in,out] mtr mini-transaction
-@return pointer to page x-latched */
-UNIV_INLINE
-buf_block_t*
-trx_undo_page_get(const page_id_t page_id, mtr_t* mtr)
-{
- buf_block_t* block = buf_page_get(page_id, 0, RW_X_LATCH, mtr);
-
- buf_block_dbg_add_level(block, SYNC_TRX_UNDO_PAGE);
- return block;
-}
-
-/** Gets an undo log page and s-latches it.
-@param[in] page_id page id
-@param[in,out] mtr mini-transaction
-@return pointer to page s-latched */
-UNIV_INLINE
-buf_block_t*
-trx_undo_page_get_s_latched(const page_id_t page_id, mtr_t* mtr)
-{
- buf_block_t* block = buf_page_get(page_id, 0, RW_S_LATCH, mtr);
-
- buf_block_dbg_add_level(block, SYNC_TRX_UNDO_PAGE);
-
- return block;
-}
-
/** Determine the end offset of undo log records of an undo log page.
@param[in] undo_page undo log page
@param[in] page_no undo log header page number
@@ -135,11 +106,11 @@ uint16_t trx_undo_page_get_end(const buf_block_t *undo_page, uint32_t page_no,
{
if (page_no == undo_page->page.id().page_no())
if (uint16_t end = mach_read_from_2(TRX_UNDO_NEXT_LOG + offset +
- undo_page->frame))
+ undo_page->page.frame))
return end;
return mach_read_from_2(TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_FREE +
- undo_page->frame);
+ undo_page->page.frame);
}
/** Get the next record in an undo log.
@@ -153,6 +124,6 @@ trx_undo_page_get_next_rec(const buf_block_t *undo_page, uint16_t rec,
uint32_t page_no, uint16_t offset)
{
uint16_t end= trx_undo_page_get_end(undo_page, page_no, offset);
- uint16_t next= mach_read_from_2(undo_page->frame + rec);
- return next == end ? nullptr : undo_page->frame + next;
+ uint16_t next= mach_read_from_2(undo_page->page.frame + rec);
+ return next == end ? nullptr : undo_page->page.frame + next;
}
diff --git a/storage/innobase/include/univ.i b/storage/innobase/include/univ.i
index 7c1af230eaf..23eee89f857 100644
--- a/storage/innobase/include/univ.i
+++ b/storage/innobase/include/univ.i
@@ -31,8 +31,8 @@ Version control for database, common definitions, and include files
Created 1/20/1994 Heikki Tuuri
****************************************************************************/
-#ifndef univ_i
-#define univ_i
+#pragma once
+#define my_test_if_thinly_provisioned(f) 0
/* aux macros to convert M into "123" (string) if M is defined like
#define M 123 */
@@ -57,16 +57,6 @@ component, i.e. we show M.N.P as M.N */
(time in seconds) */
#define INNODB_EXTEND_TIMEOUT_INTERVAL 30
-#ifdef MYSQL_DYNAMIC_PLUGIN
-/* In the dynamic plugin, redefine some externally visible symbols
-in order not to conflict with the symbols of a builtin InnoDB. */
-
-/* Rename all C++ classes that contain virtual functions, because we
-have not figured out how to apply the visibility=hidden attribute to
-the virtual method table (vtable) in GCC 3. */
-# define ha_innobase ha_innodb
-#endif /* MYSQL_DYNAMIC_PLUGIN */
-
#if defined(_WIN32)
# include <windows.h>
#endif /* _WIN32 */
@@ -78,16 +68,9 @@ support cross-platform development and expose comonly used SQL names. */
#include <my_global.h>
#include "my_counter.h"
+#include "aligned.h"
#include <m_string.h>
-
-/* JAN: TODO: missing 5.7 header */
-#ifdef HAVE_MY_THREAD_H
-//# include <my_thread.h>
-#endif
-
-#ifndef UNIV_INNOCHECKSUM
-# include <mysqld_error.h>
-#endif /* !UNIV_INNOCHECKSUM */
+#include <mysqld_error.h>
/* Include <sys/stat.h> to get S_I... macros defined for os0file.cc */
#include <sys/stat.h>
@@ -119,15 +102,6 @@ HAVE_PSI_INTERFACE is defined. */
# define UNIV_PFS_MEMORY
# endif /* HAVE_PSI_MEMORY_INTERFACE */
-/* There are mutexes/rwlocks that we want to exclude from
-instrumentation even if their corresponding performance schema
-define is set. And this PFS_NOT_INSTRUMENTED is used
-as the key value to identify those objects that would
-be excluded from instrumentation. */
-# define PFS_NOT_INSTRUMENTED ULINT32_UNDEFINED
-
-# define PFS_IS_INSTRUMENTED(key) ((key) != PFS_NOT_INSTRUMENTED)
-
#ifdef HAVE_PFS_THREAD_PROVIDER_H
/* For PSI_MUTEX_CALL() and similar. */
#include "pfs_thread_provider.h"
@@ -194,8 +168,6 @@ using the call command. */
related stuff. */
#define UNIV_SEARCH_PERF_STAT /* statistics for the
adaptive hash index */
-#define UNIV_SRV_PRINT_LATCH_WAITS /* enable diagnostic output
- in sync0sync.cc */
#define UNIV_BTR_PRINT /* enable functions for
printing B-trees */
#define UNIV_ZIP_DEBUG /* extensive consistency checks
@@ -212,27 +184,8 @@ using the call command. */
info output */
#endif
-#define UNIV_BTR_DEBUG /* check B-tree links */
-#define UNIV_LIGHT_MEM_DEBUG /* light memory debugging */
-
// #define UNIV_SQL_DEBUG
-/* Linkage specifier for non-static InnoDB symbols (variables and functions)
-that are only referenced from within InnoDB, not from MySQL. We disable the
-GCC visibility directive on all Sun operating systems because there is no
-easy way to get it to work. See http://bugs.mysql.com/bug.php?id=52263. */
-#if defined(__GNUC__) && (__GNUC__ >= 4) && !defined(sun) || defined(__INTEL_COMPILER)
-# define UNIV_INTERN __attribute__((visibility ("hidden")))
-#else
-# define UNIV_INTERN
-#endif
-
-#if defined(__GNUC__) && (__GNUC__ >= 11)
-# define ATTRIBUTE_ACCESS(X) __attribute__((access X))
-#else
-# define ATTRIBUTE_ACCESS(X)
-#endif
-
#ifndef MY_ATTRIBUTE
#if defined(__GNUC__)
# define MY_ATTRIBUTE(A) __attribute__(A)
@@ -421,12 +374,6 @@ in both 32-bit and 64-bit environments. */
# define UINT64PFx "%016" PRIx64
#endif
-#ifdef UNIV_INNOCHECKSUM
-extern bool strict_verify;
-extern FILE* log_file;
-extern uint32_t cur_page_num;
-#endif /* UNIV_INNOCHECKSUM */
-
typedef int64_t ib_int64_t;
typedef uint64_t ib_uint64_t;
typedef uint32_t ib_uint32_t;
@@ -522,14 +469,21 @@ it is read or written. */
# define UNIV_PREFETCH_R(addr) ((void) 0)
# define UNIV_PREFETCH_RW(addr) sun_prefetch_write_many(addr)
-# elif defined __WIN__
-# include <xmmintrin.h>
+# elif defined _MSC_VER
# define UNIV_EXPECT(expr,value) (expr)
# define UNIV_LIKELY_NULL(expr) (expr)
-// __MM_HINT_T0 - (temporal data)
-// prefetch data into all levels of the cache hierarchy.
-# define UNIV_PREFETCH_R(addr) _mm_prefetch((char *) addr, _MM_HINT_T0)
-# define UNIV_PREFETCH_RW(addr) _mm_prefetch((char *) addr, _MM_HINT_T0)
+# if defined _M_IX86 || defined _M_X64
+ // __MM_HINT_T0 - (temporal data)
+ // prefetch data into all levels of the cache hierarchy.
+# define UNIV_PREFETCH_R(addr) _mm_prefetch((char *) addr, _MM_HINT_T0)
+# define UNIV_PREFETCH_RW(addr) _mm_prefetch((char *) addr, _MM_HINT_T0)
+# elif defined _M_ARM64
+# define UNIV_PREFETCH_R(addr) __prefetch(addr)
+# define UNIV_PREFETCH_RW(addr) __prefetch(addr)
+# else
+# define UNIV_PREFETCH_R ((void) 0)
+# define UNIV_PREFETCH_RW(addr) ((void) 0)
+# endif
#else
/* Dummy versions of the macros */
# define UNIV_EXPECT(expr,value) (expr)
@@ -546,28 +500,11 @@ it is read or written. */
/* Compile-time constant of the given array's size. */
#define UT_ARR_SIZE(a) (sizeof(a) / sizeof((a)[0]))
-/* The return type from a thread's start function differs between Unix and
-Windows, so define a typedef for it and a macro to use at the end of such
-functions. */
-
-#ifdef _WIN32
-typedef DWORD os_thread_ret_t;
-# define OS_THREAD_DUMMY_RETURN return(0)
-# define OS_PATH_SEPARATOR '\\'
-# define OS_PATH_SEPARATOR_ALT '/'
-#else
-typedef void* os_thread_ret_t;
-# define OS_THREAD_DUMMY_RETURN return(NULL)
-# define OS_PATH_SEPARATOR '/'
-# define OS_PATH_SEPARATOR_ALT '\\'
-#endif
-
#include <stdio.h>
#include "db0err.h"
#include "ut0dbg.h"
#include "ut0lst.h"
#include "ut0ut.h"
-#include "sync0types.h"
extern ulong srv_page_size_shift;
extern ulong srv_page_size;
@@ -576,4 +513,49 @@ extern ulong srv_page_size;
myisam/sp_defs.h. We only support 2 dimension data */
#define SPDIMS 2
-#endif
+#ifdef HAVE_PSI_INTERFACE
+typedef unsigned int mysql_pfs_key_t;
+
+# ifdef UNIV_PFS_MUTEX
+extern mysql_pfs_key_t buf_pool_mutex_key;
+extern mysql_pfs_key_t dict_foreign_err_mutex_key;
+extern mysql_pfs_key_t fil_system_mutex_key;
+extern mysql_pfs_key_t flush_list_mutex_key;
+extern mysql_pfs_key_t fts_cache_mutex_key;
+extern mysql_pfs_key_t fts_cache_init_mutex_key;
+extern mysql_pfs_key_t fts_delete_mutex_key;
+extern mysql_pfs_key_t fts_doc_id_mutex_key;
+extern mysql_pfs_key_t ibuf_bitmap_mutex_key;
+extern mysql_pfs_key_t ibuf_mutex_key;
+extern mysql_pfs_key_t ibuf_pessimistic_insert_mutex_key;
+extern mysql_pfs_key_t log_sys_mutex_key;
+extern mysql_pfs_key_t log_flush_order_mutex_key;
+extern mysql_pfs_key_t recalc_pool_mutex_key;
+extern mysql_pfs_key_t purge_sys_pq_mutex_key;
+extern mysql_pfs_key_t recv_sys_mutex_key;
+extern mysql_pfs_key_t rtr_active_mutex_key;
+extern mysql_pfs_key_t rtr_match_mutex_key;
+extern mysql_pfs_key_t rtr_path_mutex_key;
+extern mysql_pfs_key_t page_zip_stat_per_index_mutex_key;
+extern mysql_pfs_key_t srv_innodb_monitor_mutex_key;
+extern mysql_pfs_key_t srv_misc_tmpfile_mutex_key;
+extern mysql_pfs_key_t srv_monitor_file_mutex_key;
+extern mysql_pfs_key_t buf_dblwr_mutex_key;
+extern mysql_pfs_key_t trx_pool_mutex_key;
+extern mysql_pfs_key_t trx_pool_manager_mutex_key;
+extern mysql_pfs_key_t lock_wait_mutex_key;
+extern mysql_pfs_key_t srv_threads_mutex_key;
+# endif /* UNIV_PFS_MUTEX */
+
+# ifdef UNIV_PFS_RWLOCK
+extern mysql_pfs_key_t dict_operation_lock_key;
+extern mysql_pfs_key_t fil_space_latch_key;
+extern mysql_pfs_key_t trx_i_s_cache_lock_key;
+extern mysql_pfs_key_t trx_purge_latch_key;
+extern mysql_pfs_key_t index_tree_rw_lock_key;
+extern mysql_pfs_key_t index_online_log_key;
+extern mysql_pfs_key_t trx_sys_rw_lock_key;
+extern mysql_pfs_key_t lock_latch_key;
+extern mysql_pfs_key_t trx_rseg_latch_key;
+# endif /* UNIV_PFS_RWLOCK */
+#endif /* HAVE_PSI_INTERFACE */
diff --git a/storage/innobase/include/ut0counter.h b/storage/innobase/include/ut0counter.h
index 646a5f367c2..d6589cc4fd3 100644
--- a/storage/innobase/include/ut0counter.h
+++ b/storage/innobase/include/ut0counter.h
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 2012, 2015, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2017, 2019, MariaDB Corporation.
+Copyright (c) 2017, 2022, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -28,19 +28,9 @@ Created 2012/04/12 by Sunny Bains
#ifndef ut0counter_h
#define ut0counter_h
-#include "os0thread.h"
+#include "univ.i"
#include "my_rdtsc.h"
-/** CPU cache line size */
-#ifdef CPU_LEVEL1_DCACHE_LINESIZE
-# define CACHE_LINE_SIZE CPU_LEVEL1_DCACHE_LINESIZE
-#else
-# error CPU_LEVEL1_DCACHE_LINESIZE is undefined
-#endif /* CPU_LEVEL1_DCACHE_LINESIZE */
-
-/** Default number of slots to use in ib_counter_t */
-#define IB_N_SLOTS 64
-
/** Use the result of my_timer_cycles(), which mainly uses RDTSC for cycles
as a random value. See the comments for my_timer_cycles() */
/** @return result from RDTSC or similar functions. */
@@ -56,7 +46,7 @@ get_rnd_value()
/* We may go here if my_timer_cycles() returns 0,
so we have to have the plan B for the counter. */
#if !defined(_WIN32)
- return (size_t)os_thread_get_curr_id();
+ return (size_t)pthread_self();
#else
LARGE_INTEGER cnt;
QueryPerformanceCounter(&cnt);
@@ -65,14 +55,34 @@ get_rnd_value()
#endif /* !_WIN32 */
}
+/** Atomic which occupies whole CPU cache line.
+Note: We rely on the default constructor of std::atomic and
+do not explicitly initialize the contents. This works for us,
+because ib_counter_t is only intended for usage with global
+memory that is allocated from the .bss and thus guaranteed to
+be zero-initialized by the run-time environment.
+@see srv_stats */
+template <typename Type>
+struct ib_atomic_counter_element_t {
+ alignas(CPU_LEVEL1_DCACHE_LINESIZE) Atomic_relaxed<Type> value;
+};
+
+template <typename Type>
+struct ib_counter_element_t {
+ alignas(CPU_LEVEL1_DCACHE_LINESIZE) Type value;
+};
+
+
/** Class for using fuzzy counters. The counter is multi-instance relaxed atomic
so the results are not guaranteed to be 100% accurate but close
-enough. Creates an array of counters and separates each element by the
-CACHE_LINE_SIZE bytes */
-template <typename Type, int N = IB_N_SLOTS>
+enough. */
+template <typename Type,
+ template <typename T> class Element = ib_atomic_counter_element_t,
+ int N = 128 >
struct ib_counter_t {
/** Increment the counter by 1. */
void inc() { add(1); }
+ ib_counter_t& operator++() { inc(); return *this; }
/** Increment the counter by 1.
@param[in] index a reasonably thread-unique identifier */
@@ -85,12 +95,12 @@ struct ib_counter_t {
/** Add to the counter.
@param[in] index a reasonably thread-unique identifier
@param[in] n amount to be added */
- void add(size_t index, Type n) {
+ TPOOL_SUPPRESS_TSAN void add(size_t index, Type n) {
index = index % N;
ut_ad(index < UT_ARR_SIZE(m_counter));
- m_counter[index].value.fetch_add(n, std::memory_order_relaxed);
+ m_counter[index].value += n;
}
/* @return total value - not 100% accurate, since it is relaxed atomic*/
@@ -98,28 +108,16 @@ struct ib_counter_t {
Type total = 0;
for (const auto &counter : m_counter) {
- total += counter.value.load(std::memory_order_relaxed);
+ total += counter.value;
}
return(total);
}
private:
- /** Atomic which occupies whole CPU cache line.
- Note: We rely on the default constructor of std::atomic and
- do not explicitly initialize the contents. This works for us,
- because ib_counter_t is only intended for usage with global
- memory that is allocated from the .bss and thus guaranteed to
- be zero-initialized by the run-time environment.
- @see srv_stats
- @see rw_lock_stats */
- struct ib_counter_element_t {
- MY_ALIGNED(CACHE_LINE_SIZE) std::atomic<Type> value;
- };
- static_assert(sizeof(ib_counter_element_t) == CACHE_LINE_SIZE, "");
-
+ static_assert(sizeof(Element<Type>) == CPU_LEVEL1_DCACHE_LINESIZE, "");
/** Array of counter elements */
- MY_ALIGNED(CACHE_LINE_SIZE) ib_counter_element_t m_counter[N];
+ alignas(CPU_LEVEL1_DCACHE_LINESIZE) Element<Type> m_counter[N];
};
#endif /* ut0counter_h */
diff --git a/storage/innobase/include/ut0new.h b/storage/innobase/include/ut0new.h
index 4c8d2cf7a61..f4183e4c61a 100644
--- a/storage/innobase/include/ut0new.h
+++ b/storage/innobase/include/ut0new.h
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 2014, 2015, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2017, 2020, MariaDB Corporation.
+Copyright (c) 2017, 2021, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -120,9 +120,8 @@ InnoDB:
#ifndef ut0new_h
#define ut0new_h
-#include <algorithm> /* std::min() */
#include <limits> /* std::numeric_limits */
-#include <map> /* std::map */
+#include <thread>
#include <stddef.h>
#include <stdlib.h> /* malloc() */
@@ -136,8 +135,7 @@ InnoDB:
#include "mysql/psi/psi_memory.h" /* PSI_memory_key, PSI_memory_info */
-#include "os0thread.h" /* os_thread_sleep() */
-#include "ut0ut.h" /* ut_strcmp_functor, ut_basename_noext() */
+#include "ut0ut.h" /* ut_strcmp_functor */
#define OUT_OF_MEMORY_MSG \
"Check if you should increase the swap file or ulimits of your" \
@@ -381,7 +379,7 @@ public:
break;
}
- os_thread_sleep(1000000 /* 1 second */);
+ std::this_thread::sleep_for(std::chrono::seconds(1));
}
if (ptr == NULL) {
@@ -516,7 +514,7 @@ public:
break;
}
- os_thread_sleep(1000000 /* 1 second */);
+ std::this_thread::sleep_for(std::chrono::seconds(1));
}
if (pfx_new == NULL) {
@@ -843,6 +841,8 @@ constexpr const char* const auto_event_names[] =
"buf0buf",
"buf0dblwr",
"buf0dump",
+ "buf0lru",
+ "buf0rea",
"dict0dict",
"dict0mem",
"dict0stats",
@@ -868,7 +868,6 @@ constexpr const char* const auto_event_names[] =
"lexyy",
"lock0lock",
"mem0mem",
- "os0event",
"os0file",
"pars0lex",
"rem0rec",
@@ -879,11 +878,6 @@ constexpr const char* const auto_event_names[] =
"row0mysql",
"row0sel",
"srv0start",
- "sync0arr",
- "sync0debug",
- "sync0rw",
- "sync0start",
- "sync0types",
"trx0i_s",
"trx0i_s",
"trx0roll",
diff --git a/storage/innobase/include/ut0pool.h b/storage/innobase/include/ut0pool.h
index e0a1f7c04ca..63628cc169f 100644
--- a/storage/innobase/include/ut0pool.h
+++ b/storage/innobase/include/ut0pool.h
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 2013, 2014, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2018, 2020, MariaDB Corporation.
+Copyright (c) 2018, 2022, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -31,7 +31,7 @@ Created 2012-Feb-26 Sunny Bains
#include <queue>
#include <functional>
-#include "ut0new.h"
+#include <my_global.h>
/** Allocate the memory for the object in blocks. We keep the objects sorted
on pointer so that they are closer together in case they have to be iterated
@@ -41,8 +41,6 @@ struct Pool {
typedef Type value_type;
- // FIXME: Add an assertion to check alignment and offset is
- // as we expect it. Also, sizeof(void*) can be 8, can we impove on this.
struct Element {
Pool* m_pool;
value_type m_type;
@@ -57,17 +55,23 @@ struct Pool {
m_size(size),
m_last()
{
+ ut_ad(ut_is_2pow(size));
ut_a(size >= sizeof(Element));
+ static_assert(!(sizeof(Element) % CPU_LEVEL1_DCACHE_LINESIZE),
+ "alignment");
m_lock_strategy.create();
ut_a(m_start == 0);
- m_start = reinterpret_cast<Element*>(ut_zalloc_nokey(m_size));
+ m_start = static_cast<Element*>(
+ aligned_malloc(m_size, CPU_LEVEL1_DCACHE_LINESIZE));
+ memset_aligned<CPU_LEVEL1_DCACHE_LINESIZE>(
+ m_start, 0, m_size);
m_last = m_start;
- m_end = &m_start[m_size / sizeof(*m_start)];
+ m_end = &m_start[m_size / sizeof *m_start];
/* Note: Initialise only a small subset, even though we have
allocated all the memory. This is required only because PFS
@@ -90,7 +94,7 @@ struct Pool {
Factory::destroy(&elem->m_type);
}
- ut_free(m_start);
+ IF_WIN(_aligned_free,free)(m_start);
m_end = m_last = m_start = 0;
m_size = 0;
}
@@ -254,7 +258,8 @@ struct PoolManager {
except crash and burn, however lets
be a little optimistic and wait for
a resource to be freed. */
- os_thread_sleep(delay * 1000000);
+ std::this_thread::sleep_for(
+ std::chrono::seconds(delay));
if (delay < 32) {
delay <<= 1;
diff --git a/storage/innobase/include/ut0rnd.h b/storage/innobase/include/ut0rnd.h
index dba8d3f1a06..511eb21fd11 100644
--- a/storage/innobase/include/ut0rnd.h
+++ b/storage/innobase/include/ut0rnd.h
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2019, 2020, MariaDB Corporation.
+Copyright (c) 2019, 2021, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -90,15 +90,6 @@ ut_fold_ull(
/*========*/
ib_uint64_t d) /*!< in: 64-bit integer */
MY_ATTRIBUTE((const));
-/*************************************************************//**
-Folds a character string ending in the null character.
-@return folded value */
-UNIV_INLINE
-ulint
-ut_fold_string(
-/*===========*/
- const char* str) /*!< in: null-terminated string */
- MY_ATTRIBUTE((warn_unused_result));
/***********************************************************//**
Looks for a prime number slightly greater than the given argument.
The prime is chosen so that it is not near any power of 2.
diff --git a/storage/innobase/include/ut0rnd.inl b/storage/innobase/include/ut0rnd.inl
index c0105160a42..37da323f8f3 100644
--- a/storage/innobase/include/ut0rnd.inl
+++ b/storage/innobase/include/ut0rnd.inl
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2017, 2019, MariaDB Corporation.
+Copyright (c) 2017, 2021, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -59,28 +59,6 @@ ut_fold_ull(
return(ut_fold_ulint_pair((ulint) d & ULINT32_MASK,
(ulint) (d >> 32)));
}
-
-/*************************************************************//**
-Folds a character string ending in the null character.
-@return folded value */
-UNIV_INLINE
-ulint
-ut_fold_string(
-/*===========*/
- const char* str) /*!< in: null-terminated string */
-{
- ulint fold = 0;
-
- ut_ad(str);
-
- while (*str != '\0') {
- fold = ut_fold_ulint_pair(fold, (ulint)(*str));
- str++;
- }
-
- return(fold);
-}
-
#endif /* !UNIV_INNOCHECKSUM */
/*************************************************************//**
diff --git a/storage/innobase/include/ut0ut.h b/storage/innobase/include/ut0ut.h
index 369f3f8c5d3..b7625b512a2 100644
--- a/storage/innobase/include/ut0ut.h
+++ b/storage/innobase/include/ut0ut.h
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2019, 2020, MariaDB Corporation.
+Copyright (c) 2019, 2021, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -276,25 +276,6 @@ ut_strerr(
#endif /* !UNIV_INNOCHECKSUM */
-#ifdef UNIV_PFS_MEMORY
-
-/** Extract the basename of a file without its extension.
-For example, extract "foo0bar" out of "/path/to/foo0bar.cc".
-@param[in] file file path, e.g. "/path/to/foo0bar.cc"
-@param[out] base result, e.g. "foo0bar"
-@param[in] base_size size of the output buffer 'base', if there
-is not enough space, then the result will be truncated, but always
-'\0'-terminated
-@return number of characters that would have been printed if the size
-were unlimited (not including the final ‘\0’) */
-size_t
-ut_basename_noext(
- const char* file,
- char* base,
- size_t base_size);
-
-#endif /* UNIV_PFS_MEMORY */
-
namespace ib {
/** This is a wrapper class, used to print any unsigned integer type
diff --git a/storage/innobase/include/ut0wqueue.h b/storage/innobase/include/ut0wqueue.h
index 26838c95443..95c7a248f7a 100644
--- a/storage/innobase/include/ut0wqueue.h
+++ b/storage/innobase/include/ut0wqueue.h
@@ -30,8 +30,7 @@ wait for work items to be available and take them off the queue for
processing.
************************************************************************/
-#ifndef IB_WORK_QUEUE_H
-#define IB_WORK_QUEUE_H
+#pragma once
#include "ut0list.h"
#include "mem0mem.h"
@@ -42,12 +41,12 @@ struct ib_list_t;
/** Work queue */
struct ib_wqueue_t
{
- /** Mutex protecting everything */
- ib_mutex_t mutex;
- /** Work item list */
- ib_list_t* items;
- /** ib_list_len(*items) */
- size_t length;
+ /** Mutex protecting everything */
+ mysql_mutex_t mutex;
+ /** Work item list */
+ ib_list_t *items;
+ /** ib_list_len(*items) */
+ size_t length;
};
/****************************************************************//**
@@ -85,5 +84,3 @@ void*
ib_wqueue_nowait(
/*=============*/
ib_wqueue_t* wq); /*<! in: work queue */
-
-#endif /* IB_WORK_QUEUE_H */
diff --git a/storage/innobase/lock/lock0iter.cc b/storage/innobase/lock/lock0iter.cc
index 7a7130eddb9..0cd271bfd8d 100644
--- a/storage/innobase/lock/lock0iter.cc
+++ b/storage/innobase/lock/lock0iter.cc
@@ -1,6 +1,7 @@
/*****************************************************************************
Copyright (c) 2007, 2014, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2020, 2021, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -48,27 +49,20 @@ lock_queue_iterator_reset(
ulint bit_no) /*!< in: record number in the
heap */
{
- ut_ad(lock_mutex_own());
-
- iter->current_lock = lock;
-
- if (bit_no != ULINT_UNDEFINED) {
-
- iter->bit_no = bit_no;
- } else {
-
- switch (lock_get_type_low(lock)) {
- case LOCK_TABLE:
- iter->bit_no = ULINT_UNDEFINED;
- break;
- case LOCK_REC:
- iter->bit_no = lock_rec_find_set_bit(lock);
- ut_a(iter->bit_no != ULINT_UNDEFINED);
- break;
- default:
- ut_error;
- }
- }
+ lock_sys.assert_locked(*lock);
+
+ iter->current_lock = lock;
+
+ if (bit_no != ULINT_UNDEFINED);
+ else if (lock->is_table())
+ bit_no= ULINT_UNDEFINED;
+ else
+ {
+ bit_no= lock_rec_find_set_bit(lock);
+ ut_ad(bit_no != ULINT_UNDEFINED);
+ }
+
+ iter->bit_no= bit_no;
}
/*******************************************************************//**
@@ -81,27 +75,14 @@ lock_queue_iterator_get_prev(
/*=========================*/
lock_queue_iterator_t* iter) /*!< in/out: iterator */
{
- const lock_t* prev_lock;
-
- ut_ad(lock_mutex_own());
-
- switch (lock_get_type_low(iter->current_lock)) {
- case LOCK_REC:
- prev_lock = lock_rec_get_prev(
- iter->current_lock, iter->bit_no);
- break;
- case LOCK_TABLE:
- prev_lock = UT_LIST_GET_PREV(
- un_member.tab_lock.locks, iter->current_lock);
- break;
- default:
- ut_error;
- }
+ lock_sys.assert_locked(*iter->current_lock);
- if (prev_lock != NULL) {
+ const lock_t *prev_lock= !iter->current_lock->is_table()
+ ? lock_rec_get_prev(iter->current_lock, iter->bit_no)
+ : UT_LIST_GET_PREV(un_member.tab_lock.locks, iter->current_lock);
- iter->current_lock = prev_lock;
- }
+ if (prev_lock)
+ iter->current_lock= prev_lock;
- return(prev_lock);
+ return prev_lock;
}
diff --git a/storage/innobase/lock/lock0lock.cc b/storage/innobase/lock/lock0lock.cc
index 8fedd8a68d6..3c7c3d348af 100644
--- a/storage/innobase/lock/lock0lock.cc
+++ b/storage/innobase/lock/lock0lock.cc
@@ -29,6 +29,7 @@ Created 5/7/1996 Heikki Tuuri
#include "univ.i"
#include <mysql/service_thd_error_context.h>
+#include <mysql/service_thd_wait.h>
#include <sql_class.h>
#include "lock0lock.h"
@@ -42,6 +43,10 @@ Created 5/7/1996 Heikki Tuuri
#include "row0mysql.h"
#include "row0vers.h"
#include "pars0pars.h"
+#include "srv0mon.h"
+#include "que0que.h"
+#include "scope.h"
+#include <debug_sync.h>
#include <set>
@@ -49,272 +54,283 @@ Created 5/7/1996 Heikki Tuuri
#include <mysql/service_wsrep.h>
#endif /* WITH_WSREP */
-/** Lock scheduling algorithm */
-ulong innodb_lock_schedule_algorithm;
-
/** The value of innodb_deadlock_detect */
-my_bool innobase_deadlock_detect;
-
-/*********************************************************************//**
-Checks if a waiting record lock request still has to wait in a queue.
-@return lock that is causing the wait */
-static
-const lock_t*
-lock_rec_has_to_wait_in_queue(
-/*==========================*/
- const lock_t* wait_lock); /*!< in: waiting record lock */
-
-/** Grant a lock to a waiting lock request and release the waiting transaction
-after lock_reset_lock_and_trx_wait() has been called. */
-static void lock_grant_after_reset(lock_t* lock);
+my_bool innodb_deadlock_detect;
+/** The value of innodb_deadlock_report */
+ulong innodb_deadlock_report;
+#ifdef HAVE_REPLICATION
extern "C" void thd_rpl_deadlock_check(MYSQL_THD thd, MYSQL_THD other_thd);
extern "C" int thd_need_wait_reports(const MYSQL_THD thd);
extern "C" int thd_need_ordering_with(const MYSQL_THD thd, const MYSQL_THD other_thd);
+#endif
-/** Pretty-print a table lock.
-@param[in,out] file output stream
-@param[in] lock table lock */
-static void lock_table_print(FILE* file, const lock_t* lock);
-
-/** Pretty-print a record lock.
-@param[in,out] file output stream
-@param[in] lock record lock
-@param[in,out] mtr mini-transaction for accessing the record */
-static void lock_rec_print(FILE* file, const lock_t* lock, mtr_t& mtr);
-
-/** Deadlock checker. */
-class DeadlockChecker {
-public:
- /** Check if a joining lock request results in a deadlock.
- If a deadlock is found, we will resolve the deadlock by
- choosing a victim transaction and rolling it back.
- We will attempt to resolve all deadlocks.
-
- @param[in] lock the lock request
- @param[in,out] trx transaction requesting the lock
-
- @return trx if it was chosen as victim
- @retval NULL if another victim was chosen,
- or there is no deadlock (any more) */
- static const trx_t* check_and_resolve(const lock_t* lock, trx_t* trx);
-
-private:
- /** Do a shallow copy. Default destructor OK.
- @param trx the start transaction (start node)
- @param wait_lock lock that a transaction wants
- @param mark_start visited node counter
- @param report_waiters whether to call thd_rpl_deadlock_check() */
- DeadlockChecker(
- const trx_t* trx,
- const lock_t* wait_lock,
- ib_uint64_t mark_start,
- bool report_waiters)
- :
- m_cost(),
- m_start(trx),
- m_too_deep(),
- m_wait_lock(wait_lock),
- m_mark_start(mark_start),
- m_n_elems(),
- m_report_waiters(report_waiters)
- {
- }
-
- /** Check if the search is too deep. */
- bool is_too_deep() const
- {
- return(m_n_elems > LOCK_MAX_DEPTH_IN_DEADLOCK_CHECK
- || m_cost > LOCK_MAX_N_STEPS_IN_DEADLOCK_CHECK);
- }
-
- /** Save current state.
- @param lock lock to push on the stack.
- @param heap_no the heap number to push on the stack.
- @return false if stack is full. */
- bool push(const lock_t* lock, ulint heap_no)
- {
- ut_ad((lock_get_type_low(lock) & LOCK_REC)
- || (lock_get_type_low(lock) & LOCK_TABLE));
-
- ut_ad(((lock_get_type_low(lock) & LOCK_TABLE) != 0)
- == (heap_no == ULINT_UNDEFINED));
-
- /* Ensure that the stack is bounded. */
- if (m_n_elems >= UT_ARR_SIZE(s_states)) {
- return(false);
- }
-
- state_t& state = s_states[m_n_elems++];
-
- state.m_lock = lock;
- state.m_wait_lock = m_wait_lock;
- state.m_heap_no =heap_no;
-
- return(true);
- }
-
- /** Restore state.
- @param[out] lock current lock
- @param[out] heap_no current heap_no */
- void pop(const lock_t*& lock, ulint& heap_no)
- {
- ut_a(m_n_elems > 0);
-
- const state_t& state = s_states[--m_n_elems];
-
- lock = state.m_lock;
- heap_no = state.m_heap_no;
- m_wait_lock = state.m_wait_lock;
- }
-
- /** Check whether the node has been visited.
- @param lock lock to check
- @return true if the node has been visited */
- bool is_visited(const lock_t* lock) const
- {
- return(lock->trx->lock.deadlock_mark > m_mark_start);
- }
-
- /** Get the next lock in the queue that is owned by a transaction
- whose sub-tree has not already been searched.
- Note: "next" here means PREV for table locks.
- @param lock Lock in queue
- @param heap_no heap_no if lock is a record lock else ULINT_UNDEFINED
- @return next lock or NULL if at end of queue */
- const lock_t* get_next_lock(const lock_t* lock, ulint heap_no) const;
-
- /** Get the first lock to search. The search starts from the current
- wait_lock. What we are really interested in is an edge from the
- current wait_lock's owning transaction to another transaction that has
- a lock ahead in the queue. We skip locks where the owning transaction's
- sub-tree has already been searched.
-
- Note: The record locks are traversed from the oldest lock to the
- latest. For table locks we go from latest to oldest.
-
- For record locks, we first position the iterator on first lock on
- the page and then reposition on the actual heap_no. This is required
- due to the way the record lock has is implemented.
-
- @param[out] heap_no if rec lock, else ULINT_UNDEFINED.
-
- @return first lock or NULL */
- const lock_t* get_first_lock(ulint* heap_no) const;
-
- /** Notify that a deadlock has been detected and print the conflicting
- transaction info.
- @param lock lock causing deadlock */
- void notify(const lock_t* lock) const;
-
- /** Select the victim transaction that should be rolledback.
- @return victim transaction */
- const trx_t* select_victim() const;
+/** Functor for accessing the embedded node within a table lock. */
+struct TableLockGetNode
+{
+ ut_list_node<lock_t> &operator()(lock_t &elem)
+ { return(elem.un_member.tab_lock.locks); }
+};
- /** Rollback transaction selected as the victim. */
- void trx_rollback();
+/** Create the hash table.
+@param n the lower bound of n_cells */
+void lock_sys_t::hash_table::create(ulint n)
+{
+ n_cells= ut_find_prime(n);
+ const size_t size= MY_ALIGN(pad(n_cells) * sizeof *array,
+ CPU_LEVEL1_DCACHE_LINESIZE);
+ void *v= aligned_malloc(size, CPU_LEVEL1_DCACHE_LINESIZE);
+ memset_aligned<CPU_LEVEL1_DCACHE_LINESIZE>(v, 0, size);
+ array= static_cast<hash_cell_t*>(v);
+}
- /** Looks iteratively for a deadlock. Note: the joining transaction
- may have been granted its lock by the deadlock checks.
+/** Resize the hash table.
+@param n the lower bound of n_cells */
+void lock_sys_t::hash_table::resize(ulint n)
+{
+ ut_ad(lock_sys.is_writer());
+ ulint new_n_cells= ut_find_prime(n);
+ const size_t size= MY_ALIGN(pad(new_n_cells) * sizeof *array,
+ CPU_LEVEL1_DCACHE_LINESIZE);
+ void *v= aligned_malloc(size, CPU_LEVEL1_DCACHE_LINESIZE);
+ memset_aligned<CPU_LEVEL1_DCACHE_LINESIZE>(v, 0, size);
+ hash_cell_t *new_array= static_cast<hash_cell_t*>(v);
- @return 0 if no deadlock else the victim transaction.*/
- const trx_t* search();
+ for (auto i= pad(n_cells); i--; )
+ {
+ if (lock_t *lock= static_cast<lock_t*>(array[i].node))
+ {
+ /* all hash_latch must vacated */
+ ut_ad(i % (ELEMENTS_PER_LATCH + LATCH) >= LATCH);
+ do
+ {
+ ut_ad(!lock->is_table());
+ hash_cell_t *c= calc_hash(lock->un_member.rec_lock.page_id.fold(),
+ new_n_cells) + new_array;
+ lock_t *next= lock->hash;
+ lock->hash= nullptr;
+ if (!c->node)
+ c->node= lock;
+ else if (!lock->is_waiting())
+ {
+ lock->hash= static_cast<lock_t*>(c->node);
+ c->node= lock;
+ }
+ else
+ {
+ lock_t *next= static_cast<lock_t*>(c->node);
+ while (next->hash)
+ next= next->hash;
+ next->hash= lock;
+ }
+ lock= next;
+ }
+ while (lock);
+ }
+ }
- /** Print transaction data to the deadlock file and possibly to stderr.
- @param trx transaction
- @param max_query_len max query length to print */
- static void print(const trx_t* trx, ulint max_query_len);
+ aligned_free(array);
+ array= new_array;
+ n_cells= new_n_cells;
+}
- /** rewind(3) the file used for storing the latest detected deadlock
- and print a heading message to stderr if printing of all deadlocks to
- stderr is enabled. */
- static void start_print();
+#ifdef SUX_LOCK_GENERIC
+void lock_sys_t::hash_latch::wait()
+{
+ pthread_mutex_lock(&lock_sys.hash_mutex);
+ while (!write_trylock())
+ pthread_cond_wait(&lock_sys.hash_cond, &lock_sys.hash_mutex);
+ pthread_mutex_unlock(&lock_sys.hash_mutex);
+}
- /** Print lock data to the deadlock file and possibly to stderr.
- @param lock record or table type lock */
- static void print(const lock_t* lock);
+void lock_sys_t::hash_latch::release()
+{
+ pthread_mutex_lock(&lock_sys.hash_mutex);
+ write_unlock();
+ pthread_cond_signal(&lock_sys.hash_cond);
+ pthread_mutex_unlock(&lock_sys.hash_mutex);
+}
+#endif
- /** Print a message to the deadlock file and possibly to stderr.
- @param msg message to print */
- static void print(const char* msg);
+#ifdef UNIV_DEBUG
+/** Assert that a lock shard is exclusively latched by this thread */
+void lock_sys_t::assert_locked(const lock_t &lock) const
+{
+ ut_ad(this == &lock_sys);
+ if (is_writer())
+ return;
+ if (lock.is_table())
+ assert_locked(*lock.un_member.tab_lock.table);
+ else
+ lock_sys.hash_get(lock.type_mode).
+ assert_locked(lock.un_member.rec_lock.page_id);
+}
- /** Print info about transaction that was rolled back.
- @param trx transaction rolled back
- @param lock lock trx wants */
- static void rollback_print(const trx_t* trx, const lock_t* lock);
+/** Assert that a table lock shard is exclusively latched by this thread */
+void lock_sys_t::assert_locked(const dict_table_t &table) const
+{
+ ut_ad(!table.is_temporary());
+ if (is_writer())
+ return;
+ ut_ad(readers);
+ ut_ad(table.lock_mutex_is_owner());
+}
-private:
- /** DFS state information, used during deadlock checking. */
- struct state_t {
- const lock_t* m_lock; /*!< Current lock */
- const lock_t* m_wait_lock; /*!< Waiting for lock */
- ulint m_heap_no; /*!< heap number if rec lock */
- };
+/** Assert that hash cell for page is exclusively latched by this thread */
+void lock_sys_t::hash_table::assert_locked(const page_id_t id) const
+{
+ if (lock_sys.is_writer())
+ return;
+ ut_ad(lock_sys.readers);
+ ut_ad(latch(cell_get(id.fold()))->is_locked());
+}
- /** Used in deadlock tracking. Protected by lock_sys.mutex. */
- static ib_uint64_t s_lock_mark_counter;
+/** Assert that a hash table cell is exclusively latched (by some thread) */
+void lock_sys_t::assert_locked(const hash_cell_t &cell) const
+{
+ if (is_writer())
+ return;
+ ut_ad(lock_sys.readers);
+ ut_ad(hash_table::latch(const_cast<hash_cell_t*>(&cell))->is_locked());
+}
+#endif
- /** Calculation steps thus far. It is the count of the nodes visited. */
- ulint m_cost;
+LockGuard::LockGuard(lock_sys_t::hash_table &hash, page_id_t id)
+{
+ const auto id_fold= id.fold();
+ lock_sys.rd_lock(SRW_LOCK_CALL);
+ cell_= hash.cell_get(id_fold);
+ hash.latch(cell_)->acquire();
+}
- /** Joining transaction that is requesting a lock in an
- incompatible mode */
- const trx_t* m_start;
+LockMultiGuard::LockMultiGuard(lock_sys_t::hash_table &hash,
+ const page_id_t id1, const page_id_t id2)
+{
+ ut_ad(id1.space() == id2.space());
+ const auto id1_fold= id1.fold(), id2_fold= id2.fold();
+ lock_sys.rd_lock(SRW_LOCK_CALL);
+ cell1_= hash.cell_get(id1_fold);
+ cell2_= hash.cell_get(id2_fold);
- /** TRUE if search was too deep and was aborted */
- bool m_too_deep;
+ auto latch1= hash.latch(cell1_), latch2= hash.latch(cell2_);
+ if (latch1 > latch2)
+ std::swap(latch1, latch2);
+ latch1->acquire();
+ if (latch1 != latch2)
+ latch2->acquire();
+}
- /** Lock that trx wants */
- const lock_t* m_wait_lock;
+LockMultiGuard::~LockMultiGuard()
+{
+ auto latch1= lock_sys_t::hash_table::latch(cell1_),
+ latch2= lock_sys_t::hash_table::latch(cell2_);
+ latch1->release();
+ if (latch1 != latch2)
+ latch2->release();
+ /* Must be last, to avoid a race with lock_sys_t::hash_table::resize() */
+ lock_sys.rd_unlock();
+}
- /** Value of lock_mark_count at the start of the deadlock check. */
- ib_uint64_t m_mark_start;
+TRANSACTIONAL_TARGET
+TMLockGuard::TMLockGuard(lock_sys_t::hash_table &hash, page_id_t id)
+{
+ const auto id_fold= id.fold();
+#if !defined NO_ELISION && !defined SUX_LOCK_GENERIC
+ if (xbegin())
+ {
+ if (lock_sys.latch.is_write_locked())
+ xabort();
+ cell_= hash.cell_get(id_fold);
+ if (hash.latch(cell_)->is_locked())
+ xabort();
+ elided= true;
+ return;
+ }
+ elided= false;
+#endif
+ lock_sys.rd_lock(SRW_LOCK_CALL);
+ cell_= hash.cell_get(id_fold);
+ hash.latch(cell_)->acquire();
+}
- /** Number of states pushed onto the stack */
- size_t m_n_elems;
+/** Pretty-print a table lock.
+@param[in,out] file output stream
+@param[in] lock table lock */
+static void lock_table_print(FILE* file, const lock_t* lock);
- /** This is to avoid malloc/free calls. */
- static state_t s_states[MAX_STACK_SIZE];
+/** Pretty-print a record lock.
+@param[in,out] file output stream
+@param[in] lock record lock
+@param[in,out] mtr mini-transaction for accessing the record */
+static void lock_rec_print(FILE* file, const lock_t* lock, mtr_t& mtr);
- /** Set if thd_rpl_deadlock_check() should be called for waits. */
- const bool m_report_waiters;
+namespace Deadlock
+{
+ /** Whether to_check may be nonempty */
+ static Atomic_relaxed<bool> to_be_checked;
+ /** Transactions to check for deadlock. Protected by lock_sys.wait_mutex. */
+ static std::set<trx_t*> to_check;
+
+ MY_ATTRIBUTE((nonnull, warn_unused_result))
+ /** Check if a lock request results in a deadlock.
+ Resolve a deadlock by choosing a transaction that will be rolled back.
+ @param trx transaction requesting a lock
+ @return whether trx must report DB_DEADLOCK */
+ static bool check_and_resolve(trx_t *trx);
+
+ /** Quickly detect a deadlock using Brent's cycle detection algorithm.
+ @param trx transaction that is waiting for another transaction
+ @return a transaction that is part of a cycle
+ @retval nullptr if no cycle was found */
+ inline trx_t *find_cycle(trx_t *trx)
+ {
+ mysql_mutex_assert_owner(&lock_sys.wait_mutex);
+ trx_t *tortoise= trx, *hare= trx;
+ for (unsigned power= 1, l= 1; (hare= hare->lock.wait_trx) != nullptr; l++)
+ {
+ if (tortoise == hare)
+ {
+ ut_ad(l > 1);
+ lock_sys.deadlocks++;
+ /* Note: Normally, trx should be part of any deadlock cycle
+ that is found. However, if innodb_deadlock_detect=OFF had been
+ in effect in the past, it is possible that trx will be waiting
+ for a transaction that participates in a pre-existing deadlock
+ cycle. In that case, our victim will not be trx. */
+ return hare;
+ }
+ if (l == power)
+ {
+ /* The maximum concurrent number of TRX_STATE_ACTIVE transactions
+ is TRX_RSEG_N_SLOTS * 128, or innodb_page_size / 16 * 128
+ (default: 131,072, maximum: 524,288).
+ Our maximum possible number of iterations should be twice that. */
+ power<<= 1;
+ l= 0;
+ tortoise= hare;
+ }
+ }
+ return nullptr;
+ }
};
-/** Counter to mark visited nodes during deadlock search. */
-ib_uint64_t DeadlockChecker::s_lock_mark_counter = 0;
-
-/** The stack used for deadlock searches. */
-DeadlockChecker::state_t DeadlockChecker::s_states[MAX_STACK_SIZE];
-
#ifdef UNIV_DEBUG
-/*********************************************************************//**
-Validates the lock system.
-@return TRUE if ok */
-static
-bool
-lock_validate();
-/*============*/
+/** Validate the transactional locks. */
+static void lock_validate();
-/*********************************************************************//**
-Validates the record lock queues on a page.
-@return TRUE if ok */
-static
-ibool
-lock_rec_validate_page(
-/*===================*/
- const buf_block_t* block) /*!< in: buffer block */
- MY_ATTRIBUTE((warn_unused_result));
+/** Validate the record lock queues on a page.
+@param block buffer pool block
+@param latched whether the tablespace latch may be held
+@return true if ok */
+static bool lock_rec_validate_page(const buf_block_t *block, bool latched)
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
#endif /* UNIV_DEBUG */
/* The lock system */
lock_sys_t lock_sys;
-/** We store info on the latest deadlock error to this buffer. InnoDB
-Monitor will then fetch it and print */
-static bool lock_deadlock_found = false;
-
-/** Only created if !srv_read_only_mode */
-static FILE* lock_latest_err_file;
+/** Only created if !srv_read_only_mode. Protected by lock_sys.latch. */
+static FILE *lock_latest_err_file;
/*********************************************************************//**
Reports that a transaction id is insensible, i.e., in the future. */
@@ -365,83 +381,6 @@ lock_check_trx_id_sanity(
return true;
}
-/*********************************************************************//**
-Checks that a record is seen in a consistent read.
-@return true if sees, or false if an earlier version of the record
-should be retrieved */
-bool
-lock_clust_rec_cons_read_sees(
-/*==========================*/
- const rec_t* rec, /*!< in: user record which should be read or
- passed over by a read cursor */
- dict_index_t* index, /*!< in: clustered index */
- const rec_offs* offsets,/*!< in: rec_get_offsets(rec, index) */
- ReadView* view) /*!< in: consistent read view */
-{
- ut_ad(dict_index_is_clust(index));
- ut_ad(page_rec_is_user_rec(rec));
- ut_ad(rec_offs_validate(rec, index, offsets));
- ut_ad(!rec_is_metadata(rec, *index));
-
- /* Temp-tables are not shared across connections and multiple
- transactions from different connections cannot simultaneously
- operate on same temp-table and so read of temp-table is
- always consistent read. */
- if (index->table->is_temporary()) {
- return(true);
- }
-
- /* NOTE that we call this function while holding the search
- system latch. */
-
- trx_id_t trx_id = row_get_rec_trx_id(rec, index, offsets);
-
- return(view->changes_visible(trx_id, index->table->name));
-}
-
-/*********************************************************************//**
-Checks that a non-clustered index record is seen in a consistent read.
-
-NOTE that a non-clustered index page contains so little information on
-its modifications that also in the case false, the present version of
-rec may be the right, but we must check this from the clustered index
-record.
-
-@return true if certainly sees, or false if an earlier version of the
-clustered index record might be needed */
-bool
-lock_sec_rec_cons_read_sees(
-/*========================*/
- const rec_t* rec, /*!< in: user record which
- should be read or passed over
- by a read cursor */
- const dict_index_t* index, /*!< in: index */
- const ReadView* view) /*!< in: consistent read view */
-{
- ut_ad(page_rec_is_user_rec(rec));
- ut_ad(!index->is_primary());
- ut_ad(!rec_is_metadata(rec, *index));
-
- /* NOTE that we might call this function while holding the search
- system latch. */
-
- if (index->table->is_temporary()) {
-
- /* Temp-tables are not shared across connections and multiple
- transactions from different connections cannot simultaneously
- operate on same temp-table and so read of temp-table is
- always consistent read. */
-
- return(true);
- }
-
- trx_id_t max_trx_id = page_get_max_trx_id(page_align(rec));
-
- ut_ad(max_trx_id > 0);
-
- return(view->sees(max_trx_id));
-}
-
/**
Creates the lock system at database start.
@@ -450,38 +389,66 @@ lock_sec_rec_cons_read_sees(
*/
void lock_sys_t::create(ulint n_cells)
{
- ut_ad(this == &lock_sys);
-
- m_initialised= true;
+ ut_ad(this == &lock_sys);
+ ut_ad(!is_initialised());
- waiting_threads = static_cast<srv_slot_t*>
- (ut_zalloc_nokey(srv_max_n_threads * sizeof *waiting_threads));
- last_slot = waiting_threads;
+ m_initialised= true;
- mutex_create(LATCH_ID_LOCK_SYS, &mutex);
-
- mutex_create(LATCH_ID_LOCK_SYS_WAIT, &wait_mutex);
+ latch.SRW_LOCK_INIT(lock_latch_key);
+#ifdef __aarch64__
+ mysql_mutex_init(lock_wait_mutex_key, &wait_mutex, MY_MUTEX_INIT_FAST);
+#else
+ mysql_mutex_init(lock_wait_mutex_key, &wait_mutex, nullptr);
+#endif
+#ifdef SUX_LOCK_GENERIC
+ pthread_mutex_init(&hash_mutex, nullptr);
+ pthread_cond_init(&hash_cond, nullptr);
+#endif
+ rec_hash.create(n_cells);
+ prdt_hash.create(n_cells);
+ prdt_page_hash.create(n_cells);
- rec_hash.create(n_cells);
- prdt_hash.create(n_cells);
- prdt_page_hash.create(n_cells);
+ if (!srv_read_only_mode)
+ {
+ lock_latest_err_file= os_file_create_tmpfile();
+ ut_a(lock_latest_err_file);
+ }
+}
- if (!srv_read_only_mode) {
- lock_latest_err_file = os_file_create_tmpfile();
- ut_a(lock_latest_err_file);
- }
- timeout_timer_active = false;
+#ifdef UNIV_PFS_RWLOCK
+/** Acquire exclusive lock_sys.latch */
+void lock_sys_t::wr_lock(const char *file, unsigned line)
+{
+ mysql_mutex_assert_not_owner(&wait_mutex);
+ latch.wr_lock(file, line);
+ ut_ad(!writer.exchange(pthread_self(), std::memory_order_relaxed));
+}
+/** Release exclusive lock_sys.latch */
+void lock_sys_t::wr_unlock()
+{
+ ut_ad(writer.exchange(0, std::memory_order_relaxed) ==
+ pthread_self());
+ latch.wr_unlock();
}
-/** Calculates the fold value of a lock: used in migrating the hash table.
-@param[in] lock record lock object
-@return folded value */
-static ulint lock_rec_lock_fold(const lock_t *lock)
+/** Acquire shared lock_sys.latch */
+void lock_sys_t::rd_lock(const char *file, unsigned line)
{
- return lock->un_member.rec_lock.page_id.fold();
+ mysql_mutex_assert_not_owner(&wait_mutex);
+ latch.rd_lock(file, line);
+ ut_ad(!writer.load(std::memory_order_relaxed));
+ ut_d(readers.fetch_add(1, std::memory_order_relaxed));
}
+/** Release shared lock_sys.latch */
+void lock_sys_t::rd_unlock()
+{
+ ut_ad(!writer.load(std::memory_order_relaxed));
+ ut_ad(readers.fetch_sub(1, std::memory_order_relaxed));
+ latch.rd_unlock();
+}
+#endif
/**
Resize the lock hash table.
@@ -490,123 +457,48 @@ static ulint lock_rec_lock_fold(const lock_t *lock)
*/
void lock_sys_t::resize(ulint n_cells)
{
- ut_ad(this == &lock_sys);
-
- mutex_enter(&mutex);
-
- hash_table_t old_hash(rec_hash);
- rec_hash.create(n_cells);
- HASH_MIGRATE(&old_hash, &rec_hash, lock_t, hash,
- lock_rec_lock_fold);
- old_hash.free();
-
- old_hash = prdt_hash;
- prdt_hash.create(n_cells);
- HASH_MIGRATE(&old_hash, &prdt_hash, lock_t, hash,
- lock_rec_lock_fold);
- old_hash.free();
-
- old_hash = prdt_page_hash;
- prdt_page_hash.create(n_cells);
- HASH_MIGRATE(&old_hash, &prdt_page_hash, lock_t, hash,
- lock_rec_lock_fold);
- old_hash.free();
- mutex_exit(&mutex);
+ ut_ad(this == &lock_sys);
+ /* Buffer pool resizing is rarely initiated by the user, and this
+ would exceed the maximum size of a memory transaction. */
+ LockMutexGuard g{SRW_LOCK_CALL};
+ rec_hash.resize(n_cells);
+ prdt_hash.resize(n_cells);
+ prdt_page_hash.resize(n_cells);
}
-
/** Closes the lock system at database shutdown. */
void lock_sys_t::close()
{
- ut_ad(this == &lock_sys);
-
- if (!m_initialised) return;
-
- if (lock_latest_err_file != NULL) {
- my_fclose(lock_latest_err_file, MYF(MY_WME));
- lock_latest_err_file = NULL;
- }
+ ut_ad(this == &lock_sys);
- rec_hash.free();
- prdt_hash.free();
- prdt_page_hash.free();
-
- mutex_destroy(&mutex);
- mutex_destroy(&wait_mutex);
-
- for (ulint i = srv_max_n_threads; i--; ) {
- if (os_event_t& event = waiting_threads[i].event) {
- os_event_destroy(event);
- }
- }
+ if (!m_initialised)
+ return;
- ut_free(waiting_threads);
- m_initialised= false;
-}
-
-/*********************************************************************//**
-Gets the size of a lock struct.
-@return size in bytes */
-ulint
-lock_get_size(void)
-/*===============*/
-{
- return((ulint) sizeof(lock_t));
-}
-
-static inline void lock_grant_have_trx_mutex(lock_t* lock)
-{
- lock_reset_lock_and_trx_wait(lock);
- lock_grant_after_reset(lock);
-}
-
-/*********************************************************************//**
-Gets the gap flag of a record lock.
-@return LOCK_GAP or 0 */
-UNIV_INLINE
-ulint
-lock_rec_get_gap(
-/*=============*/
- const lock_t* lock) /*!< in: record lock */
-{
- ut_ad(lock);
- ut_ad(lock_get_type_low(lock) == LOCK_REC);
-
- return(lock->type_mode & LOCK_GAP);
-}
+ if (lock_latest_err_file)
+ {
+ my_fclose(lock_latest_err_file, MYF(MY_WME));
+ lock_latest_err_file= nullptr;
+ }
-/*********************************************************************//**
-Gets the LOCK_REC_NOT_GAP flag of a record lock.
-@return LOCK_REC_NOT_GAP or 0 */
-UNIV_INLINE
-ulint
-lock_rec_get_rec_not_gap(
-/*=====================*/
- const lock_t* lock) /*!< in: record lock */
-{
- ut_ad(lock);
- ut_ad(lock_get_type_low(lock) == LOCK_REC);
+ rec_hash.free();
+ prdt_hash.free();
+ prdt_page_hash.free();
+#ifdef SUX_LOCK_GENERIC
+ pthread_mutex_destroy(&hash_mutex);
+ pthread_cond_destroy(&hash_cond);
+#endif
- return(lock->type_mode & LOCK_REC_NOT_GAP);
-}
+ latch.destroy();
+ mysql_mutex_destroy(&wait_mutex);
-/*********************************************************************//**
-Gets the waiting insert flag of a record lock.
-@return LOCK_INSERT_INTENTION or 0 */
-UNIV_INLINE
-ulint
-lock_rec_get_insert_intention(
-/*==========================*/
- const lock_t* lock) /*!< in: record lock */
-{
- ut_ad(lock);
- ut_ad(lock_get_type_low(lock) == LOCK_REC);
+ Deadlock::to_check.clear();
+ Deadlock::to_be_checked= false;
- return(lock->type_mode & LOCK_INSERT_INTENTION);
+ m_initialised= false;
}
-#ifdef UNIV_DEBUG
#ifdef WITH_WSREP
+# ifdef UNIV_DEBUG
/** Check if both conflicting lock transaction and other transaction
requesting record lock are brute force (BF). If they are check is
this BF-BF wait correct and if not report BF wait and assert.
@@ -616,12 +508,12 @@ this BF-BF wait correct and if not report BF wait and assert.
*/
static void wsrep_assert_no_bf_bf_wait(const lock_t *lock, const trx_t *trx)
{
- ut_ad(lock_get_type_low(lock) == LOCK_REC);
- ut_ad(lock_mutex_own());
+ ut_ad(!lock->is_table());
+ lock_sys.assert_locked(*lock);
trx_t* lock_trx= lock->trx;
- /* Note that we are holding lock_sys->mutex, thus we should
- not acquire THD::LOCK_thd_data mutex below to avoid mutexing
+ /* Note that we are holding lock_sys.latch, thus we should
+ not acquire THD::LOCK_thd_data mutex below to avoid latching
order violation. */
if (!trx->is_wsrep() || !lock_trx->is_wsrep())
@@ -632,18 +524,13 @@ static void wsrep_assert_no_bf_bf_wait(const lock_t *lock, const trx_t *trx)
ut_ad(trx->state == TRX_STATE_ACTIVE);
- trx_mutex_enter(lock_trx);
- const trx_state_t trx2_state= lock_trx->state;
- trx_mutex_exit(lock_trx);
-
- /* If transaction is already committed in memory or
- prepared we should wait. When transaction is committed in
- memory we held trx mutex, but not lock_sys->mutex. Therefore,
- we could end here before transaction has time to do
- lock_release() that is protected with lock_sys->mutex. */
- switch (trx2_state) {
+ switch (lock_trx->state) {
case TRX_STATE_COMMITTED_IN_MEMORY:
+ /* The state change is only protected by trx_t::mutex,
+ which we are not even holding here. */
case TRX_STATE_PREPARED:
+ /* Wait for lock->trx to complete the commit
+ (or XA ROLLBACK) and to release the lock. */
return;
case TRX_STATE_ACTIVE:
break;
@@ -679,8 +566,25 @@ static void wsrep_assert_no_bf_bf_wait(const lock_t *lock, const trx_t *trx)
/* BF-BF wait is a bug */
ut_error;
}
+# endif /* UNIV_DEBUG */
+
+/** check if lock timeout was for priority thread,
+as a side effect trigger lock monitor
+@param trx transaction owning the lock
+@return false for regular lock timeout */
+ATTRIBUTE_NOINLINE static bool wsrep_is_BF_lock_timeout(const trx_t &trx)
+{
+ ut_ad(trx.is_wsrep());
+
+ if (trx.error_state == DB_DEADLOCK || !srv_monitor_timer ||
+ !wsrep_thd_is_BF(trx.mysql_thd, false))
+ return false;
+
+ ib::info() << "WSREP: BF lock wait long for trx:" << ib::hex(trx.id)
+ << " query: " << wsrep_thd_query(trx.mysql_thd);
+ return true;
+}
#endif /* WITH_WSREP */
-#endif /* UNIV_DEBUG */
/*********************************************************************//**
Checks if a lock request for a new lock has to wait for request lock2.
@@ -704,14 +608,15 @@ lock_rec_has_to_wait(
index page: we know then that the lock
request is really for a 'gap' type lock */
{
- ut_ad(trx && lock2);
- ut_ad(lock_get_type_low(lock2) == LOCK_REC);
- ut_ad(lock_mutex_own());
+ ut_ad(trx);
+ ut_ad(!lock2->is_table());
+ ut_d(lock_sys.hash_get(type_mode).assert_locked(
+ lock2->un_member.rec_lock.page_id));
if (trx == lock2->trx
|| lock_mode_compatible(
static_cast<lock_mode>(LOCK_MODE_MASK & type_mode),
- lock_get_mode(lock2))) {
+ lock2->mode())) {
return false;
}
@@ -729,7 +634,7 @@ lock_rec_has_to_wait(
return false;
}
- if (!(type_mode & LOCK_INSERT_INTENTION) && lock_rec_get_gap(lock2)) {
+ if (!(type_mode & LOCK_INSERT_INTENTION) && lock2->is_gap()) {
/* Record lock (LOCK_ORDINARY or LOCK_REC_NOT_GAP
does not need to wait for a gap type lock */
@@ -737,7 +642,7 @@ lock_rec_has_to_wait(
return false;
}
- if ((type_mode & LOCK_GAP) && lock_rec_get_rec_not_gap(lock2)) {
+ if ((type_mode & LOCK_GAP) && lock2->is_record_not_gap()) {
/* Lock on gap does not need to wait for
a LOCK_REC_NOT_GAP type lock */
@@ -745,8 +650,7 @@ lock_rec_has_to_wait(
return false;
}
- if (lock_rec_get_insert_intention(lock2)) {
-
+ if (lock2->is_insert_intention()) {
/* No lock request needs to wait for an insert
intention lock to be removed. This is ok since our
rules allow conflicting locks on gaps. This eliminates
@@ -761,7 +665,8 @@ lock_rec_has_to_wait(
return false;
}
- if ((type_mode & LOCK_GAP || lock_rec_get_gap(lock2))
+#ifdef HAVE_REPLICATION
+ if ((type_mode & LOCK_GAP || lock2->is_gap())
&& !thd_need_ordering_with(trx->mysql_thd, lock2->trx->mysql_thd)) {
/* If the upper server layer has already decided on the
commit order between the transaction requesting the
@@ -785,6 +690,7 @@ lock_rec_has_to_wait(
return false;
}
+#endif /* HAVE_REPLICATION */
#ifdef WITH_WSREP
/* New lock request from a transaction is using unique key
@@ -792,21 +698,18 @@ lock_rec_has_to_wait(
(brute force). If conflicting transaction is also wsrep high
priority transaction we should avoid lock conflict because
ordering of these transactions is already decided and
- conflicting transaction will be later replayed. Note
- that thread holding conflicting lock can't be
- committed or rolled back while we hold
- lock_sys->mutex. */
+ conflicting transaction will be later replayed. */
if (trx->is_wsrep_UK_scan()
&& wsrep_thd_is_BF(lock2->trx->mysql_thd, false)) {
return false;
}
- /* If BF-BF conflict, we have to look at write set order */
- if (trx->is_wsrep()
- && (type_mode & LOCK_MODE_MASK) == LOCK_X
- && (lock2->type_mode & LOCK_MODE_MASK) == LOCK_X
- && wsrep_thd_order_before(trx->mysql_thd,
- lock2->trx->mysql_thd)) {
+ /* if BF-BF conflict, we have to look at write set order */
+ if (trx->is_wsrep() &&
+ (type_mode & LOCK_MODE_MASK) == LOCK_X &&
+ (lock2->type_mode & LOCK_MODE_MASK) == LOCK_X &&
+ wsrep_thd_order_before(trx->mysql_thd,
+ lock2->trx->mysql_thd)) {
return false;
}
@@ -835,16 +738,15 @@ lock_has_to_wait(
ut_ad(lock1 && lock2);
if (lock1->trx == lock2->trx
- || lock_mode_compatible(lock_get_mode(lock1),
- lock_get_mode(lock2))) {
+ || lock_mode_compatible(lock1->mode(), lock2->mode())) {
return false;
}
- if (lock_get_type_low(lock1) != LOCK_REC) {
+ if (lock1->is_table()) {
return true;
}
- ut_ad(lock_get_type_low(lock2) == LOCK_REC);
+ ut_ad(!lock2->is_table());
if (lock1->type_mode & (LOCK_PREDICATE | LOCK_PRDT_PAGE)) {
return lock_prdt_has_to_wait(lock1->trx, lock1->type_mode,
@@ -892,7 +794,7 @@ lock_rec_bitmap_reset(
{
ulint n_bytes;
- ut_ad(lock_get_type_low(lock) == LOCK_REC);
+ ut_ad(!lock->is_table());
/* Reset to zero the bitmap which resides immediately after the lock
struct */
@@ -916,7 +818,7 @@ lock_rec_copy(
{
ulint size;
- ut_ad(lock_get_type_low(lock) == LOCK_REC);
+ ut_ad(!lock->is_table());
size = sizeof(lock_t) + lock_rec_get_n_bits(lock) / 8;
@@ -932,22 +834,16 @@ lock_rec_get_prev(
const lock_t* in_lock,/*!< in: record lock */
ulint heap_no)/*!< in: heap number of the record */
{
- lock_t* lock;
- lock_t* found_lock = NULL;
+ ut_ad(!in_lock->is_table());
+ const page_id_t id{in_lock->un_member.rec_lock.page_id};
+ hash_cell_t *cell= lock_sys.hash_get(in_lock->type_mode).cell_get(id.fold());
- ut_ad(lock_mutex_own());
- ut_ad(lock_get_type_low(in_lock) == LOCK_REC);
+ for (lock_t *lock= lock_sys_t::get_first(*cell, id); lock != in_lock;
+ lock= lock_rec_get_next_on_page(lock))
+ if (lock_rec_get_nth_bit(lock, heap_no))
+ return lock;
- for (lock = lock_sys.get_first(*lock_hash_get(in_lock->type_mode),
- in_lock->un_member.rec_lock.page_id);
- lock != in_lock;
- lock = lock_rec_get_next_on_page(lock)) {
- if (lock_rec_get_nth_bit(lock, heap_no)) {
- found_lock = lock;
- }
- }
-
- return found_lock;
+ return nullptr;
}
/*============= FUNCTIONS FOR ANALYZING RECORD LOCK QUEUE ================*/
@@ -965,41 +861,27 @@ lock_rec_has_expl(
LOCK_REC_NOT_GAP, for a
supremum record we regard this
always a gap type request */
- const buf_block_t* block, /*!< in: buffer block containing
- the record */
+ const hash_cell_t& cell, /*!< in: lock hash table cell */
+ const page_id_t id, /*!< in: page identifier */
ulint heap_no,/*!< in: heap number of the record */
const trx_t* trx) /*!< in: transaction */
{
- lock_t* lock;
-
- ut_ad(lock_mutex_own());
- ut_ad((precise_mode & LOCK_MODE_MASK) == LOCK_S
- || (precise_mode & LOCK_MODE_MASK) == LOCK_X);
- ut_ad(!(precise_mode & LOCK_INSERT_INTENTION));
-
- for (lock = lock_rec_get_first(&lock_sys.rec_hash, block, heap_no);
- lock != NULL;
- lock = lock_rec_get_next(heap_no, lock)) {
+ ut_ad((precise_mode & LOCK_MODE_MASK) == LOCK_S
+ || (precise_mode & LOCK_MODE_MASK) == LOCK_X);
+ ut_ad(!(precise_mode & LOCK_INSERT_INTENTION));
- if (lock->trx == trx
- && !lock_rec_get_insert_intention(lock)
- && lock_mode_stronger_or_eq(
- lock_get_mode(lock),
- static_cast<lock_mode>(
- precise_mode & LOCK_MODE_MASK))
- && !lock_get_wait(lock)
- && (!lock_rec_get_rec_not_gap(lock)
- || (precise_mode & LOCK_REC_NOT_GAP)
- || heap_no == PAGE_HEAP_NO_SUPREMUM)
- && (!lock_rec_get_gap(lock)
- || (precise_mode & LOCK_GAP)
- || heap_no == PAGE_HEAP_NO_SUPREMUM)) {
+ for (lock_t *lock= lock_sys_t::get_first(cell, id, heap_no); lock;
+ lock= lock_rec_get_next(heap_no, lock))
+ if (lock->trx == trx &&
+ !(lock->type_mode & (LOCK_WAIT | LOCK_INSERT_INTENTION)) &&
+ (!((LOCK_REC_NOT_GAP | LOCK_GAP) & lock->type_mode) ||
+ heap_no == PAGE_HEAP_NO_SUPREMUM ||
+ ((LOCK_REC_NOT_GAP | LOCK_GAP) & precise_mode & lock->type_mode)) &&
+ lock_mode_stronger_or_eq(lock->mode(), static_cast<lock_mode>
+ (precise_mode & LOCK_MODE_MASK)))
+ return lock;
- return(lock);
- }
- }
-
- return(NULL);
+ return nullptr;
}
#ifdef UNIV_DEBUG
@@ -1011,8 +893,8 @@ lock_t*
lock_rec_other_has_expl_req(
/*========================*/
lock_mode mode, /*!< in: LOCK_S or LOCK_X */
- const buf_block_t* block, /*!< in: buffer block containing
- the record */
+ const hash_cell_t& cell, /*!< in: lock hash table cell */
+ const page_id_t id, /*!< in: page identifier */
bool wait, /*!< in: whether also waiting locks
are taken into account */
ulint heap_no,/*!< in: heap number of the record */
@@ -1020,8 +902,6 @@ lock_rec_other_has_expl_req(
requests by all transactions
are taken into account */
{
-
- ut_ad(lock_mutex_own());
ut_ad(mode == LOCK_X || mode == LOCK_S);
/* Only GAP lock can be on SUPREMUM, and we are not looking for
@@ -1030,15 +910,12 @@ lock_rec_other_has_expl_req(
return(NULL);
}
- for (lock_t* lock = lock_rec_get_first(&lock_sys.rec_hash,
- block, heap_no);
- lock != NULL;
- lock = lock_rec_get_next(heap_no, lock)) {
-
+ for (lock_t* lock = lock_sys_t::get_first(cell, id, heap_no);
+ lock; lock = lock_rec_get_next(heap_no, lock)) {
if (lock->trx != trx
- && !lock_rec_get_gap(lock)
- && (wait || !lock_get_wait(lock))
- && lock_mode_stronger_or_eq(lock_get_mode(lock), mode)) {
+ && !lock->is_gap()
+ && (!lock->is_waiting() || wait)
+ && lock_mode_stronger_or_eq(lock->mode(), mode)) {
return(lock);
}
@@ -1049,74 +926,113 @@ lock_rec_other_has_expl_req(
#endif /* UNIV_DEBUG */
#ifdef WITH_WSREP
-static void wsrep_kill_victim(const trx_t * const trx, const lock_t *lock)
+void lock_wait_wsrep_kill(trx_t *bf_trx, ulong thd_id, trx_id_t trx_id);
+
+/** Kill the holders of conflicting locks.
+@param trx brute-force applier transaction running in the current thread */
+ATTRIBUTE_COLD ATTRIBUTE_NOINLINE
+static void lock_wait_wsrep(trx_t *trx)
{
- ut_ad(lock_mutex_own());
- ut_ad(trx->is_wsrep());
- trx_t* lock_trx = lock->trx;
- ut_ad(trx_mutex_own(lock_trx));
- ut_ad(lock_trx != trx);
+ DBUG_ASSERT(wsrep_on(trx->mysql_thd));
+ if (!wsrep_thd_is_BF(trx->mysql_thd, false))
+ return;
- if (!wsrep_thd_is_BF(trx->mysql_thd, FALSE))
- return;
+ std::set<trx_t*> victims;
- if (lock_trx->state == TRX_STATE_COMMITTED_IN_MEMORY
- || lock_trx->lock.was_chosen_as_deadlock_victim)
- return;
-
- if (!wsrep_thd_is_BF(lock_trx->mysql_thd, FALSE)
- || wsrep_thd_order_before(trx->mysql_thd, lock_trx->mysql_thd)) {
- if (lock_trx->lock.que_state == TRX_QUE_LOCK_WAIT) {
- if (UNIV_UNLIKELY(wsrep_debug))
- WSREP_INFO("BF victim waiting");
- /* cannot release lock, until our lock
- is in the queue*/
- } else {
- wsrep_innobase_kill_one_trx(trx->mysql_thd,
- lock_trx, true);
- }
- }
+ lock_sys.wr_lock(SRW_LOCK_CALL);
+ mysql_mutex_lock(&lock_sys.wait_mutex);
+
+ const lock_t *wait_lock= trx->lock.wait_lock;
+ if (!wait_lock)
+ {
+func_exit:
+ lock_sys.wr_unlock();
+ mysql_mutex_unlock(&lock_sys.wait_mutex);
+ return;
+ }
+
+ if (wait_lock->is_table())
+ {
+ dict_table_t *table= wait_lock->un_member.tab_lock.table;
+ for (lock_t *lock= UT_LIST_GET_FIRST(table->locks); lock;
+ lock= UT_LIST_GET_NEXT(un_member.tab_lock.locks, lock))
+ /* if victim has also BF status, but has earlier seqno, we have to wait */
+ if (lock->trx != trx &&
+ !(wsrep_thd_is_BF(lock->trx->mysql_thd, false) &&
+ wsrep_thd_order_before(lock->trx->mysql_thd, trx->mysql_thd)))
+ {
+ victims.emplace(lock->trx);
+ }
+ }
+ else
+ {
+ const page_id_t id{wait_lock->un_member.rec_lock.page_id};
+ hash_cell_t &cell= *(wait_lock->type_mode & LOCK_PREDICATE
+ ? lock_sys.prdt_hash : lock_sys.rec_hash).cell_get
+ (id.fold());
+ if (lock_t *lock= lock_sys_t::get_first(cell, id))
+ {
+ const ulint heap_no= lock_rec_find_set_bit(wait_lock);
+ if (!lock_rec_get_nth_bit(lock, heap_no))
+ lock= lock_rec_get_next(heap_no, lock);
+ do
+ /* if victim has also BF status, but has earlier seqno, we have to wait */
+ if (lock->trx != trx &&
+ !(wsrep_thd_is_BF(lock->trx->mysql_thd, false) &&
+ wsrep_thd_order_before(lock->trx->mysql_thd, trx->mysql_thd)))
+ {
+ victims.emplace(lock->trx);
+ }
+ while ((lock= lock_rec_get_next(heap_no, lock)));
+ }
+ }
+
+ if (victims.empty())
+ goto func_exit;
+
+ std::vector<std::pair<ulong,trx_id_t>> victim_id;
+ for (trx_t *v : victims)
+ victim_id.emplace_back(std::pair<ulong,trx_id_t>
+ {thd_get_thread_id(v->mysql_thd), v->id});
+
+ DBUG_EXECUTE_IF("sync.before_wsrep_thd_abort",
+ {
+ const char act[]=
+ "now SIGNAL sync.before_wsrep_thd_abort_reached "
+ "WAIT_FOR signal.before_wsrep_thd_abort";
+ DBUG_ASSERT(!debug_sync_set_action(trx->mysql_thd,
+ STRING_WITH_LEN(act)));
+ };);
+
+ lock_sys.wr_unlock();
+ mysql_mutex_unlock(&lock_sys.wait_mutex);
+
+ for (const auto &v : victim_id)
+ lock_wait_wsrep_kill(trx, v.first, v.second);
}
#endif /* WITH_WSREP */
/*********************************************************************//**
Checks if some other transaction has a conflicting explicit lock request
in the queue, so that we have to wait.
-@return lock or NULL */
-static
-lock_t*
-lock_rec_other_has_conflicting(
-/*===========================*/
- unsigned mode, /*!< in: LOCK_S or LOCK_X,
- possibly ORed to LOCK_GAP or
- LOC_REC_NOT_GAP,
- LOCK_INSERT_INTENTION */
- const buf_block_t* block, /*!< in: buffer block containing
- the record */
- ulint heap_no,/*!< in: heap number of the record */
- const trx_t* trx) /*!< in: our transaction */
+@param[in] mode LOCK_S or LOCK_X, possibly ORed to LOCK_GAP or LOC_REC_NOT_GAP,
+LOCK_INSERT_INTENTION
+@param[in] cell lock hash table cell
+@param[in] id page identifier
+@param[in] heap_no heap number of the record
+@param[in] trx our transaction
+@return conflicting lock and the flag which indicated if conflicting locks
+which wait for the current transaction were ignored */
+static lock_t *lock_rec_other_has_conflicting(unsigned mode,
+ const hash_cell_t &cell,
+ const page_id_t id,
+ ulint heap_no, const trx_t *trx)
{
- lock_t* lock;
-
- ut_ad(lock_mutex_own());
-
bool is_supremum = (heap_no == PAGE_HEAP_NO_SUPREMUM);
- for (lock = lock_rec_get_first(&lock_sys.rec_hash, block, heap_no);
- lock != NULL;
- lock = lock_rec_get_next(heap_no, lock)) {
-
+ for (lock_t* lock = lock_sys_t::get_first(cell, id, heap_no);
+ lock; lock = lock_rec_get_next(heap_no, lock)) {
if (lock_rec_has_to_wait(trx, mode, lock, is_supremum)) {
-#ifdef WITH_WSREP
- if (trx->is_wsrep()) {
- trx_mutex_enter(lock->trx);
- /* Below function will roll back either trx
- or lock->trx depending on priority of the
- transaction. */
- wsrep_kill_victim(const_cast<trx_t*>(trx), lock);
- trx_mutex_exit(lock->trx);
- }
-#endif /* WITH_WSREP */
return(lock);
}
}
@@ -1140,59 +1056,30 @@ lock_sec_rec_some_has_impl(
dict_index_t* index, /*!< in: secondary index */
const rec_offs* offsets)/*!< in: rec_get_offsets(rec, index) */
{
- trx_t* trx;
- trx_id_t max_trx_id;
- const page_t* page = page_align(rec);
-
- ut_ad(!lock_mutex_own());
- ut_ad(!dict_index_is_clust(index));
- ut_ad(page_rec_is_user_rec(rec));
- ut_ad(rec_offs_validate(rec, index, offsets));
- ut_ad(!rec_is_metadata(rec, *index));
-
- max_trx_id = page_get_max_trx_id(page);
-
- /* Some transaction may have an implicit x-lock on the record only
- if the max trx id for the page >= min trx id for the trx list, or
- database recovery is running. */
-
- if (max_trx_id < trx_sys.get_min_trx_id()) {
-
- trx = 0;
-
- } else if (!lock_check_trx_id_sanity(max_trx_id, rec, index, offsets)) {
-
- /* The page is corrupt: try to avoid a crash by returning 0 */
- trx = 0;
-
- /* In this case it is possible that some transaction has an implicit
- x-lock. We have to look in the clustered index. */
-
- } else {
- trx = row_vers_impl_x_locked(caller_trx, rec, index, offsets);
- }
+ lock_sys.assert_unlocked();
+ ut_ad(!dict_index_is_clust(index));
+ ut_ad(page_rec_is_user_rec(rec));
+ ut_ad(rec_offs_validate(rec, index, offsets));
+ ut_ad(!rec_is_metadata(rec, *index));
- return(trx);
-}
+ const trx_id_t max_trx_id= page_get_max_trx_id(page_align(rec));
-/*********************************************************************//**
-Return approximate number or record locks (bits set in the bitmap) for
-this transaction. Since delete-marked records may be removed, the
-record count will not be precise.
-The caller must be holding lock_sys.mutex. */
-ulint
-lock_number_of_rows_locked(
-/*=======================*/
- const trx_lock_t* trx_lock) /*!< in: transaction locks */
-{
- ut_ad(lock_mutex_own());
+ /* Note: It is possible to have caller_trx->id == 0 in a locking read
+ if caller_trx has not modified any persistent tables. */
+ if (!trx_sys.find_same_or_older(caller_trx, max_trx_id) ||
+ !lock_check_trx_id_sanity(max_trx_id, rec, index, offsets))
+ return nullptr;
- return(trx_lock->n_rec_locks);
+ /* We checked above that some active (or XA PREPARE) transaction exists
+ that is older than PAGE_MAX_TRX_ID. That is, some transaction may be
+ holding an implicit lock on the record. We have to look up the
+ clustered index record to find if it is (or was) the case. */
+ return row_vers_impl_x_locked(caller_trx, rec, index, offsets);
}
/*********************************************************************//**
Return the number of table locks for a transaction.
-The caller must be holding lock_sys.mutex. */
+The caller must be holding lock_sys.latch. */
ulint
lock_number_of_tables_locked(
/*=========================*/
@@ -1201,13 +1088,13 @@ lock_number_of_tables_locked(
const lock_t* lock;
ulint n_tables = 0;
- ut_ad(lock_mutex_own());
+ lock_sys.assert_locked();
for (lock = UT_LIST_GET_FIRST(trx_lock->trx_locks);
lock != NULL;
lock = UT_LIST_GET_NEXT(trx_locks, lock)) {
- if (lock_get_type_low(lock) == LOCK_TABLE) {
+ if (lock->is_table()) {
n_tables++;
}
}
@@ -1217,35 +1104,21 @@ lock_number_of_tables_locked(
/*============== RECORD LOCK CREATION AND QUEUE MANAGEMENT =============*/
-#ifdef WITH_WSREP
-ATTRIBUTE_COLD
-static
-void
-wsrep_print_wait_locks(
-/*===================*/
- lock_t* c_lock) /* conflicting lock to print */
+/** Reset the wait status of a lock.
+@param[in,out] lock lock that was possibly being waited for */
+static void lock_reset_lock_and_trx_wait(lock_t *lock)
{
- if (c_lock->trx->lock.wait_lock != c_lock) {
- mtr_t mtr;
- ib::info() << "WSREP: c_lock != wait lock";
- ib::info() << " SQL: "
- << wsrep_thd_query(c_lock->trx->mysql_thd);
-
- if (lock_get_type_low(c_lock) & LOCK_TABLE) {
- lock_table_print(stderr, c_lock);
- } else {
- lock_rec_print(stderr, c_lock, mtr);
- }
-
- if (lock_get_type_low(c_lock->trx->lock.wait_lock) & LOCK_TABLE) {
- lock_table_print(stderr, c_lock->trx->lock.wait_lock);
- } else {
- lock_rec_print(stderr, c_lock->trx->lock.wait_lock,
- mtr);
- }
- }
+ lock_sys.assert_locked(*lock);
+ mysql_mutex_assert_owner(&lock_sys.wait_mutex);
+ trx_t *trx= lock->trx;
+ ut_ad(lock->is_waiting());
+ ut_ad(!trx->lock.wait_lock || trx->lock.wait_lock == lock);
+ if (trx_t *wait_trx= trx->lock.wait_trx)
+ Deadlock::to_check.erase(wait_trx);
+ trx->lock.wait_lock= nullptr;
+ trx->lock.wait_trx= nullptr;
+ lock->type_mode&= ~LOCK_WAIT;
}
-#endif /* WITH_WSREP */
#ifdef UNIV_DEBUG
/** Check transaction state */
@@ -1262,8 +1135,8 @@ static void check_trx_state(const trx_t *trx)
/** Create a new record lock and inserts it to the lock queue,
without checking for deadlocks or conflicts.
-@param[in] type_mode lock mode and wait flag; type will be replaced
- with LOCK_REC
+@param[in] c_lock conflicting lock
+@param[in] type_mode lock mode and wait flag
@param[in] page_id index page number
@param[in] page R-tree index page, or NULL
@param[in] heap_no record heap number in the index page
@@ -1273,10 +1146,7 @@ without checking for deadlocks or conflicts.
@return created lock */
lock_t*
lock_rec_create_low(
-#ifdef WITH_WSREP
- lock_t* c_lock, /*!< conflicting lock */
- que_thr_t* thr, /*!< thread owning trx */
-#endif
+ lock_t* c_lock,
unsigned type_mode,
const page_id_t page_id,
const page_t* page,
@@ -1286,21 +1156,14 @@ lock_rec_create_low(
bool holds_trx_mutex)
{
lock_t* lock;
- ulint n_bits;
ulint n_bytes;
- ut_ad(lock_mutex_own());
- ut_ad(holds_trx_mutex == trx_mutex_own(trx));
+ ut_d(lock_sys.hash_get(type_mode).assert_locked(page_id));
+ ut_ad(xtest() || holds_trx_mutex == trx->mutex_is_owner());
ut_ad(dict_index_is_clust(index) || !dict_index_is_online_ddl(index));
-
-#ifdef UNIV_DEBUG
- /* Non-locking autocommit read-only transactions should not set
- any locks. See comment in trx_set_rw_mode explaining why this
- conditional check is required in debug code. */
- if (holds_trx_mutex) {
- check_trx_state(trx);
- }
-#endif /* UNIV_DEBUG */
+ ut_ad(!(type_mode & LOCK_TABLE));
+ ut_ad(trx->state != TRX_STATE_NOT_STARTED);
+ ut_ad(!trx->is_autocommit_non_locking());
/* If rec is the supremum record, then we reset the gap and
LOCK_REC_NOT_GAP bits, as all locks on the supremum are
@@ -1312,9 +1175,7 @@ lock_rec_create_low(
}
if (UNIV_LIKELY(!(type_mode & (LOCK_PREDICATE | LOCK_PRDT_PAGE)))) {
- /* Make lock bitmap bigger by a safety margin */
- n_bits = page_dir_get_n_heap(page) + LOCK_PAGE_BITMAP_MARGIN;
- n_bytes = 1 + n_bits / 8;
+ n_bytes = (page_dir_get_n_heap(page) + 7) / 8;
} else {
ut_ad(heap_no == PRDT_HEAPNO);
@@ -1335,6 +1196,12 @@ lock_rec_create_low(
}
}
+ if (!holds_trx_mutex) {
+ trx->mutex_lock();
+ }
+ ut_ad(trx->mutex_is_owner());
+ ut_ad(trx->state != TRX_STATE_NOT_STARTED);
+
if (trx->lock.rec_cached >= UT_ARR_SIZE(trx->lock.rec_pool)
|| sizeof *lock + n_bytes > sizeof *trx->lock.rec_pool) {
lock = static_cast<lock_t*>(
@@ -1345,7 +1212,7 @@ lock_rec_create_low(
}
lock->trx = trx;
- lock->type_mode = (type_mode & unsigned(~LOCK_TYPE_MASK)) | LOCK_REC;
+ lock->type_mode = type_mode;
lock->index = index;
lock->un_member.rec_lock.page_id = page_id;
@@ -1358,89 +1225,26 @@ lock_rec_create_low(
lock_rec_bitmap_reset(lock);
lock_rec_set_nth_bit(lock, heap_no);
index->table->n_rec_locks++;
- ut_ad(index->table->get_ref_count() > 0 || !index->table->can_be_evicted);
-
-#ifdef WITH_WSREP
- if (c_lock && trx->is_wsrep()
- && wsrep_thd_is_BF(trx->mysql_thd, FALSE)) {
- lock_t *hash = (lock_t *)c_lock->hash;
- lock_t *prev = NULL;
-
- while (hash && wsrep_thd_is_BF(hash->trx->mysql_thd, FALSE)
- && wsrep_thd_order_before(hash->trx->mysql_thd,
- trx->mysql_thd)) {
- prev = hash;
- hash = (lock_t *)hash->hash;
- }
- lock->hash = hash;
- if (prev) {
- prev->hash = lock;
- } else {
- c_lock->hash = lock;
- }
- /*
- * delayed conflict resolution '...kill_one_trx' was not called,
- * if victim was waiting for some other lock
- */
- trx_mutex_enter(c_lock->trx);
- if (c_lock->trx->lock.que_state == TRX_QUE_LOCK_WAIT) {
-
- c_lock->trx->lock.was_chosen_as_deadlock_victim = TRUE;
-
- if (UNIV_UNLIKELY(wsrep_debug)) {
- wsrep_print_wait_locks(c_lock);
- }
-
- trx->lock.que_state = TRX_QUE_LOCK_WAIT;
- lock_set_lock_and_trx_wait(lock, trx);
- UT_LIST_ADD_LAST(trx->lock.trx_locks, lock);
+ ut_ad(index->table->get_ref_count() || !index->table->can_be_evicted);
- trx->lock.wait_thr = thr;
- thr->state = QUE_THR_LOCK_WAIT;
+ const auto lock_hash = &lock_sys.hash_get(type_mode);
+ lock_hash->cell_get(page_id.fold())->append(*lock, &lock_t::hash);
- /* have to release trx mutex for the duration of
- victim lock release. This will eventually call
- lock_grant, which wants to grant trx mutex again
- */
- if (holds_trx_mutex) {
- trx_mutex_exit(trx);
- }
- lock_cancel_waiting_and_release(
- c_lock->trx->lock.wait_lock);
-
- if (holds_trx_mutex) {
- trx_mutex_enter(trx);
- }
-
- trx_mutex_exit(c_lock->trx);
-
- /* have to bail out here to avoid lock_set_lock... */
- return(lock);
- }
- trx_mutex_exit(c_lock->trx);
- } else
-#endif /* WITH_WSREP */
- if (!(type_mode & (LOCK_WAIT | LOCK_PREDICATE | LOCK_PRDT_PAGE))
- && innodb_lock_schedule_algorithm
- == INNODB_LOCK_SCHEDULE_ALGORITHM_VATS
- && !thd_is_replication_slave_thread(trx->mysql_thd)) {
- HASH_PREPEND(lock_t, hash, &lock_sys.rec_hash,
- page_id.fold(), lock);
- } else {
- HASH_INSERT(lock_t, hash, lock_hash_get(type_mode),
- page_id.fold(), lock);
- }
-
- if (!holds_trx_mutex) {
- trx_mutex_enter(trx);
- }
- ut_ad(trx_mutex_own(trx));
if (type_mode & LOCK_WAIT) {
- lock_set_lock_and_trx_wait(lock, trx);
+ if (trx->lock.wait_trx) {
+ ut_ad(!c_lock || trx->lock.wait_trx == c_lock->trx);
+ ut_ad(trx->lock.wait_lock);
+ ut_ad((*trx->lock.wait_lock).trx == trx);
+ } else {
+ ut_ad(c_lock);
+ trx->lock.wait_trx = c_lock->trx;
+ ut_ad(!trx->lock.wait_lock);
+ }
+ trx->lock.wait_lock = lock;
}
UT_LIST_ADD_LAST(trx->lock.trx_locks, lock);
if (!holds_trx_mutex) {
- trx_mutex_exit(trx);
+ trx->mutex_unlock();
}
MONITOR_INC(MONITOR_RECLOCK_CREATED);
MONITOR_INC(MONITOR_NUM_RECLOCK);
@@ -1448,139 +1252,6 @@ lock_rec_create_low(
return lock;
}
-/*********************************************************************//**
-Check if lock1 has higher priority than lock2.
-NULL has lowest priority.
-If neither of them is wait lock, the first one has higher priority.
-If only one of them is a wait lock, it has lower priority.
-If either is a high priority transaction, the lock has higher priority.
-Otherwise, the one with an older transaction has higher priority.
-@returns true if lock1 has higher priority, false otherwise. */
-static bool has_higher_priority(lock_t *lock1, lock_t *lock2)
-{
- if (lock1 == NULL) {
- return false;
- } else if (lock2 == NULL) {
- return true;
- }
- // Granted locks has higher priority.
- if (!lock_get_wait(lock1)) {
- return true;
- } else if (!lock_get_wait(lock2)) {
- return false;
- }
- return lock1->trx->start_time_micro <= lock2->trx->start_time_micro;
-}
-
-/*********************************************************************//**
-Insert a lock to the hash list according to the mode (whether it is a wait
-lock) and the age of the transaction the it is associated with.
-If the lock is not a wait lock, insert it to the head of the hash list.
-Otherwise, insert it to the middle of the wait locks according to the age of
-the transaciton. */
-static
-dberr_t
-lock_rec_insert_by_trx_age(
- lock_t *in_lock) /*!< in: lock to be insert */{
- lock_t* node;
- lock_t* next;
- hash_table_t* hash;
- hash_cell_t* cell;
-
- ut_ad(!in_lock->trx->is_wsrep());
- const page_id_t page_id(in_lock->un_member.rec_lock.page_id);
- hash = lock_hash_get(in_lock->type_mode);
- cell = &hash->array[hash->calc_hash(page_id.fold())];
-
- node = (lock_t *) cell->node;
- // If in_lock is not a wait lock, we insert it to the head of the list.
- if (node == NULL || !lock_get_wait(in_lock) || has_higher_priority(in_lock, node)) {
- cell->node = in_lock;
- in_lock->hash = node;
- if (lock_get_wait(in_lock)) {
- lock_grant_have_trx_mutex(in_lock);
- return DB_SUCCESS_LOCKED_REC;
- }
- return DB_SUCCESS;
- }
- while (node != NULL && has_higher_priority((lock_t *) node->hash,
- in_lock)) {
- node = (lock_t *) node->hash;
- }
- next = (lock_t *) node->hash;
- node->hash = in_lock;
- in_lock->hash = next;
-
- if (lock_get_wait(in_lock) && !lock_rec_has_to_wait_in_queue(in_lock)) {
- lock_grant_have_trx_mutex(in_lock);
- if (cell->node != in_lock) {
- // Move it to the front of the queue
- node->hash = in_lock->hash;
- next = (lock_t *) cell->node;
- cell->node = in_lock;
- in_lock->hash = next;
- }
- return DB_SUCCESS_LOCKED_REC;
- }
-
- return DB_SUCCESS;
-}
-
-#ifdef UNIV_DEBUG
-static
-bool
-lock_queue_validate(
- const lock_t *in_lock) /*!< in: lock whose hash list is to be validated */
-{
- hash_table_t* hash;
- hash_cell_t* cell;
- lock_t* next;
- bool wait_lock __attribute__((unused))= false;
-
- if (in_lock == NULL) {
- return true;
- }
-
- const page_id_t page_id(in_lock->un_member.rec_lock.page_id);
- hash = lock_hash_get(in_lock->type_mode);
- cell = &hash->array[hash->calc_hash(page_id.fold())];
- next = (lock_t *) cell->node;
- while (next != NULL) {
- // If this is a granted lock, check that there's no wait lock before it.
- if (!lock_get_wait(next)) {
- ut_ad(!wait_lock);
- } else {
- wait_lock = true;
- }
- next = next->hash;
- }
- return true;
-}
-#endif /* UNIV_DEBUG */
-
-static
-void
-lock_rec_insert_to_head(
- lock_t *in_lock, /*!< in: lock to be insert */
- ulint rec_fold) /*!< in: rec_fold of the page */
-{
- hash_table_t* hash;
- hash_cell_t* cell;
- lock_t* node;
-
- if (in_lock == NULL) {
- return;
- }
-
- hash = lock_hash_get(in_lock->type_mode);
- cell = &hash->array[hash->calc_hash(rec_fold)];
- node = (lock_t *) cell->node;
- if (node != in_lock) {
- cell->node = in_lock;
- in_lock->hash = node;
- }
-}
-
/** Enqueue a waiting request for a lock which cannot be granted immediately.
Check for deadlocks.
@param[in] type_mode the requested lock mode (LOCK_S or LOCK_X)
@@ -1590,49 +1261,40 @@ Check for deadlocks.
waiting lock request is set
when performing an insert of
an index record
-@param[in] block leaf page in the index
+@param[in] id page identifier
+@param[in] page leaf page in the index
@param[in] heap_no record heap number in the block
@param[in] index index tree
@param[in,out] thr query thread
@param[in] prdt minimum bounding box (spatial index)
@retval DB_LOCK_WAIT if the waiting lock was enqueued
-@retval DB_DEADLOCK if this transaction was chosen as the victim
-@retval DB_SUCCESS_LOCKED_REC if the other transaction was chosen as a victim
- (or it happened to commit) */
+@retval DB_DEADLOCK if this transaction was chosen as the victim */
dberr_t
lock_rec_enqueue_waiting(
-#ifdef WITH_WSREP
- lock_t* c_lock, /*!< conflicting lock */
-#endif
+ lock_t* c_lock,
unsigned type_mode,
- const buf_block_t* block,
+ const page_id_t id,
+ const page_t* page,
ulint heap_no,
dict_index_t* index,
que_thr_t* thr,
lock_prdt_t* prdt)
{
- ut_ad(lock_mutex_own());
+ ut_d(lock_sys.hash_get(type_mode).assert_locked(id));
ut_ad(!srv_read_only_mode);
ut_ad(dict_index_is_clust(index) || !dict_index_is_online_ddl(index));
trx_t* trx = thr_get_trx(thr);
-
- ut_ad(trx_mutex_own(trx));
- ut_a(!que_thr_stop(thr));
-
- switch (trx_get_dict_operation(trx)) {
- case TRX_DICT_OP_NONE:
- break;
- case TRX_DICT_OP_TABLE:
- case TRX_DICT_OP_INDEX:
- ib::error() << "A record lock wait happens in a dictionary"
- " operation. index "
- << index->name
- << " of table "
- << index->table->name
- << ". " << BUG_REPORT_MSG;
- ut_ad(0);
- }
+ ut_ad(xtest() || trx->mutex_is_owner());
+ ut_ad(!trx->dict_operation_lock_mode);
+ /* Apart from Galera, only transactions that have waiting lock can be
+ chosen as deadlock victim. Only one lock can be waited for at a time,
+ and a transaction is associated with a single thread. That is why there
+ must not be waiting lock requests if the transaction is deadlock victim
+ and it is not WSREP. Galera transaction abort can be invoked from MDL
+ acquisition code when the transaction does not have waiting record
+ lock, that's why we check only deadlock victim bit here. */
+ ut_ad(!(trx->lock.was_chosen_as_deadlock_victim & 1));
if (trx->mysql_thd && thd_lock_wait_timeout(trx->mysql_thd) == 0) {
trx->error_state = DB_LOCK_WAIT_TIMEOUT;
@@ -1641,43 +1303,15 @@ lock_rec_enqueue_waiting(
/* Enqueue the lock request that will wait to be granted, note that
we already own the trx mutex. */
- lock_t* lock = lock_rec_create(
-#ifdef WITH_WSREP
- c_lock, thr,
-#endif
- type_mode | LOCK_WAIT, block, heap_no, index, trx, TRUE);
+ lock_t* lock = lock_rec_create_low(
+ c_lock,
+ type_mode | LOCK_WAIT, id, page, heap_no, index, trx, true);
if (prdt && type_mode & LOCK_PREDICATE) {
lock_prdt_set_prdt(lock, prdt);
}
- if (ut_d(const trx_t* victim =)
- DeadlockChecker::check_and_resolve(lock, trx)) {
- ut_ad(victim == trx);
- lock_reset_lock_and_trx_wait(lock);
- lock_rec_reset_nth_bit(lock, heap_no);
- return DB_DEADLOCK;
- }
-
- if (!trx->lock.wait_lock) {
- /* If there was a deadlock but we chose another
- transaction as a victim, it is possible that we
- already have the lock now granted! */
-#ifdef WITH_WSREP
- if (UNIV_UNLIKELY(wsrep_debug)) {
- ib::info() << "WSREP: BF thread got lock granted early, ID " << ib::hex(trx->id)
- << " query: " << wsrep_thd_query(trx->mysql_thd);
- }
-#endif
- return DB_SUCCESS_LOCKED_REC;
- }
-
- trx->lock.que_state = TRX_QUE_LOCK_WAIT;
-
- trx->lock.was_chosen_as_deadlock_victim = false;
- trx->lock.wait_started = time(NULL);
-
- ut_a(que_thr_stop(thr));
+ trx->lock.wait_thr = thr;
DBUG_LOG("ib_lock", "trx " << ib::hex(trx->id)
<< " waits for lock in index " << index->name
@@ -1685,18 +1319,6 @@ lock_rec_enqueue_waiting(
MONITOR_INC(MONITOR_LOCKREC_WAIT);
- if (innodb_lock_schedule_algorithm
- == INNODB_LOCK_SCHEDULE_ALGORITHM_VATS
- && !prdt
- && !thd_is_replication_slave_thread(lock->trx->mysql_thd)) {
- HASH_DELETE(lock_t, hash, &lock_sys.rec_hash,
- lock_rec_lock_fold(lock), lock);
- dberr_t res = lock_rec_insert_by_trx_age(lock);
- if (res != DB_SUCCESS) {
- return res;
- }
- }
-
return DB_LOCK_WAIT;
}
@@ -1713,7 +1335,7 @@ lock_rec_find_similar_on_page(
lock_t* lock, /*!< in: lock_sys.get_first() */
const trx_t* trx) /*!< in: transaction */
{
- ut_ad(lock_mutex_own());
+ lock_sys.rec_hash.assert_locked(lock->un_member.rec_lock.page_id);
for (/* No op */;
lock != NULL;
@@ -1737,28 +1359,26 @@ on the record, and the request to be added is not a waiting request, we
can reuse a suitable record lock object already existing on the same page,
just setting the appropriate bit in its bitmap. This is a low-level function
which does NOT check for deadlocks or lock compatibility!
-@return lock where the bit was set */
-static
-void
-lock_rec_add_to_queue(
-/*==================*/
- unsigned type_mode,/*!< in: lock mode, wait, gap
- etc. flags; type is ignored
- and replaced by LOCK_REC */
- const buf_block_t* block, /*!< in: buffer block containing
- the record */
- ulint heap_no,/*!< in: heap number of the record */
- dict_index_t* index, /*!< in: index of record */
- trx_t* trx, /*!< in/out: transaction */
- bool caller_owns_trx_mutex)
- /*!< in: TRUE if caller owns the
- transaction mutex */
-{
-#ifdef UNIV_DEBUG
- ut_ad(lock_mutex_own());
- ut_ad(caller_owns_trx_mutex == trx_mutex_own(trx));
- ut_ad(dict_index_is_clust(index)
+@param[in] type_mode lock mode, wait, gap etc. flags
+@param[in,out] cell first hash table cell
+@param[in] id page identifier
+@param[in] page buffer block containing the record
+@param[in] heap_no heap number of the record
+@param[in] index index of record
+@param[in,out] trx transaction
+@param[in] caller_owns_trx_mutex TRUE if caller owns the transaction mutex */
+TRANSACTIONAL_TARGET
+static void lock_rec_add_to_queue(unsigned type_mode, hash_cell_t &cell,
+ const page_id_t id, const page_t *page,
+ ulint heap_no, dict_index_t *index,
+ trx_t *trx, bool caller_owns_trx_mutex)
+{
+ ut_d(lock_sys.hash_get(type_mode).assert_locked(id));
+ ut_ad(xtest() || caller_owns_trx_mutex == trx->mutex_is_owner());
+ ut_ad(index->is_primary()
|| dict_index_get_online_status(index) != ONLINE_INDEX_CREATION);
+ ut_ad(!(type_mode & LOCK_TABLE));
+#ifdef UNIV_DEBUG
switch (type_mode & LOCK_MODE_MASK) {
case LOCK_X:
case LOCK_S:
@@ -1773,7 +1393,7 @@ lock_rec_add_to_queue(
: LOCK_S;
const lock_t* other_lock
= lock_rec_other_has_expl_req(
- mode, block, false, heap_no, trx);
+ mode, cell, id, false, heap_no, trx);
#ifdef WITH_WSREP
if (UNIV_LIKELY_NULL(other_lock) && trx->is_wsrep()) {
/* Only BF transaction may be granted lock
@@ -1791,8 +1411,6 @@ lock_rec_add_to_queue(
}
#endif /* UNIV_DEBUG */
- type_mode |= LOCK_REC;
-
/* If rec is the supremum record, then we can reset the gap bit, as
all locks on the supremum are automatically of the gap type, and we
try to avoid unnecessary memory consumption of a new record lock
@@ -1807,45 +1425,48 @@ lock_rec_add_to_queue(
type_mode &= ~(LOCK_GAP | LOCK_REC_NOT_GAP);
}
- lock_t* lock;
- lock_t* first_lock;
-
- /* Look for a waiting lock request on the same record or on a gap */
-
- for (first_lock = lock = lock_sys.get_first(*lock_hash_get(type_mode),
- block->page.id());
- lock != NULL;
- lock = lock_rec_get_next_on_page(lock)) {
-
- if (lock_get_wait(lock)
- && lock_rec_get_nth_bit(lock, heap_no)) {
-
- break;
+ if (type_mode & LOCK_WAIT) {
+ goto create;
+ } else if (lock_t *first_lock = lock_sys_t::get_first(cell, id)) {
+ for (lock_t* lock = first_lock;;) {
+ if (lock->is_waiting()
+ && lock_rec_get_nth_bit(lock, heap_no)) {
+ goto create;
+ }
+ if (!(lock = lock_rec_get_next_on_page(lock))) {
+ break;
+ }
}
- }
-
- if (lock == NULL && !(type_mode & LOCK_WAIT)) {
/* Look for a similar record lock on the same page:
if one is found and there are no waiting lock requests,
we can just set the bit */
+ if (lock_t* lock = lock_rec_find_similar_on_page(
+ type_mode, heap_no, first_lock, trx)) {
+ trx_t* lock_trx = lock->trx;
+ if (caller_owns_trx_mutex) {
+ trx->mutex_unlock();
+ }
+ {
+ TMTrxGuard tg{*lock_trx};
+ lock_rec_set_nth_bit(lock, heap_no);
+ }
- lock = lock_rec_find_similar_on_page(
- type_mode, heap_no, first_lock, trx);
-
- if (lock != NULL) {
-
- lock_rec_set_nth_bit(lock, heap_no);
-
+ if (caller_owns_trx_mutex) {
+ trx->mutex_lock();
+ }
return;
}
}
- lock_rec_create(
-#ifdef WITH_WSREP
- NULL, NULL,
-#endif
- type_mode, block, heap_no, index, trx, caller_owns_trx_mutex);
+create:
+ /* Note: We will not pass any conflicting lock to lock_rec_create(),
+ because we should be moving an existing waiting lock request. */
+ ut_ad(!(type_mode & LOCK_WAIT) || trx->lock.wait_trx);
+
+ lock_rec_create_low(nullptr,
+ type_mode, id, page, heap_no, index, trx,
+ caller_owns_trx_mutex);
}
/*********************************************************************//**
@@ -1873,58 +1494,67 @@ lock_rec_lock(
que_thr_t* thr) /*!< in: query thread */
{
trx_t *trx= thr_get_trx(thr);
- dberr_t err= DB_SUCCESS;
-
+ /* There must not be lock requests for reads or updates if transaction was
+ chosen as deadlock victim. Apart from Galera, only transactions that have
+ waiting lock may be chosen as deadlock victims. Only one lock can be waited
+ for at a time, and a transaction is associated with a single thread. Galera
+ transaction abort can be invoked from MDL acquisition code when the
+ transaction does not have waiting lock, that's why we check only deadlock
+ victim bit here. */
+ ut_ad(!(trx->lock.was_chosen_as_deadlock_victim & 1));
ut_ad(!srv_read_only_mode);
- ut_ad((LOCK_MODE_MASK & mode) == LOCK_S ||
- (LOCK_MODE_MASK & mode) == LOCK_X);
- ut_ad((mode & LOCK_TYPE_MASK) == LOCK_GAP ||
- (mode & LOCK_TYPE_MASK) == LOCK_REC_NOT_GAP ||
- (mode & LOCK_TYPE_MASK) == 0);
+ ut_ad(((LOCK_MODE_MASK | LOCK_TABLE) & mode) == LOCK_S ||
+ ((LOCK_MODE_MASK | LOCK_TABLE) & mode) == LOCK_X);
+ ut_ad(~mode & (LOCK_GAP | LOCK_REC_NOT_GAP));
ut_ad(dict_index_is_clust(index) || !dict_index_is_online_ddl(index));
DBUG_EXECUTE_IF("innodb_report_deadlock", return DB_DEADLOCK;);
- lock_mutex_enter();
ut_ad((LOCK_MODE_MASK & mode) != LOCK_S ||
lock_table_has(trx, index->table, LOCK_IS));
ut_ad((LOCK_MODE_MASK & mode) != LOCK_X ||
lock_table_has(trx, index->table, LOCK_IX));
if (lock_table_has(trx, index->table,
- static_cast<lock_mode>(LOCK_MODE_MASK & mode)));
- else if (lock_t *lock= lock_sys.get_first(block->page.id()))
+ static_cast<lock_mode>(LOCK_MODE_MASK & mode)))
+ return DB_SUCCESS;
+
+ /* During CREATE TABLE, we will write to newly created FTS_*_CONFIG
+ on which no lock has been created yet. */
+ ut_ad(!trx->dict_operation_lock_mode ||
+ (strstr(index->table->name.m_name, "/FTS_") &&
+ strstr(index->table->name.m_name, "_CONFIG") + sizeof("_CONFIG") ==
+ index->table->name.m_name + strlen(index->table->name.m_name) + 1));
+ MONITOR_ATOMIC_INC(MONITOR_NUM_RECLOCK_REQ);
+ const page_id_t id{block->page.id()};
+ LockGuard g{lock_sys.rec_hash, id};
+
+ if (lock_t *lock= lock_sys_t::get_first(g.cell(), id))
{
- trx_mutex_enter(trx);
+ dberr_t err= DB_SUCCESS;
+ trx->mutex_lock();
if (lock_rec_get_next_on_page(lock) ||
lock->trx != trx ||
- lock->type_mode != (ulint(mode) | LOCK_REC) ||
+ lock->type_mode != mode ||
lock_rec_get_n_bits(lock) <= heap_no)
{
/* Do nothing if the trx already has a strong enough lock on rec */
- if (!lock_rec_has_expl(mode, block, heap_no, trx))
+ if (!lock_rec_has_expl(mode, g.cell(), id, heap_no, trx))
{
- if (
-#ifdef WITH_WSREP
- lock_t *c_lock=
-#endif
- lock_rec_other_has_conflicting(mode, block, heap_no, trx))
- {
+ if (lock_t *c_lock= lock_rec_other_has_conflicting(mode, g.cell(), id,
+ heap_no, trx))
/*
If another transaction has a non-gap conflicting
request in the queue, as this transaction does not
have a lock strong enough already granted on the
- record, we have to wait. */
- err = lock_rec_enqueue_waiting(
-#ifdef WITH_WSREP
- c_lock,
-#endif /* WITH_WSREP */
- mode, block, heap_no, index, thr, NULL);
- }
+ record, we have to wait.
+ */
+ err= lock_rec_enqueue_waiting(c_lock, mode, id, block->page.frame,
+ heap_no, index, thr, nullptr);
else if (!impl)
{
/* Set the requested lock on the record. */
- lock_rec_add_to_queue(LOCK_REC | mode, block, heap_no, index, trx,
- true);
+ lock_rec_add_to_queue(mode, g.cell(), id, block->page.frame, heap_no,
+ index, trx, true);
err= DB_SUCCESS_LOCKED_REC;
}
}
@@ -1941,26 +1571,16 @@ lock_rec_lock(
err= DB_SUCCESS_LOCKED_REC;
}
}
- trx_mutex_exit(trx);
+ trx->mutex_unlock();
+ return err;
}
- else
- {
- /*
- Simplified and faster path for the most common cases
- Note that we don't own the trx mutex.
- */
- if (!impl)
- lock_rec_create(
-#ifdef WITH_WSREP
- NULL, NULL,
-#endif
- mode, block, heap_no, index, trx, false);
- err= DB_SUCCESS_LOCKED_REC;
- }
- lock_mutex_exit();
- MONITOR_ATOMIC_INC(MONITOR_NUM_RECLOCK_REQ);
- return err;
+ /* Simplified and faster path for the most common cases */
+ if (!impl)
+ lock_rec_create_low(nullptr, mode, id, block->page.frame, heap_no, index,
+ trx, false);
+
+ return DB_SUCCESS_LOCKED_REC;
}
/*********************************************************************//**
@@ -1968,27 +1588,23 @@ Checks if a waiting record lock request still has to wait in a queue.
@return lock that is causing the wait */
static
const lock_t*
-lock_rec_has_to_wait_in_queue(
-/*==========================*/
- const lock_t* wait_lock) /*!< in: waiting record lock */
+lock_rec_has_to_wait_in_queue(const hash_cell_t &cell, const lock_t *wait_lock)
{
const lock_t* lock;
ulint heap_no;
ulint bit_mask;
ulint bit_offset;
- ut_ad(wait_lock);
- ut_ad(lock_mutex_own());
- ut_ad(lock_get_wait(wait_lock));
- ut_ad(lock_get_type_low(wait_lock) == LOCK_REC);
+ ut_ad(wait_lock->is_waiting());
+ ut_ad(!wait_lock->is_table());
heap_no = lock_rec_find_set_bit(wait_lock);
bit_offset = heap_no / 8;
bit_mask = static_cast<ulint>(1) << (heap_no % 8);
- for (lock = lock_sys.get_first(*lock_hash_get(wait_lock->type_mode),
- wait_lock->un_member.rec_lock.page_id);
+ for (lock = lock_sys_t::get_first(
+ cell, wait_lock->un_member.rec_lock.page_id);
lock != wait_lock;
lock = lock_rec_get_next_on_page_const(lock)) {
const byte* p = (const byte*) &lock[1];
@@ -1997,9 +1613,9 @@ lock_rec_has_to_wait_in_queue(
&& (p[bit_offset] & bit_mask)
&& lock_has_to_wait(wait_lock, lock)) {
#ifdef WITH_WSREP
- if (lock->trx->is_wsrep()
- && wsrep_thd_order_before(wait_lock->trx->mysql_thd,
- lock->trx->mysql_thd)) {
+ if (lock->trx->is_wsrep() &&
+ wsrep_thd_order_before(wait_lock->trx->mysql_thd,
+ lock->trx->mysql_thd)) {
/* don't wait for another BF lock */
continue;
}
@@ -2011,249 +1627,498 @@ lock_rec_has_to_wait_in_queue(
return(NULL);
}
-/** Grant a lock to a waiting lock request and release the waiting transaction
-after lock_reset_lock_and_trx_wait() has been called. */
-static void lock_grant_after_reset(lock_t* lock)
+/** Note that a record lock wait started */
+inline void lock_sys_t::wait_start()
{
- ut_ad(lock_mutex_own());
- ut_ad(trx_mutex_own(lock->trx));
-
- if (lock_get_mode(lock) == LOCK_AUTO_INC) {
- dict_table_t* table = lock->un_member.tab_lock.table;
-
- if (table->autoinc_trx == lock->trx) {
- ib::error() << "Transaction already had an"
- << " AUTO-INC lock!";
- } else {
- table->autoinc_trx = lock->trx;
+ mysql_mutex_assert_owner(&wait_mutex);
+ wait_count+= WAIT_COUNT_STEP + 1;
+ /* The maximum number of concurrently waiting transactions is one less
+ than the maximum number of concurrent transactions. */
+ static_assert(WAIT_COUNT_STEP == UNIV_PAGE_SIZE_MAX / 16 * TRX_SYS_N_RSEGS,
+ "compatibility");
+}
- ib_vector_push(lock->trx->autoinc_locks, &lock);
- }
- }
+/** Note that a record lock wait resumed */
+inline
+void lock_sys_t::wait_resume(THD *thd, my_hrtime_t start, my_hrtime_t now)
+{
+ mysql_mutex_assert_owner(&wait_mutex);
+ ut_ad(get_wait_pending());
+ ut_ad(get_wait_cumulative());
+ wait_count--;
+ if (now.val >= start.val)
+ {
+ const uint32_t diff_time=
+ static_cast<uint32_t>((now.val - start.val) / 1000);
+ wait_time+= diff_time;
- DBUG_PRINT("ib_lock", ("wait for trx " TRX_ID_FMT " ends",
- trx_get_id_for_print(lock->trx)));
+ if (diff_time > wait_time_max)
+ wait_time_max= diff_time;
- /* If we are resolving a deadlock by choosing another transaction
- as a victim, then our original transaction may not be in the
- TRX_QUE_LOCK_WAIT state, and there is no need to end the lock wait
- for it */
+ thd_storage_lock_wait(thd, diff_time);
+ }
+}
- if (lock->trx->lock.que_state == TRX_QUE_LOCK_WAIT) {
- que_thr_t* thr;
+#ifdef HAVE_REPLICATION
+ATTRIBUTE_NOINLINE MY_ATTRIBUTE((nonnull))
+/** Report lock waits to parallel replication. Sets
+trx->error_state= DB_DEADLOCK if trx->lock.was_chosen_as_deadlock_victim was
+set when lock_sys.wait_mutex was unlocked.
+@param trx transaction that may be waiting for a lock
+@param wait_lock lock that is being waited for */
+static void lock_wait_rpl_report(trx_t *trx)
+{
+ mysql_mutex_assert_owner(&lock_sys.wait_mutex);
+ ut_ad(trx->state == TRX_STATE_ACTIVE);
+ THD *const thd= trx->mysql_thd;
+ ut_ad(thd);
+ const lock_t *wait_lock= trx->lock.wait_lock;
+ if (!wait_lock)
+ return;
+ ut_ad(!(wait_lock->type_mode & LOCK_AUTO_INC));
+ /* This would likely be too large to attempt to use a memory transaction,
+ even for wait_lock->is_table(). */
+ const bool nowait= lock_sys.wr_lock_try();
+ if (!nowait)
+ {
+ mysql_mutex_unlock(&lock_sys.wait_mutex);
+ lock_sys.wr_lock(SRW_LOCK_CALL);
+ mysql_mutex_lock(&lock_sys.wait_mutex);
+ wait_lock= trx->lock.wait_lock;
+ if (!wait_lock)
+ {
+func_exit:
+ lock_sys.wr_unlock();
+ /* trx->lock.was_chosen_as_deadlock_victim can be set when
+ lock_sys.wait_mutex was unlocked, let's check it. */
+ if (!nowait && trx->lock.was_chosen_as_deadlock_victim)
+ trx->error_state= DB_DEADLOCK;
+ return;
+ }
+ ut_ad(wait_lock->is_waiting());
+ }
+ else if (!wait_lock->is_waiting())
+ goto func_exit;
+ ut_ad(!(wait_lock->type_mode & LOCK_AUTO_INC));
- thr = que_thr_end_lock_wait(lock->trx);
+ if (wait_lock->is_table())
+ {
+ dict_table_t *table= wait_lock->un_member.tab_lock.table;
+ for (lock_t *lock= UT_LIST_GET_FIRST(table->locks); lock;
+ lock= UT_LIST_GET_NEXT(un_member.tab_lock.locks, lock))
+ if (!(lock->type_mode & LOCK_AUTO_INC) && lock->trx != trx)
+ thd_rpl_deadlock_check(thd, lock->trx->mysql_thd);
+ }
+ else
+ {
+ const page_id_t id{wait_lock->un_member.rec_lock.page_id};
+ hash_cell_t &cell= *(wait_lock->type_mode & LOCK_PREDICATE
+ ? lock_sys.prdt_hash : lock_sys.rec_hash).cell_get
+ (id.fold());
+ if (lock_t *lock= lock_sys_t::get_first(cell, id))
+ {
+ const ulint heap_no= lock_rec_find_set_bit(wait_lock);
+ if (!lock_rec_get_nth_bit(lock, heap_no))
+ lock= lock_rec_get_next(heap_no, lock);
+ do
+ if (lock->trx->mysql_thd != thd)
+ thd_rpl_deadlock_check(thd, lock->trx->mysql_thd);
+ while ((lock= lock_rec_get_next(heap_no, lock)));
+ }
+ }
- if (thr != NULL) {
- lock_wait_release_thread_if_suspended(thr);
- }
- }
+ goto func_exit;
}
+#endif /* HAVE_REPLICATION */
-/** Grant a lock to a waiting lock request and release the waiting transaction. */
-static void lock_grant(lock_t* lock)
+/** Wait for a lock to be released.
+@retval DB_DEADLOCK if this transaction was chosen as the deadlock victim
+@retval DB_INTERRUPTED if the execution was interrupted by the user
+@retval DB_LOCK_WAIT_TIMEOUT if the lock wait timed out
+@retval DB_SUCCESS if the lock was granted */
+dberr_t lock_wait(que_thr_t *thr)
{
- lock_reset_lock_and_trx_wait(lock);
- trx_mutex_enter(lock->trx);
- lock_grant_after_reset(lock);
- trx_mutex_exit(lock->trx);
-}
+ trx_t *trx= thr_get_trx(thr);
-/*************************************************************//**
-Cancels a waiting record lock request and releases the waiting transaction
-that requested it. NOTE: does NOT check if waiting lock requests behind this
-one can now be granted! */
-static
-void
-lock_rec_cancel(
-/*============*/
- lock_t* lock) /*!< in: waiting record lock request */
-{
- que_thr_t* thr;
+ if (trx->mysql_thd)
+ DEBUG_SYNC_C("lock_wait_start");
+
+ /* Create the sync point for any quit from the function. */
+ ut_d(SCOPE_EXIT([trx]() {
+ if (trx->mysql_thd)
+ DEBUG_SYNC_C("lock_wait_end");
+ }));
+
+ /* InnoDB system transactions may use the global value of
+ innodb_lock_wait_timeout, because trx->mysql_thd == NULL. */
+ const ulong innodb_lock_wait_timeout= trx_lock_wait_timeout_get(trx);
+ const my_hrtime_t suspend_time= my_hrtime_coarse();
+ ut_ad(!trx->dict_operation_lock_mode);
+
+ /* The wait_lock can be cleared by another thread in lock_grant(),
+ lock_rec_cancel(), or lock_cancel_waiting_and_release(). But, a wait
+ can only be initiated by the current thread which owns the transaction.
+
+ Even if trx->lock.wait_lock were changed, the object that it used to
+ point to it will remain valid memory (remain allocated from
+ trx->lock.lock_heap). If trx->lock.wait_lock was set to nullptr, the
+ original object could be transformed to a granted lock. On a page
+ split or merge, we would change trx->lock.wait_lock to point to
+ another waiting lock request object, and the old object would be
+ logically discarded.
+
+ In any case, it is safe to read the memory that wait_lock points to,
+ even though we are not holding any mutex. We are only reading
+ wait_lock->type_mode & (LOCK_TABLE | LOCK_AUTO_INC), which will be
+ unaffected by any page split or merge operation. (Furthermore,
+ table lock objects will never be cloned or moved.) */
+ const lock_t *const wait_lock= trx->lock.wait_lock;
+
+ if (!wait_lock)
+ {
+ /* The lock has already been released or this transaction
+ was chosen as a deadlock victim: no need to wait */
+ trx->error_state=
+ trx->lock.was_chosen_as_deadlock_victim ? DB_DEADLOCK : DB_SUCCESS;
+ return trx->error_state;
+ }
- ut_ad(lock_mutex_own());
- ut_ad(lock_get_type_low(lock) == LOCK_REC);
+ trx->lock.suspend_time= suspend_time;
- /* Reset the bit (there can be only one set bit) in the lock bitmap */
- lock_rec_reset_nth_bit(lock, lock_rec_find_set_bit(lock));
+ ut_ad(!trx->dict_operation_lock_mode);
- /* Reset the wait flag and the back pointer to lock in trx */
+ IF_WSREP(if (trx->is_wsrep()) lock_wait_wsrep(trx),);
- lock_reset_lock_and_trx_wait(lock);
+ const auto type_mode= wait_lock->type_mode;
+#ifdef HAVE_REPLICATION
+ /* Even though lock_wait_rpl_report() has nothing to do with
+ deadlock detection, it was always disabled by innodb_deadlock_detect=OFF.
+ We will keep it in that way, because unfortunately
+ thd_need_wait_reports() will hold even if parallel (or any) replication
+ is not being used. We want to be allow the user to skip
+ lock_wait_rpl_report(). */
+ const bool rpl= !(type_mode & LOCK_AUTO_INC) && trx->mysql_thd &&
+ innodb_deadlock_detect && thd_need_wait_reports(trx->mysql_thd);
+#endif
+ const bool row_lock_wait= thr->lock_state == QUE_THR_LOCK_ROW;
+ timespec abstime;
+ set_timespec_time_nsec(abstime, suspend_time.val * 1000);
+ abstime.MY_tv_sec+= innodb_lock_wait_timeout;
+ /* Dictionary transactions must wait be immune to lock wait timeouts
+ for locks on data dictionary tables. Here we check only for
+ SYS_TABLES, SYS_COLUMNS, SYS_INDEXES, SYS_FIELDS. Locks on further
+ tables SYS_FOREIGN, SYS_FOREIGN_COLS, SYS_VIRTUAL will only be
+ acquired while holding an exclusive lock on one of the 4 tables. */
+ const bool no_timeout= innodb_lock_wait_timeout >= 100000000 ||
+ ((type_mode & LOCK_TABLE) &&
+ wait_lock->un_member.tab_lock.table->id <= DICT_FIELDS_ID);
+ thd_wait_begin(trx->mysql_thd, (type_mode & LOCK_TABLE)
+ ? THD_WAIT_TABLE_LOCK : THD_WAIT_ROW_LOCK);
+
+ int err= 0;
+ mysql_mutex_lock(&lock_sys.wait_mutex);
+ if (trx->lock.wait_lock)
+ {
+ if (Deadlock::check_and_resolve(trx))
+ {
+ ut_ad(!trx->lock.wait_lock);
+ trx->error_state= DB_DEADLOCK;
+ goto end_wait;
+ }
+ }
+ else
+ {
+ /* trx->lock.was_chosen_as_deadlock_victim can be changed before
+ lock_sys.wait_mutex is acquired, so let's check it once more. */
+ trx->error_state=
+ trx->lock.was_chosen_as_deadlock_victim ? DB_DEADLOCK : DB_SUCCESS;
+ goto end_wait;
+ }
+ if (row_lock_wait)
+ lock_sys.wait_start();
- /* The following function releases the trx from lock wait */
+#ifdef HAVE_REPLICATION
+ if (rpl)
+ lock_wait_rpl_report(trx);
+#endif
- trx_mutex_enter(lock->trx);
+ if (trx->error_state != DB_SUCCESS)
+ goto check_trx_error;
- thr = que_thr_end_lock_wait(lock->trx);
+ while (trx->lock.wait_lock)
+ {
+ DEBUG_SYNC_C("lock_wait_before_suspend");
- if (thr != NULL) {
- lock_wait_release_thread_if_suspended(thr);
- }
+ if (no_timeout)
+ my_cond_wait(&trx->lock.cond, &lock_sys.wait_mutex.m_mutex);
+ else
+ err= my_cond_timedwait(&trx->lock.cond, &lock_sys.wait_mutex.m_mutex,
+ &abstime);
+check_trx_error:
+ switch (trx->error_state) {
+ case DB_DEADLOCK:
+ case DB_INTERRUPTED:
+ break;
+ default:
+ ut_ad(trx->error_state != DB_LOCK_WAIT_TIMEOUT);
+ /* Dictionary transactions must ignore KILL, because they could
+ be executed as part of a multi-transaction DDL operation,
+ such as rollback_inplace_alter_table() or ha_innobase::delete_table(). */
+ if (!trx->dict_operation && trx_is_interrupted(trx))
+ /* innobase_kill_query() can only set trx->error_state=DB_INTERRUPTED
+ for any transaction that is attached to a connection. */
+ trx->error_state= DB_INTERRUPTED;
+ else if (!err)
+ continue;
+#ifdef WITH_WSREP
+ else if (trx->is_wsrep() && wsrep_is_BF_lock_timeout(*trx));
+#endif
+ else
+ {
+ trx->error_state= DB_LOCK_WAIT_TIMEOUT;
+ lock_sys.timeouts++;
+ }
+ }
+ break;
+ }
+
+ if (row_lock_wait)
+ lock_sys.wait_resume(trx->mysql_thd, suspend_time, my_hrtime_coarse());
+
+ /* Cache trx->lock.wait_lock to avoid unnecessary atomic variable load */
+ if (lock_t *lock= trx->lock.wait_lock)
+ {
+ lock_sys_t::cancel<false>(trx, lock);
+ lock_sys.deadlock_check();
+ }
+
+end_wait:
+ mysql_mutex_unlock(&lock_sys.wait_mutex);
+ thd_wait_end(trx->mysql_thd);
+
+ return trx->error_state;
+}
+
+
+/** Resume a lock wait */
+template <bool from_deadlock= false>
+void lock_wait_end(trx_t *trx)
+{
+ mysql_mutex_assert_owner(&lock_sys.wait_mutex);
+ ut_ad(trx->mutex_is_owner());
+ ut_d(const auto state= trx->state);
+ ut_ad(state == TRX_STATE_COMMITTED_IN_MEMORY || state == TRX_STATE_ACTIVE ||
+ state == TRX_STATE_PREPARED);
+ /* lock_wait() checks trx->lock.was_chosen_as_deadlock_victim flag before
+ requesting lock_sys.wait_mutex, and if the flag is set, it returns error,
+ what causes transaction rollback, which can reset trx->lock.wait_thr before
+ deadlock resolution starts cancelling victim's waiting lock. That's why we
+ don't check trx->lock.wait_thr here if the function was called from deadlock
+ resolution function. */
+ ut_ad(from_deadlock || trx->lock.wait_thr);
- trx_mutex_exit(lock->trx);
+ if (trx->lock.was_chosen_as_deadlock_victim)
+ {
+ ut_ad(from_deadlock || state == TRX_STATE_ACTIVE);
+ trx->error_state= DB_DEADLOCK;
+ }
+
+ trx->lock.wait_thr= nullptr;
+ pthread_cond_signal(&trx->lock.cond);
}
-static void lock_grant_and_move_on_page(ulint rec_fold, const page_id_t id)
+/** Grant a waiting lock request and release the waiting transaction. */
+static void lock_grant(lock_t *lock)
{
- lock_t* lock;
- lock_t* previous = static_cast<lock_t*>(
- lock_sys.rec_hash.array[lock_sys.rec_hash.calc_hash(rec_fold)].
- node);
- if (previous == NULL) {
- return;
- }
- if (previous->un_member.rec_lock.page_id == id) {
- lock = previous;
- }
- else {
- while (previous->hash &&
- (previous->hash->un_member.rec_lock.page_id != id)) {
- previous = previous->hash;
- }
- lock = previous->hash;
- }
+ lock_reset_lock_and_trx_wait(lock);
+ trx_t *trx= lock->trx;
+ trx->mutex_lock();
+ if (lock->mode() == LOCK_AUTO_INC)
+ {
+ dict_table_t *table= lock->un_member.tab_lock.table;
+ ut_ad(!table->autoinc_trx);
+ table->autoinc_trx= trx;
+ ib_vector_push(trx->autoinc_locks, &lock);
+ }
- ut_ad(previous->hash == lock || previous == lock);
- /* Grant locks if there are no conflicting locks ahead.
- Move granted locks to the head of the list. */
- while (lock) {
- /* If the lock is a wait lock on this page, and it does not need to wait. */
- ut_ad(!lock->trx->is_wsrep());
- if (lock_get_wait(lock)
- && lock->un_member.rec_lock.page_id == id
- && !lock_rec_has_to_wait_in_queue(lock)) {
- lock_grant(lock);
+ DBUG_PRINT("ib_lock", ("wait for trx " TRX_ID_FMT " ends", trx->id));
- if (previous != NULL) {
- /* Move the lock to the head of the list. */
- HASH_GET_NEXT(hash, previous) = HASH_GET_NEXT(hash, lock);
- lock_rec_insert_to_head(lock, rec_fold);
- } else {
- /* Already at the head of the list. */
- previous = lock;
- }
- /* Move on to the next lock. */
- lock = static_cast<lock_t *>(HASH_GET_NEXT(hash, previous));
- } else {
- previous = lock;
- lock = static_cast<lock_t *>(HASH_GET_NEXT(hash, lock));
- }
- }
+ /* If we are resolving a deadlock by choosing another transaction as
+ a victim, then our original transaction may not be waiting anymore */
+
+ if (trx->lock.wait_thr)
+ lock_wait_end(trx);
+
+ trx->mutex_unlock();
+}
+
+/*************************************************************//**
+Cancels a waiting record lock request and releases the waiting transaction
+that requested it. NOTE: does NOT check if waiting lock requests behind this
+one can now be granted! */
+static void lock_rec_cancel(lock_t *lock)
+{
+ trx_t *trx= lock->trx;
+ mysql_mutex_lock(&lock_sys.wait_mutex);
+ trx->mutex_lock();
+
+ ut_d(lock_sys.hash_get(lock->type_mode).
+ assert_locked(lock->un_member.rec_lock.page_id));
+ /* Reset the bit (there can be only one set bit) in the lock bitmap */
+ lock_rec_reset_nth_bit(lock, lock_rec_find_set_bit(lock));
+
+ /* Reset the wait flag and the back pointer to lock in trx */
+ lock_reset_lock_and_trx_wait(lock);
+
+ /* The following releases the trx from lock wait */
+ lock_wait_end(trx);
+ mysql_mutex_unlock(&lock_sys.wait_mutex);
+ trx->mutex_unlock();
}
/** Remove a record lock request, waiting or granted, from the queue and
grant locks to other transactions in the queue if they now are entitled
to a lock. NOTE: all record locks contained in in_lock are removed.
-@param[in,out] in_lock record lock */
-static void lock_rec_dequeue_from_page(lock_t* in_lock)
+@param[in,out] in_lock record lock
+@param[in] owns_wait_mutex whether lock_sys.wait_mutex is held */
+static void lock_rec_dequeue_from_page(lock_t *in_lock, bool owns_wait_mutex)
{
- hash_table_t* lock_hash;
-
- ut_ad(lock_mutex_own());
- ut_ad(lock_get_type_low(in_lock) == LOCK_REC);
- /* We may or may not be holding in_lock->trx->mutex here. */
+#ifdef SAFE_MUTEX
+ ut_ad(owns_wait_mutex == mysql_mutex_is_owner(&lock_sys.wait_mutex));
+#endif /* SAFE_MUTEX */
+ ut_ad(!in_lock->is_table());
- const page_id_t page_id(in_lock->un_member.rec_lock.page_id);
+ const page_id_t page_id{in_lock->un_member.rec_lock.page_id};
+ auto& lock_hash = lock_sys.hash_get(in_lock->type_mode);
+ ut_ad(lock_sys.is_writer() || in_lock->trx->mutex_is_owner());
+ ut_d(auto old_n_locks=)
in_lock->index->table->n_rec_locks--;
-
- lock_hash = lock_hash_get(in_lock->type_mode);
+ ut_ad(old_n_locks);
const ulint rec_fold = page_id.fold();
+ hash_cell_t &cell = *lock_hash.cell_get(rec_fold);
+ lock_sys.assert_locked(cell);
- HASH_DELETE(lock_t, hash, lock_hash, rec_fold, in_lock);
+ HASH_DELETE(lock_t, hash, &lock_hash, rec_fold, in_lock);
+ ut_ad(lock_sys.is_writer() || in_lock->trx->mutex_is_owner());
UT_LIST_REMOVE(in_lock->trx->lock.trx_locks, in_lock);
MONITOR_INC(MONITOR_RECLOCK_REMOVED);
MONITOR_DEC(MONITOR_NUM_RECLOCK);
- if (innodb_lock_schedule_algorithm
- == INNODB_LOCK_SCHEDULE_ALGORITHM_FCFS
- || lock_hash != &lock_sys.rec_hash
- || thd_is_replication_slave_thread(in_lock->trx->mysql_thd)) {
- /* Check if waiting locks in the queue can now be granted:
- grant locks if there are no conflicting locks ahead. Stop at
- the first X lock that is waiting or has been granted. */
+ bool acquired = false;
- for (lock_t* lock = lock_sys.get_first(*lock_hash, page_id);
- lock != NULL;
- lock = lock_rec_get_next_on_page(lock)) {
+ /* Check if waiting locks in the queue can now be granted:
+ grant locks if there are no conflicting locks ahead. Stop at
+ the first X lock that is waiting or has been granted. */
- if (!lock_get_wait(lock)) {
- continue;
- }
- const lock_t* c = lock_rec_has_to_wait_in_queue(lock);
- if (!c) {
- /* Grant the lock */
- ut_ad(lock->trx != in_lock->trx);
- lock_grant(lock);
+ for (lock_t* lock = lock_sys_t::get_first(cell, page_id);
+ lock != NULL;
+ lock = lock_rec_get_next_on_page(lock)) {
+
+ if (!lock->is_waiting()) {
+ continue;
+ }
+
+ if (!owns_wait_mutex) {
+ mysql_mutex_lock(&lock_sys.wait_mutex);
+ acquired = owns_wait_mutex = true;
+ }
+
+ ut_ad(lock->trx->lock.wait_trx);
+ ut_ad(lock->trx->lock.wait_lock);
+
+ if (const lock_t* c = lock_rec_has_to_wait_in_queue(
+ cell, lock)) {
+ trx_t* c_trx = c->trx;
+ lock->trx->lock.wait_trx = c_trx;
+ if (c_trx->lock.wait_trx
+ && innodb_deadlock_detect
+ && Deadlock::to_check.emplace(c_trx).second) {
+ Deadlock::to_be_checked = true;
}
+ } else {
+ /* Grant the lock */
+ ut_ad(lock->trx != in_lock->trx);
+ lock_grant(lock);
}
- } else {
- lock_grant_and_move_on_page(rec_fold, page_id);
+ }
+
+ if (acquired) {
+ mysql_mutex_unlock(&lock_sys.wait_mutex);
}
}
-/*************************************************************//**
-Removes a record lock request, waiting or granted, from the queue. */
-void
-lock_rec_discard(
-/*=============*/
- lock_t* in_lock) /*!< in: record lock object: all
- record locks which are contained
- in this lock object are removed */
+/** Remove a record lock request, waiting or granted, on a discarded page
+@param hash hash table
+@param in_lock lock object */
+TRANSACTIONAL_TARGET
+void lock_rec_discard(lock_sys_t::hash_table &lock_hash, lock_t *in_lock)
{
- trx_lock_t* trx_lock;
-
- ut_ad(lock_mutex_own());
- ut_ad(lock_get_type_low(in_lock) == LOCK_REC);
-
- trx_lock = &in_lock->trx->lock;
-
- in_lock->index->table->n_rec_locks--;
-
- HASH_DELETE(lock_t, hash, lock_hash_get(in_lock->type_mode),
- in_lock->un_member.rec_lock.page_id.fold(), in_lock);
+ ut_ad(!in_lock->is_table());
+ lock_hash.assert_locked(in_lock->un_member.rec_lock.page_id);
- UT_LIST_REMOVE(trx_lock->trx_locks, in_lock);
-
- MONITOR_INC(MONITOR_RECLOCK_REMOVED);
- MONITOR_DEC(MONITOR_NUM_RECLOCK);
+ HASH_DELETE(lock_t, hash, &lock_hash,
+ in_lock->un_member.rec_lock.page_id.fold(), in_lock);
+ ut_d(uint32_t old_locks);
+ {
+ trx_t *trx= in_lock->trx;
+ TMTrxGuard tg{*trx};
+ ut_d(old_locks=)
+ in_lock->index->table->n_rec_locks--;
+ UT_LIST_REMOVE(trx->lock.trx_locks, in_lock);
+ }
+ ut_ad(old_locks);
+ MONITOR_INC(MONITOR_RECLOCK_REMOVED);
+ MONITOR_DEC(MONITOR_NUM_RECLOCK);
}
/*************************************************************//**
Removes record lock objects set on an index page which is discarded. This
function does not move locks, or check for waiting locks, therefore the
lock bitmaps must already be reset when this function is called. */
-static void lock_rec_free_all_from_discard_page_low(const page_id_t id,
- hash_table_t *lock_hash)
+static void
+lock_rec_free_all_from_discard_page(page_id_t id, const hash_cell_t &cell,
+ lock_sys_t::hash_table &lock_hash)
{
- lock_t *lock= lock_sys.get_first(*lock_hash, id);
-
- while (lock)
+ for (lock_t *lock= lock_sys_t::get_first(cell, id); lock; )
{
- ut_ad(lock_rec_find_set_bit(lock) == ULINT_UNDEFINED);
- ut_ad(!lock_get_wait(lock));
+ ut_ad(&lock_hash != &lock_sys.rec_hash ||
+ lock_rec_find_set_bit(lock) == ULINT_UNDEFINED);
+ ut_ad(!lock->is_waiting());
lock_t *next_lock= lock_rec_get_next_on_page(lock);
- lock_rec_discard(lock);
+ lock_rec_discard(lock_hash, lock);
lock= next_lock;
}
}
-/*************************************************************//**
-Removes record lock objects set on an index page which is discarded. This
-function does not move locks, or check for waiting locks, therefore the
-lock bitmaps must already be reset when this function is called. */
-void
-lock_rec_free_all_from_discard_page(
-/*================================*/
- const buf_block_t* block) /*!< in: page to be discarded */
+/** Discard locks for an index when purging DELETE FROM SYS_INDEXES
+after an aborted CREATE INDEX operation.
+@param index a stale index on which ADD INDEX operation was aborted */
+ATTRIBUTE_COLD void lock_discard_for_index(const dict_index_t &index)
{
- const page_id_t page_id(block->page.id());
- lock_rec_free_all_from_discard_page_low(page_id, &lock_sys.rec_hash);
- lock_rec_free_all_from_discard_page_low(page_id, &lock_sys.prdt_hash);
- lock_rec_free_all_from_discard_page_low(page_id, &lock_sys.prdt_page_hash);
+ ut_ad(!index.is_committed());
+ /* This is very rarely executed code, and the size of the hash array
+ would exceed the maximum size of a memory transaction. */
+ LockMutexGuard g{SRW_LOCK_CALL};
+ const ulint n= lock_sys.rec_hash.pad(lock_sys.rec_hash.n_cells);
+ for (ulint i= 0; i < n; i++)
+ {
+ for (lock_t *lock= static_cast<lock_t*>(lock_sys.rec_hash.array[i].node);
+ lock; )
+ {
+ ut_ad(!lock->is_table());
+ if (lock->index == &index)
+ {
+ ut_ad(!lock->is_waiting());
+ lock_rec_discard(lock_sys.rec_hash, lock);
+ lock= static_cast<lock_t*>(lock_sys.rec_hash.array[i].node);
+ }
+ else
+ lock= lock->hash;
+ }
+ }
}
/*============= RECORD LOCK MOVING AND INHERITING ===================*/
@@ -2261,58 +2126,35 @@ lock_rec_free_all_from_discard_page(
/*************************************************************//**
Resets the lock bits for a single record. Releases transactions waiting for
lock requests here. */
+TRANSACTIONAL_TARGET
static
void
-lock_rec_reset_and_release_wait_low(
-/*================================*/
- hash_table_t* hash, /*!< in: hash table */
- const buf_block_t* block, /*!< in: buffer block containing
- the record */
- ulint heap_no)/*!< in: heap number of record */
-{
- lock_t* lock;
-
- ut_ad(lock_mutex_own());
-
- for (lock = lock_rec_get_first(hash, block, heap_no);
- lock != NULL;
- lock = lock_rec_get_next(heap_no, lock)) {
-
- if (lock_get_wait(lock)) {
- lock_rec_cancel(lock);
- } else {
- lock_rec_reset_nth_bit(lock, heap_no);
- }
- }
-}
-
-/*************************************************************//**
-Resets the lock bits for a single record. Releases transactions waiting for
-lock requests here. */
-static
-void
-lock_rec_reset_and_release_wait(
-/*============================*/
- const buf_block_t* block, /*!< in: buffer block containing
- the record */
- ulint heap_no)/*!< in: heap number of record */
+lock_rec_reset_and_release_wait(const hash_cell_t &cell, const page_id_t id,
+ ulint heap_no)
{
- lock_rec_reset_and_release_wait_low(
- &lock_sys.rec_hash, block, heap_no);
-
- lock_rec_reset_and_release_wait_low(
- &lock_sys.prdt_hash, block, PAGE_HEAP_NO_INFIMUM);
- lock_rec_reset_and_release_wait_low(
- &lock_sys.prdt_page_hash, block, PAGE_HEAP_NO_INFIMUM);
+ for (lock_t *lock= lock_sys.get_first(cell, id, heap_no); lock;
+ lock= lock_rec_get_next(heap_no, lock))
+ {
+ if (lock->is_waiting())
+ lock_rec_cancel(lock);
+ else
+ {
+ TMTrxGuard tg{*lock->trx};
+ lock_rec_reset_nth_bit(lock, heap_no);
+ }
+ }
}
/** Makes a record to inherit the locks (except LOCK_INSERT_INTENTION type)
of another record as gap type locks, but does not reset the lock bits of
the other record. Also waiting lock requests on rec are inherited as
GRANTED gap locks.
-@param heir_block block containing the record which inherits
-@param block block containing the record from which inherited; does NOT reset
- the locks on this record
+@param heir_cell heir hash table cell
+@param heir page containing the record which inherits
+@param donor_cell donor hash table cell
+@param donor page containing the record from which inherited; does NOT
+ reset the locks on this record
+@param heir_page heir page frame
@param heir_heap_no heap_no of the inheriting record
@param heap_no heap_no of the donating record
@tparam from_split true if the function is invoked from
@@ -2320,11 +2162,12 @@ GRANTED gap locks.
locks are not inherited to supremum if transaction
isolation level less or equal to READ COMMITTED */
template <bool from_split= false>
-static void lock_rec_inherit_to_gap(const buf_block_t *heir_block,
- const buf_block_t *block,
- ulint heir_heap_no, ulint heap_no)
+static void
+lock_rec_inherit_to_gap(hash_cell_t &heir_cell, const page_id_t heir,
+ const hash_cell_t &donor_cell, const page_id_t donor,
+ const page_t *heir_page, ulint heir_heap_no,
+ ulint heap_no)
{
- ut_ad(lock_mutex_own());
ut_ad(!from_split || heir_heap_no == PAGE_HEAP_NO_SUPREMUM);
/* At READ UNCOMMITTED or READ COMMITTED isolation level,
@@ -2333,13 +2176,13 @@ static void lock_rec_inherit_to_gap(const buf_block_t *heir_block,
DO want S-locks/X-locks(taken for replace) set by a consistency
constraint to be inherited also then. */
- for (lock_t *lock= lock_rec_get_first(&lock_sys.rec_hash, block, heap_no);
- lock != NULL; lock= lock_rec_get_next(heap_no, lock))
+ for (lock_t *lock= lock_sys_t::get_first(donor_cell, donor, heap_no); lock;
+ lock= lock_rec_get_next(heap_no, lock))
{
-
+ trx_t *lock_trx= lock->trx;
if (!lock->trx->is_not_inheriting_locks() &&
- !lock_rec_get_insert_intention(lock) &&
- (lock->trx->isolation_level > TRX_ISO_READ_COMMITTED ||
+ !lock->is_insert_intention() &&
+ (lock_trx->isolation_level > TRX_ISO_READ_COMMITTED ||
/* When we are in a page split (not purge), then we don't set a lock
on supremum if the donor lock type is LOCK_REC_NOT_GAP. That is, do
not create bogus gap locks for non-gap locks for READ UNCOMMITTED and
@@ -2347,11 +2190,11 @@ static void lock_rec_inherit_to_gap(const buf_block_t *heir_block,
LOCK_GAP require a gap before the record to be locked, that is why
setting lock on supremmum is necessary. */
((!from_split || !lock->is_record_not_gap()) &&
- (lock_get_mode(lock) != (lock->trx->duplicates ? LOCK_S : LOCK_X)))))
+ lock->mode() != (lock_trx->duplicates ? LOCK_S : LOCK_X))))
{
- lock_rec_add_to_queue(LOCK_REC | LOCK_GAP | lock_get_mode(lock),
- heir_block, heir_heap_no, lock->index, lock->trx,
- FALSE);
+ lock_rec_add_to_queue(LOCK_GAP | lock->mode(), heir_cell, heir,
+ heir_page, heir_heap_no, lock->index, lock_trx,
+ false);
}
}
}
@@ -2372,40 +2215,33 @@ lock_rec_inherit_to_gap_if_gap_lock(
does NOT reset the locks
on this record */
{
- lock_t* lock;
-
- lock_mutex_enter();
-
- for (lock = lock_rec_get_first(&lock_sys.rec_hash, block, heap_no);
- lock != NULL;
- lock = lock_rec_get_next(heap_no, lock)) {
-
- if (!lock->trx->is_not_inheriting_locks()
- && !lock_rec_get_insert_intention(lock)
- && (heap_no == PAGE_HEAP_NO_SUPREMUM
- || !lock_rec_get_rec_not_gap(lock))) {
-
- lock_rec_add_to_queue(
- LOCK_REC | LOCK_GAP | lock_get_mode(lock),
- block, heir_heap_no, lock->index,
- lock->trx, FALSE);
- }
- }
+ const page_id_t id{block->page.id()};
+ LockGuard g{lock_sys.rec_hash, id};
- lock_mutex_exit();
+ for (lock_t *lock= lock_sys_t::get_first(g.cell(), id, heap_no); lock;
+ lock= lock_rec_get_next(heap_no, lock))
+ if (!lock->trx->is_not_inheriting_locks() &&
+ !lock->is_insert_intention() && (heap_no == PAGE_HEAP_NO_SUPREMUM ||
+ !lock->is_record_not_gap()) &&
+ !lock_table_has(lock->trx, lock->index->table, LOCK_X))
+ lock_rec_add_to_queue(LOCK_GAP | lock->mode(),
+ g.cell(), id, block->page.frame,
+ heir_heap_no, lock->index, lock->trx, false);
}
/*************************************************************//**
Moves the locks of a record to another record and resets the lock bits of
the donating record. */
+TRANSACTIONAL_TARGET
static
void
-lock_rec_move_low(
-/*==============*/
- hash_table_t* lock_hash, /*!< in: hash table to use */
- const buf_block_t* receiver, /*!< in: buffer block containing
+lock_rec_move(
+ hash_cell_t& receiver_cell, /*!< in: hash table cell */
+ const buf_block_t& receiver, /*!< in: buffer block containing
the receiving record */
- const buf_block_t* donator, /*!< in: buffer block containing
+ const page_id_t receiver_id, /*!< in: page identifier */
+ const hash_cell_t& donator_cell, /*!< in: hash table cell */
+ const page_id_t donator_id, /*!< in: page identifier of
the donating record */
ulint receiver_heap_no,/*!< in: heap_no of the record
which gets the locks; there
@@ -2414,39 +2250,35 @@ lock_rec_move_low(
ulint donator_heap_no)/*!< in: heap_no of the record
which gives the locks */
{
- lock_t* lock;
+ ut_ad(!lock_sys_t::get_first(receiver_cell,
+ receiver_id, receiver_heap_no));
- ut_ad(lock_mutex_own());
-
- /* If the lock is predicate lock, it resides on INFIMUM record */
- ut_ad(lock_rec_get_first(
- lock_hash, receiver, receiver_heap_no) == NULL
- || lock_hash == &lock_sys.prdt_hash
- || lock_hash == &lock_sys.prdt_page_hash);
-
- for (lock = lock_rec_get_first(lock_hash,
- donator, donator_heap_no);
+ for (lock_t *lock = lock_sys_t::get_first(donator_cell, donator_id,
+ donator_heap_no);
lock != NULL;
lock = lock_rec_get_next(donator_heap_no, lock)) {
-
const auto type_mode = lock->type_mode;
-
- lock_rec_reset_nth_bit(lock, donator_heap_no);
-
if (type_mode & LOCK_WAIT) {
- lock_reset_lock_and_trx_wait(lock);
+ ut_ad(lock->trx->lock.wait_lock == lock);
+ lock->type_mode &= ~LOCK_WAIT;
}
+ trx_t* lock_trx = lock->trx;
+ lock_trx->mutex_lock();
+ lock_rec_reset_nth_bit(lock, donator_heap_no);
+
/* Note that we FIRST reset the bit, and then set the lock:
- the function works also if donator == receiver */
+ the function works also if donator_id == receiver_id */
- lock_rec_add_to_queue(
- type_mode, receiver, receiver_heap_no,
- lock->index, lock->trx, FALSE);
+ lock_rec_add_to_queue(type_mode, receiver_cell,
+ receiver_id, receiver.page.frame,
+ receiver_heap_no,
+ lock->index, lock_trx, true);
+ lock_trx->mutex_unlock();
}
- ut_ad(!lock_rec_get_first(&lock_sys.rec_hash,
- donator, donator_heap_no));
+ ut_ad(!lock_sys_t::get_first(donator_cell, donator_id,
+ donator_heap_no));
}
/** Move all the granted locks to the front of the given lock list.
@@ -2483,32 +2315,11 @@ lock_move_granted_locks_to_front(
}
/*************************************************************//**
-Moves the locks of a record to another record and resets the lock bits of
-the donating record. */
-UNIV_INLINE
-void
-lock_rec_move(
-/*==========*/
- const buf_block_t* receiver, /*!< in: buffer block containing
- the receiving record */
- const buf_block_t* donator, /*!< in: buffer block containing
- the donating record */
- ulint receiver_heap_no,/*!< in: heap_no of the record
- which gets the locks; there
- must be no lock requests
- on it! */
- ulint donator_heap_no)/*!< in: heap_no of the record
- which gives the locks */
-{
- lock_rec_move_low(&lock_sys.rec_hash, receiver, donator,
- receiver_heap_no, donator_heap_no);
-}
-
-/*************************************************************//**
Updates the lock table when we have reorganized a page. NOTE: we copy
also the locks set on the infimum of the page; the infimum may carry
locks if an update of a record is occurring on the page, and its locks
were temporarily stored on the infimum. */
+TRANSACTIONAL_TARGET
void
lock_move_reorganize_page(
/*======================*/
@@ -2517,125 +2328,149 @@ lock_move_reorganize_page(
const buf_block_t* oblock) /*!< in: copy of the old, not
reorganized page */
{
- lock_t* lock;
- UT_LIST_BASE_NODE_T(lock_t) old_locks;
- mem_heap_t* heap = NULL;
- ulint comp;
+ mem_heap_t *heap;
- lock_mutex_enter();
+ {
+ UT_LIST_BASE_NODE_T(lock_t) old_locks;
+ UT_LIST_INIT(old_locks, &lock_t::trx_locks);
- /* FIXME: This needs to deal with predicate lock too */
- lock = lock_sys.get_first(block->page.id());
+ const page_id_t id{block->page.id()};
+ const auto id_fold= id.fold();
+ {
+ TMLockGuard g{lock_sys.rec_hash, id};
+ if (!lock_sys_t::get_first(g.cell(), id))
+ return;
+ }
- if (lock == NULL) {
- lock_mutex_exit();
+ /* We will modify arbitrary trx->lock.trx_locks.
+ Do not bother with a memory transaction; we are going
+ to allocate memory and copy a lot of data. */
+ LockMutexGuard g{SRW_LOCK_CALL};
+ hash_cell_t &cell= *lock_sys.rec_hash.cell_get(id_fold);
- return;
- }
+ /* Note: Predicate locks for SPATIAL INDEX are not affected by
+ page reorganize, because they do not refer to individual record
+ heap numbers. */
+ lock_t *lock= lock_sys_t::get_first(cell, id);
- heap = mem_heap_create(256);
+ if (!lock)
+ return;
- /* Copy first all the locks on the page to heap and reset the
- bitmaps in the original locks; chain the copies of the locks
- using the trx_locks field in them. */
+ heap= mem_heap_create(256);
- UT_LIST_INIT(old_locks, &lock_t::trx_locks);
+ /* Copy first all the locks on the page to heap and reset the
+ bitmaps in the original locks; chain the copies of the locks
+ using the trx_locks field in them. */
- do {
- /* Make a copy of the lock */
- lock_t* old_lock = lock_rec_copy(lock, heap);
+ do
+ {
+ /* Make a copy of the lock */
+ lock_t *old_lock= lock_rec_copy(lock, heap);
- UT_LIST_ADD_LAST(old_locks, old_lock);
+ UT_LIST_ADD_LAST(old_locks, old_lock);
- /* Reset bitmap of lock */
- lock_rec_bitmap_reset(lock);
+ /* Reset bitmap of lock */
+ lock_rec_bitmap_reset(lock);
- if (lock_get_wait(lock)) {
+ if (lock->is_waiting())
+ {
+ ut_ad(lock->trx->lock.wait_lock == lock);
+ lock->type_mode&= ~LOCK_WAIT;
+ }
- lock_reset_lock_and_trx_wait(lock);
- }
+ lock= lock_rec_get_next_on_page(lock);
+ }
+ while (lock);
- lock = lock_rec_get_next_on_page(lock);
- } while (lock != NULL);
+ const ulint comp= page_is_comp(block->page.frame);
+ ut_ad(comp == page_is_comp(oblock->page.frame));
- comp = page_is_comp(block->frame);
- ut_ad(comp == page_is_comp(oblock->frame));
+ lock_move_granted_locks_to_front(old_locks);
- lock_move_granted_locks_to_front(old_locks);
+ DBUG_EXECUTE_IF("do_lock_reverse_page_reorganize",
+ ut_list_reverse(old_locks););
- DBUG_EXECUTE_IF("do_lock_reverse_page_reorganize",
- ut_list_reverse(old_locks););
+ for (lock= UT_LIST_GET_FIRST(old_locks); lock;
+ lock= UT_LIST_GET_NEXT(trx_locks, lock))
+ {
+ /* NOTE: we copy also the locks set on the infimum and
+ supremum of the page; the infimum may carry locks if an
+ update of a record is occurring on the page, and its locks
+ were temporarily stored on the infimum */
+ const rec_t *rec1= page_get_infimum_rec(block->page.frame);
+ const rec_t *rec2= page_get_infimum_rec(oblock->page.frame);
+
+ /* Set locks according to old locks */
+ for (;;)
+ {
+ ulint old_heap_no;
+ ulint new_heap_no;
+ ut_d(const rec_t* const orec= rec1);
+ ut_ad(page_rec_is_metadata(rec1) == page_rec_is_metadata(rec2));
- for (lock = UT_LIST_GET_FIRST(old_locks); lock;
- lock = UT_LIST_GET_NEXT(trx_locks, lock)) {
+ if (comp)
+ {
+ old_heap_no= rec_get_heap_no_new(rec2);
+ new_heap_no= rec_get_heap_no_new(rec1);
- /* NOTE: we copy also the locks set on the infimum and
- supremum of the page; the infimum may carry locks if an
- update of a record is occurring on the page, and its locks
- were temporarily stored on the infimum */
- const rec_t* rec1 = page_get_infimum_rec(
- buf_block_get_frame(block));
- const rec_t* rec2 = page_get_infimum_rec(
- buf_block_get_frame(oblock));
-
- /* Set locks according to old locks */
- for (;;) {
- ulint old_heap_no;
- ulint new_heap_no;
- ut_d(const rec_t* const orec = rec1);
- ut_ad(page_rec_is_metadata(rec1)
- == page_rec_is_metadata(rec2));
-
- if (comp) {
- old_heap_no = rec_get_heap_no_new(rec2);
- new_heap_no = rec_get_heap_no_new(rec1);
-
- rec1 = page_rec_get_next_low(rec1, TRUE);
- rec2 = page_rec_get_next_low(rec2, TRUE);
- } else {
- old_heap_no = rec_get_heap_no_old(rec2);
- new_heap_no = rec_get_heap_no_old(rec1);
- ut_ad(!memcmp(rec1, rec2,
- rec_get_data_size_old(rec2)));
-
- rec1 = page_rec_get_next_low(rec1, FALSE);
- rec2 = page_rec_get_next_low(rec2, FALSE);
- }
+ rec1= page_rec_get_next_low(rec1, TRUE);
+ rec2= page_rec_get_next_low(rec2, TRUE);
+ }
+ else
+ {
+ old_heap_no= rec_get_heap_no_old(rec2);
+ new_heap_no= rec_get_heap_no_old(rec1);
+ ut_ad(!memcmp(rec1, rec2, rec_get_data_size_old(rec2)));
- /* Clear the bit in old_lock. */
- if (old_heap_no < lock->un_member.rec_lock.n_bits
- && lock_rec_reset_nth_bit(lock, old_heap_no)) {
- ut_ad(!page_rec_is_metadata(orec));
+ rec1= page_rec_get_next_low(rec1, FALSE);
+ rec2= page_rec_get_next_low(rec2, FALSE);
+ }
- /* NOTE that the old lock bitmap could be too
- small for the new heap number! */
+ trx_t *lock_trx= lock->trx;
+ lock_trx->mutex_lock();
- lock_rec_add_to_queue(
- lock->type_mode, block, new_heap_no,
- lock->index, lock->trx, FALSE);
- }
+ /* Clear the bit in old_lock. */
+ if (old_heap_no < lock->un_member.rec_lock.n_bits &&
+ lock_rec_reset_nth_bit(lock, old_heap_no))
+ {
+ ut_ad(!page_rec_is_metadata(orec));
- if (new_heap_no == PAGE_HEAP_NO_SUPREMUM) {
- ut_ad(old_heap_no == PAGE_HEAP_NO_SUPREMUM);
- break;
- }
- }
+ /* NOTE that the old lock bitmap could be too
+ small for the new heap number! */
+ lock_rec_add_to_queue(lock->type_mode, cell, id, block->page.frame,
+ new_heap_no, lock->index, lock_trx, true);
+ }
- ut_ad(lock_rec_find_set_bit(lock) == ULINT_UNDEFINED);
- }
+ lock_trx->mutex_unlock();
+
+ if (!rec1 || !rec2)
+ {
+ ut_ad(!rec1 == !rec2);
+ ut_ad(new_heap_no == PAGE_HEAP_NO_SUPREMUM);
+ ut_ad(old_heap_no == PAGE_HEAP_NO_SUPREMUM);
+ break;
+ }
+ }
- lock_mutex_exit();
+ ut_ad(lock_rec_find_set_bit(lock) == ULINT_UNDEFINED);
+ }
+ }
- mem_heap_free(heap);
+ mem_heap_free(heap);
#ifdef UNIV_DEBUG_LOCK_VALIDATE
- ut_ad(lock_rec_validate_page(block));
+ if (fil_space_t *space= fil_space_t::get(id.space()))
+ {
+ ut_ad(lock_rec_validate_page(block, space->is_latched()));
+ space->release();
+ }
#endif
}
/*************************************************************//**
Moves the explicit locks on user records to another page if a record
list end is moved to another page. */
+TRANSACTIONAL_TARGET
void
lock_move_rec_list_end(
/*===================*/
@@ -2644,111 +2479,133 @@ lock_move_rec_list_end(
const rec_t* rec) /*!< in: record on page: this
is the first record moved */
{
- lock_t* lock;
- const ulint comp = page_rec_is_comp(rec);
-
- ut_ad(buf_block_get_frame(block) == page_align(rec));
- ut_ad(comp == page_is_comp(buf_block_get_frame(new_block)));
+ const ulint comp= page_rec_is_comp(rec);
- lock_mutex_enter();
+ ut_ad(block->page.frame == page_align(rec));
+ ut_ad(comp == page_is_comp(new_block->page.frame));
- /* Note: when we move locks from record to record, waiting locks
- and possible granted gap type locks behind them are enqueued in
- the original order, because new elements are inserted to a hash
- table to the end of the hash chain, and lock_rec_add_to_queue
- does not reuse locks if there are waiters in the queue. */
-
- for (lock = lock_sys.get_first(block->page.id());
- lock;
- lock = lock_rec_get_next_on_page(lock)) {
- const rec_t* rec1 = rec;
- const rec_t* rec2;
- const auto type_mode = lock->type_mode;
-
- if (comp) {
- if (page_offset(rec1) == PAGE_NEW_INFIMUM) {
- rec1 = page_rec_get_next_low(rec1, TRUE);
- }
-
- rec2 = page_rec_get_next_low(
- buf_block_get_frame(new_block)
- + PAGE_NEW_INFIMUM, TRUE);
- } else {
- if (page_offset(rec1) == PAGE_OLD_INFIMUM) {
- rec1 = page_rec_get_next_low(rec1, FALSE);
- }
-
- rec2 = page_rec_get_next_low(
- buf_block_get_frame(new_block)
- + PAGE_OLD_INFIMUM, FALSE);
- }
-
- /* Copy lock requests on user records to new page and
- reset the lock bits on the old */
-
- for (;;) {
- ut_ad(page_rec_is_metadata(rec1)
- == page_rec_is_metadata(rec2));
- ut_d(const rec_t* const orec = rec1);
-
- ulint rec1_heap_no;
- ulint rec2_heap_no;
-
- if (comp) {
- rec1_heap_no = rec_get_heap_no_new(rec1);
+ const page_id_t id{block->page.id()};
+ const page_id_t new_id{new_block->page.id()};
+ {
+ /* This would likely be too large for a memory transaction. */
+ LockMultiGuard g{lock_sys.rec_hash, id, new_id};
+
+ /* Note: when we move locks from record to record, waiting locks
+ and possible granted gap type locks behind them are enqueued in
+ the original order, because new elements are inserted to a hash
+ table to the end of the hash chain, and lock_rec_add_to_queue
+ does not reuse locks if there are waiters in the queue. */
+ for (lock_t *lock= lock_sys_t::get_first(g.cell1(), id); lock;
+ lock= lock_rec_get_next_on_page(lock))
+ {
+ const rec_t *rec1= rec;
+ const rec_t *rec2;
+ const auto type_mode= lock->type_mode;
- if (rec1_heap_no == PAGE_HEAP_NO_SUPREMUM) {
- break;
- }
+ if (comp)
+ {
+ if (page_offset(rec1) == PAGE_NEW_INFIMUM)
+ rec1= page_rec_get_next_low(rec1, TRUE);
+ rec2= page_rec_get_next_low(new_block->page.frame + PAGE_NEW_INFIMUM,
+ TRUE);
+ }
+ else
+ {
+ if (page_offset(rec1) == PAGE_OLD_INFIMUM)
+ rec1= page_rec_get_next_low(rec1, FALSE);
+ rec2= page_rec_get_next_low(new_block->page.frame + PAGE_OLD_INFIMUM,
+ FALSE);
+ }
- rec2_heap_no = rec_get_heap_no_new(rec2);
- rec1 = page_rec_get_next_low(rec1, TRUE);
- rec2 = page_rec_get_next_low(rec2, TRUE);
- } else {
- rec1_heap_no = rec_get_heap_no_old(rec1);
+ if (UNIV_UNLIKELY(!rec1 || !rec2))
+ {
+ ut_ad("corrupted page" == 0);
+ return;
+ }
- if (rec1_heap_no == PAGE_HEAP_NO_SUPREMUM) {
- break;
- }
+ /* Copy lock requests on user records to new page and
+ reset the lock bits on the old */
+ for (;;)
+ {
+ ut_ad(page_rec_is_metadata(rec1) == page_rec_is_metadata(rec2));
+ ut_d(const rec_t* const orec= rec1);
- rec2_heap_no = rec_get_heap_no_old(rec2);
+ ulint rec1_heap_no;
+ ulint rec2_heap_no;
- ut_ad(rec_get_data_size_old(rec1)
- == rec_get_data_size_old(rec2));
+ if (comp)
+ {
+ rec1_heap_no= rec_get_heap_no_new(rec1);
+ if (!(rec1= page_rec_get_next_low(rec1, TRUE)))
+ {
+ ut_ad(rec1_heap_no == PAGE_HEAP_NO_SUPREMUM);
+ break;
+ }
+ rec2_heap_no= rec_get_heap_no_new(rec2);
+ rec2= page_rec_get_next_low(rec2, TRUE);
+ }
+ else
+ {
+ ut_d(const rec_t *old1= rec1);
+ rec1_heap_no= rec_get_heap_no_old(rec1);
+ if (!(rec1= page_rec_get_next_low(rec1, FALSE)))
+ {
+ ut_ad(rec1_heap_no == PAGE_HEAP_NO_SUPREMUM);
+ break;
+ }
+
+ ut_ad(rec_get_data_size_old(old1) == rec_get_data_size_old(rec2));
+ ut_ad(!memcmp(old1, rec2, rec_get_data_size_old(old1)));
+
+ rec2_heap_no= rec_get_heap_no_old(rec2);
+ rec2= page_rec_get_next_low(rec2, FALSE);
+ }
- ut_ad(!memcmp(rec1, rec2,
- rec_get_data_size_old(rec1)));
+ if (UNIV_UNLIKELY(!rec2))
+ {
+ ut_ad("corrupted page" == 0);
+ return;
+ }
- rec1 = page_rec_get_next_low(rec1, FALSE);
- rec2 = page_rec_get_next_low(rec2, FALSE);
- }
+ trx_t *lock_trx= lock->trx;
+ lock_trx->mutex_lock();
- if (rec1_heap_no < lock->un_member.rec_lock.n_bits
- && lock_rec_reset_nth_bit(lock, rec1_heap_no)) {
- ut_ad(!page_rec_is_metadata(orec));
+ if (rec1_heap_no < lock->un_member.rec_lock.n_bits &&
+ lock_rec_reset_nth_bit(lock, rec1_heap_no))
+ {
+ ut_ad(!page_rec_is_metadata(orec));
- if (type_mode & LOCK_WAIT) {
- lock_reset_lock_and_trx_wait(lock);
- }
+ if (type_mode & LOCK_WAIT)
+ {
+ ut_ad(lock_trx->lock.wait_lock == lock);
+ lock->type_mode&= ~LOCK_WAIT;
+ }
- lock_rec_add_to_queue(
- type_mode, new_block, rec2_heap_no,
- lock->index, lock->trx, FALSE);
- }
- }
- }
+ lock_rec_add_to_queue(type_mode, g.cell2(), new_id,
+ new_block->page.frame,
+ rec2_heap_no, lock->index, lock_trx, true);
+ }
- lock_mutex_exit();
+ lock_trx->mutex_unlock();
+ }
+ }
+ }
#ifdef UNIV_DEBUG_LOCK_VALIDATE
- ut_ad(lock_rec_validate_page(block));
- ut_ad(lock_rec_validate_page(new_block));
+ if (fil_space_t *space= fil_space_t::get(id.space()))
+ {
+ const bool is_latched{space->is_latched()};
+ ut_ad(lock_rec_validate_page(block, is_latched));
+ ut_ad(lock_rec_validate_page(new_block, is_latched));
+ space->release();
+ }
#endif
}
/*************************************************************//**
Moves the explicit locks on user records to another page if a record
list start is moved to another page. */
+TRANSACTIONAL_TARGET
void
lock_move_rec_list_start(
/*=====================*/
@@ -2764,104 +2621,114 @@ lock_move_rec_list_start(
before the records
were copied */
{
- lock_t* lock;
- const ulint comp = page_rec_is_comp(rec);
+ const ulint comp= page_rec_is_comp(rec);
- ut_ad(block->frame == page_align(rec));
- ut_ad(new_block->frame == page_align(old_end));
- ut_ad(comp == page_rec_is_comp(old_end));
- ut_ad(!page_rec_is_metadata(rec));
+ ut_ad(block->page.frame == page_align(rec));
+ ut_ad(comp == page_is_comp(new_block->page.frame));
+ ut_ad(new_block->page.frame == page_align(old_end));
+ ut_ad(!page_rec_is_metadata(rec));
+ const page_id_t id{block->page.id()};
+ const page_id_t new_id{new_block->page.id()};
- lock_mutex_enter();
+ {
+ /* This would likely be too large for a memory transaction. */
+ LockMultiGuard g{lock_sys.rec_hash, id, new_id};
- for (lock = lock_sys.get_first(block->page.id());
- lock;
- lock = lock_rec_get_next_on_page(lock)) {
- const rec_t* rec1;
- const rec_t* rec2;
- const auto type_mode = lock->type_mode;
-
- if (comp) {
- rec1 = page_rec_get_next_low(
- buf_block_get_frame(block)
- + PAGE_NEW_INFIMUM, TRUE);
- rec2 = page_rec_get_next_low(old_end, TRUE);
- } else {
- rec1 = page_rec_get_next_low(
- buf_block_get_frame(block)
- + PAGE_OLD_INFIMUM, FALSE);
- rec2 = page_rec_get_next_low(old_end, FALSE);
- }
+ for (lock_t *lock= lock_sys_t::get_first(g.cell1(), id); lock;
+ lock= lock_rec_get_next_on_page(lock))
+ {
+ const rec_t *rec1;
+ const rec_t *rec2;
+ const auto type_mode= lock->type_mode;
- /* Copy lock requests on user records to new page and
- reset the lock bits on the old */
+ if (comp)
+ {
+ rec1= page_rec_get_next_low(block->page.frame + PAGE_NEW_INFIMUM,
+ TRUE);
+ rec2= page_rec_get_next_low(old_end, TRUE);
+ }
+ else
+ {
+ rec1= page_rec_get_next_low(block->page.frame + PAGE_OLD_INFIMUM,
+ FALSE);
+ rec2= page_rec_get_next_low(old_end, FALSE);
+ }
- while (rec1 != rec) {
- ut_ad(page_rec_is_metadata(rec1)
- == page_rec_is_metadata(rec2));
- ut_d(const rec_t* const prev = rec1);
+ /* Copy lock requests on user records to new page and
+ reset the lock bits on the old */
- ulint rec1_heap_no;
- ulint rec2_heap_no;
+ while (rec1 != rec)
+ {
+ if (UNIV_UNLIKELY(!rec1 || !rec2))
+ {
+ ut_ad("corrupted page" == 0);
+ return;
+ }
- if (comp) {
- rec1_heap_no = rec_get_heap_no_new(rec1);
- rec2_heap_no = rec_get_heap_no_new(rec2);
+ ut_ad(page_rec_is_metadata(rec1) == page_rec_is_metadata(rec2));
+ ut_d(const rec_t* const prev= rec1);
- rec1 = page_rec_get_next_low(rec1, TRUE);
- rec2 = page_rec_get_next_low(rec2, TRUE);
- } else {
- rec1_heap_no = rec_get_heap_no_old(rec1);
- rec2_heap_no = rec_get_heap_no_old(rec2);
+ ulint rec1_heap_no;
+ ulint rec2_heap_no;
- ut_ad(!memcmp(rec1, rec2,
- rec_get_data_size_old(rec2)));
+ if (comp)
+ {
+ rec1_heap_no= rec_get_heap_no_new(rec1);
+ rec2_heap_no= rec_get_heap_no_new(rec2);
- rec1 = page_rec_get_next_low(rec1, FALSE);
- rec2 = page_rec_get_next_low(rec2, FALSE);
- }
+ rec1= page_rec_get_next_low(rec1, TRUE);
+ rec2= page_rec_get_next_low(rec2, TRUE);
+ }
+ else
+ {
+ rec1_heap_no= rec_get_heap_no_old(rec1);
+ rec2_heap_no= rec_get_heap_no_old(rec2);
- if (rec1_heap_no < lock->un_member.rec_lock.n_bits
- && lock_rec_reset_nth_bit(lock, rec1_heap_no)) {
- ut_ad(!page_rec_is_metadata(prev));
+ ut_ad(!memcmp(rec1, rec2, rec_get_data_size_old(rec2)));
- if (type_mode & LOCK_WAIT) {
- lock_reset_lock_and_trx_wait(lock);
- }
+ rec1= page_rec_get_next_low(rec1, FALSE);
+ rec2= page_rec_get_next_low(rec2, FALSE);
+ }
- lock_rec_add_to_queue(
- type_mode, new_block, rec2_heap_no,
- lock->index, lock->trx, FALSE);
- }
- }
+ trx_t *lock_trx= lock->trx;
+ lock_trx->mutex_lock();
+
+ if (rec1_heap_no < lock->un_member.rec_lock.n_bits &&
+ lock_rec_reset_nth_bit(lock, rec1_heap_no))
+ {
+ ut_ad(!page_rec_is_metadata(prev));
+
+ if (type_mode & LOCK_WAIT)
+ {
+ ut_ad(lock_trx->lock.wait_lock == lock);
+ lock->type_mode&= ~LOCK_WAIT;
+ }
+
+ lock_rec_add_to_queue(type_mode, g.cell2(), new_id,
+ new_block->page.frame,
+ rec2_heap_no, lock->index, lock_trx, true);
+ }
+
+ lock_trx->mutex_unlock();
+ }
#ifdef UNIV_DEBUG
- if (page_rec_is_supremum(rec)) {
- ulint i;
-
- for (i = PAGE_HEAP_NO_USER_LOW;
- i < lock_rec_get_n_bits(lock); i++) {
- if (lock_rec_get_nth_bit(lock, i)) {
- ib::fatal()
- << "lock_move_rec_list_start():"
- << i << " not moved in "
- << (void*) lock;
- }
- }
- }
+ if (page_rec_is_supremum(rec))
+ for (auto i= lock_rec_get_n_bits(lock); --i > PAGE_HEAP_NO_USER_LOW; )
+ ut_ad(!lock_rec_get_nth_bit(lock, i));
#endif /* UNIV_DEBUG */
- }
-
- lock_mutex_exit();
+ }
+ }
#ifdef UNIV_DEBUG_LOCK_VALIDATE
- ut_ad(lock_rec_validate_page(block));
+ ut_ad(lock_rec_validate_page(block));
#endif
}
/*************************************************************//**
Moves the explicit locks on user records to another page if a record
list start is moved to another page. */
+TRANSACTIONAL_TARGET
void
lock_rtr_move_rec_list(
/*===================*/
@@ -2872,74 +2739,80 @@ lock_rtr_move_rec_list(
moved */
ulint num_move) /*!< in: num of rec to move */
{
- lock_t* lock;
- ulint comp;
+ if (!num_move)
+ return;
- if (!num_move) {
- return;
- }
+ const ulint comp= page_rec_is_comp(rec_move[0].old_rec);
- comp = page_rec_is_comp(rec_move[0].old_rec);
+ ut_ad(block->page.frame == page_align(rec_move[0].old_rec));
+ ut_ad(new_block->page.frame == page_align(rec_move[0].new_rec));
+ ut_ad(comp == page_rec_is_comp(rec_move[0].new_rec));
+ const page_id_t id{block->page.id()};
+ const page_id_t new_id{new_block->page.id()};
- ut_ad(block->frame == page_align(rec_move[0].old_rec));
- ut_ad(new_block->frame == page_align(rec_move[0].new_rec));
- ut_ad(comp == page_rec_is_comp(rec_move[0].new_rec));
-
- lock_mutex_enter();
-
- for (lock = lock_sys.get_first(block->page.id());
- lock;
- lock = lock_rec_get_next_on_page(lock)) {
- ulint moved = 0;
- const rec_t* rec1;
- const rec_t* rec2;
- const auto type_mode = lock->type_mode;
+ {
+ /* This would likely be too large for a memory transaction. */
+ LockMultiGuard g{lock_sys.rec_hash, id, new_id};
- /* Copy lock requests on user records to new page and
- reset the lock bits on the old */
+ for (lock_t *lock= lock_sys_t::get_first(g.cell1(), id); lock;
+ lock= lock_rec_get_next_on_page(lock))
+ {
+ const rec_t *rec1;
+ const rec_t *rec2;
+ const auto type_mode= lock->type_mode;
- while (moved < num_move) {
- ulint rec1_heap_no;
- ulint rec2_heap_no;
+ /* Copy lock requests on user records to new page and
+ reset the lock bits on the old */
- rec1 = rec_move[moved].old_rec;
- rec2 = rec_move[moved].new_rec;
- ut_ad(!page_rec_is_metadata(rec1));
- ut_ad(!page_rec_is_metadata(rec2));
+ for (ulint moved= 0; moved < num_move; moved++)
+ {
+ ulint rec1_heap_no;
+ ulint rec2_heap_no;
- if (comp) {
- rec1_heap_no = rec_get_heap_no_new(rec1);
- rec2_heap_no = rec_get_heap_no_new(rec2);
+ rec1= rec_move[moved].old_rec;
+ rec2= rec_move[moved].new_rec;
+ ut_ad(!page_rec_is_metadata(rec1));
+ ut_ad(!page_rec_is_metadata(rec2));
- } else {
- rec1_heap_no = rec_get_heap_no_old(rec1);
- rec2_heap_no = rec_get_heap_no_old(rec2);
+ if (comp)
+ {
+ rec1_heap_no= rec_get_heap_no_new(rec1);
+ rec2_heap_no= rec_get_heap_no_new(rec2);
+ }
+ else
+ {
+ rec1_heap_no= rec_get_heap_no_old(rec1);
+ rec2_heap_no= rec_get_heap_no_old(rec2);
- ut_ad(!memcmp(rec1, rec2,
- rec_get_data_size_old(rec2)));
- }
+ ut_ad(!memcmp(rec1, rec2, rec_get_data_size_old(rec2)));
+ }
- if (rec1_heap_no < lock->un_member.rec_lock.n_bits
- && lock_rec_reset_nth_bit(lock, rec1_heap_no)) {
- if (type_mode & LOCK_WAIT) {
- lock_reset_lock_and_trx_wait(lock);
- }
+ trx_t *lock_trx= lock->trx;
+ lock_trx->mutex_lock();
- lock_rec_add_to_queue(
- type_mode, new_block, rec2_heap_no,
- lock->index, lock->trx, FALSE);
+ if (rec1_heap_no < lock->un_member.rec_lock.n_bits &&
+ lock_rec_reset_nth_bit(lock, rec1_heap_no))
+ {
+ if (type_mode & LOCK_WAIT)
+ {
+ ut_ad(lock_trx->lock.wait_lock == lock);
+ lock->type_mode&= ~LOCK_WAIT;
+ }
- rec_move[moved].moved = true;
- }
+ lock_rec_add_to_queue(type_mode, g.cell2(), new_id,
+ new_block->page.frame,
+ rec2_heap_no, lock->index, lock_trx, true);
- moved++;
- }
- }
+ rec_move[moved].moved= true;
+ }
- lock_mutex_exit();
+ lock_trx->mutex_unlock();
+ }
+ }
+ }
#ifdef UNIV_DEBUG_LOCK_VALIDATE
- ut_ad(lock_rec_validate_page(block));
+ ut_ad(lock_rec_validate_page(block));
#endif
}
/*************************************************************//**
@@ -2950,36 +2823,57 @@ lock_update_split_right(
const buf_block_t* right_block, /*!< in: right page */
const buf_block_t* left_block) /*!< in: left page */
{
- ulint heap_no = lock_get_min_heap_no(right_block);
-
- lock_mutex_enter();
-
- /* Move the locks on the supremum of the left page to the supremum
- of the right page */
+ const ulint h= lock_get_min_heap_no(right_block);
+ const page_id_t l{left_block->page.id()};
+ const page_id_t r{right_block->page.id()};
- lock_rec_move(right_block, left_block,
- PAGE_HEAP_NO_SUPREMUM, PAGE_HEAP_NO_SUPREMUM);
+ /* This would likely be too large for a memory transaction. */
+ LockMultiGuard g{lock_sys.rec_hash, l, r};
- /* Inherit the locks to the supremum of left page from the successor
- of the infimum on right page */
+ /* Move the locks on the supremum of the left page to the supremum
+ of the right page */
- lock_rec_inherit_to_gap<true>(left_block, right_block,
- PAGE_HEAP_NO_SUPREMUM, heap_no);
+ lock_rec_move(g.cell2(), *right_block, r, g.cell1(), l,
+ PAGE_HEAP_NO_SUPREMUM, PAGE_HEAP_NO_SUPREMUM);
- lock_mutex_exit();
+ /* Inherit the locks to the supremum of left page from the successor
+ of the infimum on right page */
+ lock_rec_inherit_to_gap<true>(g.cell1(), l, g.cell2(), r,
+ left_block->page.frame, PAGE_HEAP_NO_SUPREMUM,
+ h);
}
void lock_update_node_pointer(const buf_block_t *left_block,
const buf_block_t *right_block)
{
const ulint h= lock_get_min_heap_no(right_block);
+ const page_id_t l{left_block->page.id()};
+ const page_id_t r{right_block->page.id()};
+ LockMultiGuard g{lock_sys.rec_hash, l, r};
- lock_mutex_enter();
- lock_rec_inherit_to_gap(right_block, left_block,
+ lock_rec_inherit_to_gap(g.cell2(), r, g.cell1(), l, right_block->page.frame,
h, PAGE_HEAP_NO_SUPREMUM);
- lock_mutex_exit();
}
+#ifdef UNIV_DEBUG
+static void lock_assert_no_spatial(const page_id_t id)
+{
+ const auto id_fold= id.fold();
+ auto cell= lock_sys.prdt_page_hash.cell_get(id_fold);
+ auto latch= lock_sys_t::hash_table::latch(cell);
+ latch->acquire();
+ /* there should exist no page lock on the left page,
+ otherwise, it will be blocked from merge */
+ ut_ad(!lock_sys_t::get_first(*cell, id));
+ latch->release();
+ cell= lock_sys.prdt_hash.cell_get(id_fold);
+ latch= lock_sys_t::hash_table::latch(cell);
+ latch->acquire();
+ ut_ad(!lock_sys_t::get_first(*cell, id));
+ latch->release();
+}
+#endif
+
/*************************************************************//**
Updates the lock table when a page is merged to the right. */
void
@@ -2995,77 +2889,56 @@ lock_update_merge_right(
page which will be
discarded */
{
- ut_ad(!page_rec_is_metadata(orig_succ));
-
- lock_mutex_enter();
-
- /* Inherit the locks from the supremum of the left page to the
- original successor of infimum on the right page, to which the left
- page was merged */
-
- lock_rec_inherit_to_gap(right_block, left_block,
- page_rec_get_heap_no(orig_succ),
- PAGE_HEAP_NO_SUPREMUM);
+ ut_ad(!page_rec_is_metadata(orig_succ));
- /* Reset the locks on the supremum of the left page, releasing
- waiting transactions */
+ const page_id_t l{left_block->page.id()};
+ const page_id_t r{right_block->page.id()};
+ /* This would likely be too large for a memory transaction. */
+ LockMultiGuard g{lock_sys.rec_hash, l, r};
- lock_rec_reset_and_release_wait_low(
- &lock_sys.rec_hash, left_block, PAGE_HEAP_NO_SUPREMUM);
+ /* Inherit the locks from the supremum of the left page to the
+ original successor of infimum on the right page, to which the left
+ page was merged */
+ lock_rec_inherit_to_gap(g.cell2(), r, g.cell1(), l, right_block->page.frame,
+ page_rec_get_heap_no(orig_succ),
+ PAGE_HEAP_NO_SUPREMUM);
- /* there should exist no page lock on the left page,
- otherwise, it will be blocked from merge */
- ut_ad(!lock_sys.get_first_prdt_page(left_block->page.id()));
+ /* Reset the locks on the supremum of the left page, releasing
+ waiting transactions */
+ lock_rec_reset_and_release_wait(g.cell1(), l, PAGE_HEAP_NO_SUPREMUM);
+ lock_rec_free_all_from_discard_page(l, g.cell1(), lock_sys.rec_hash);
- lock_rec_free_all_from_discard_page(left_block);
-
- lock_mutex_exit();
+ ut_d(lock_assert_no_spatial(l));
}
-/*************************************************************//**
-Updates the lock table when the root page is copied to another in
-btr_root_raise_and_insert. Note that we leave lock structs on the
+/** Update locks when the root page is copied to another in
+btr_root_raise_and_insert(). Note that we leave lock structs on the
root page, even though they do not make sense on other than leaf
pages: the reason is that in a pessimistic update the infimum record
of the root page will act as a dummy carrier of the locks of the record
to be updated. */
-void
-lock_update_root_raise(
-/*===================*/
- const buf_block_t* block, /*!< in: index page to which copied */
- const buf_block_t* root) /*!< in: root page */
+void lock_update_root_raise(const buf_block_t &block, const page_id_t root)
{
- lock_mutex_enter();
-
- /* Move the locks on the supremum of the root to the supremum
- of block */
-
- lock_rec_move(block, root,
- PAGE_HEAP_NO_SUPREMUM, PAGE_HEAP_NO_SUPREMUM);
- lock_mutex_exit();
+ const page_id_t id{block.page.id()};
+ /* This would likely be too large for a memory transaction. */
+ LockMultiGuard g{lock_sys.rec_hash, id, root};
+ /* Move the locks on the supremum of the root to the supremum of block */
+ lock_rec_move(g.cell1(), block, id, g.cell2(), root,
+ PAGE_HEAP_NO_SUPREMUM, PAGE_HEAP_NO_SUPREMUM);
}
-/*************************************************************//**
-Updates the lock table when a page is copied to another and the original page
-is removed from the chain of leaf pages, except if page is the root! */
-void
-lock_update_copy_and_discard(
-/*=========================*/
- const buf_block_t* new_block, /*!< in: index page to
- which copied */
- const buf_block_t* block) /*!< in: index page;
- NOT the root! */
+/** Update the lock table when a page is copied to another.
+@param new_block the target page
+@param old old page (not index root page) */
+void lock_update_copy_and_discard(const buf_block_t &new_block, page_id_t old)
{
- lock_mutex_enter();
-
- /* Move the locks on the supremum of the old page to the supremum
- of new_page */
-
- lock_rec_move(new_block, block,
- PAGE_HEAP_NO_SUPREMUM, PAGE_HEAP_NO_SUPREMUM);
- lock_rec_free_all_from_discard_page(block);
-
- lock_mutex_exit();
+ const page_id_t id{new_block.page.id()};
+ /* This would likely be too large for a memory transaction. */
+ LockMultiGuard g{lock_sys.rec_hash, id, old};
+ /* Move the locks on the supremum of the old page to the supremum of new */
+ lock_rec_move(g.cell1(), new_block, id, g.cell2(), old,
+ PAGE_HEAP_NO_SUPREMUM, PAGE_HEAP_NO_SUPREMUM);
+ lock_rec_free_all_from_discard_page(old, g.cell2(), lock_sys.rec_hash);
}
/*************************************************************//**
@@ -3076,69 +2949,58 @@ lock_update_split_left(
const buf_block_t* right_block, /*!< in: right page */
const buf_block_t* left_block) /*!< in: left page */
{
- ulint heap_no = lock_get_min_heap_no(right_block);
-
- lock_mutex_enter();
-
- /* Inherit the locks to the supremum of the left page from the
- successor of the infimum on the right page */
-
- lock_rec_inherit_to_gap<true>(left_block, right_block,
- PAGE_HEAP_NO_SUPREMUM, heap_no);
-
- lock_mutex_exit();
-}
-
-/*************************************************************//**
-Updates the lock table when a page is merged to the left. */
-void
-lock_update_merge_left(
-/*===================*/
- const buf_block_t* left_block, /*!< in: left page to
- which merged */
- const rec_t* orig_pred, /*!< in: original predecessor
- of supremum on the left page
- before merge */
- const buf_block_t* right_block) /*!< in: merged index page
- which will be discarded */
-{
- const rec_t* left_next_rec;
-
- ut_ad(left_block->frame == page_align(orig_pred));
-
- lock_mutex_enter();
-
- left_next_rec = page_rec_get_next_const(orig_pred);
-
- if (!page_rec_is_supremum(left_next_rec)) {
-
- /* Inherit the locks on the supremum of the left page to the
- first record which was moved from the right page */
-
- lock_rec_inherit_to_gap(left_block, left_block,
- page_rec_get_heap_no(left_next_rec),
- PAGE_HEAP_NO_SUPREMUM);
-
- /* Reset the locks on the supremum of the left page,
- releasing waiting transactions */
-
- lock_rec_reset_and_release_wait_low(
- &lock_sys.rec_hash, left_block, PAGE_HEAP_NO_SUPREMUM);
- }
-
- /* Move the locks from the supremum of right page to the supremum
- of the left page */
-
- lock_rec_move(left_block, right_block,
- PAGE_HEAP_NO_SUPREMUM, PAGE_HEAP_NO_SUPREMUM);
+ ulint h= lock_get_min_heap_no(right_block);
+ const page_id_t l{left_block->page.id()};
+ const page_id_t r{right_block->page.id()};
+ LockMultiGuard g{lock_sys.rec_hash, l, r};
+ /* Inherit the locks to the supremum of the left page from the
+ successor of the infimum on the right page */
+ lock_rec_inherit_to_gap<true>(g.cell1(), l, g.cell2(), r,
+ left_block->page.frame, PAGE_HEAP_NO_SUPREMUM,
+ h);
+}
+
+/** Update the lock table when a page is merged to the left.
+@param left left page
+@param orig_pred original predecessor of supremum on the left page before merge
+@param right merged, to-be-discarded right page */
+void lock_update_merge_left(const buf_block_t& left, const rec_t *orig_pred,
+ const page_id_t right)
+{
+ ut_ad(left.page.frame == page_align(orig_pred));
+
+ const page_id_t l{left.page.id()};
+ const rec_t *left_next_rec= page_rec_get_next_const(orig_pred);
+ if (UNIV_UNLIKELY(!left_next_rec))
+ {
+ ut_ad("corrupted page" == 0);
+ return;
+ }
- /* there should exist no page lock on the right page,
- otherwise, it will be blocked from merge */
- ut_ad(!lock_sys.get_first_prdt_page(right_block->page.id()));
+ /* This would likely be too large for a memory transaction. */
+ LockMultiGuard g{lock_sys.rec_hash, l, right};
+ if (!page_rec_is_supremum(left_next_rec))
+ {
+ /* Inherit the locks on the supremum of the left page to the
+ first record which was moved from the right page */
+ lock_rec_inherit_to_gap(g.cell1(), l, g.cell1(), l, left.page.frame,
+ page_rec_get_heap_no(left_next_rec),
+ PAGE_HEAP_NO_SUPREMUM);
+
+ /* Reset the locks on the supremum of the left page,
+ releasing waiting transactions */
+ lock_rec_reset_and_release_wait(g.cell1(), l, PAGE_HEAP_NO_SUPREMUM);
+ }
- lock_rec_free_all_from_discard_page(right_block);
+ /* Move the locks from the supremum of right page to the supremum
+ of the left page */
+ lock_rec_move(g.cell1(), left, l, g.cell2(), right,
+ PAGE_HEAP_NO_SUPREMUM, PAGE_HEAP_NO_SUPREMUM);
+ lock_rec_free_all_from_discard_page(right, g.cell2(), lock_sys.rec_hash);
- lock_mutex_exit();
+ /* there should exist no page lock on the right page,
+ otherwise, it will be blocked from merge */
+ ut_d(lock_assert_no_spatial(right));
}
/*************************************************************//**
@@ -3147,9 +3009,9 @@ inherited from rec. */
void
lock_rec_reset_and_inherit_gap_locks(
/*=================================*/
- const buf_block_t* heir_block, /*!< in: block containing the
+ const buf_block_t& heir_block, /*!< in: block containing the
record which inherits */
- const buf_block_t* block, /*!< in: block containing the
+ const page_id_t donor, /*!< in: page containing the
record from which inherited;
does NOT reset the locks on
this record */
@@ -3158,13 +3020,12 @@ lock_rec_reset_and_inherit_gap_locks(
ulint heap_no) /*!< in: heap_no of the
donating record */
{
- lock_mutex_enter();
-
- lock_rec_reset_and_release_wait(heir_block, heir_heap_no);
-
- lock_rec_inherit_to_gap(heir_block, block, heir_heap_no, heap_no);
-
- lock_mutex_exit();
+ const page_id_t heir{heir_block.page.id()};
+ /* This is a rare operation and likely too large for a memory transaction. */
+ LockMultiGuard g{lock_sys.rec_hash, heir, donor};
+ lock_rec_reset_and_release_wait(g.cell1(), heir, heir_heap_no);
+ lock_rec_inherit_to_gap(g.cell1(), heir, g.cell2(), donor,
+ heir_block.page.frame, heir_heap_no, heap_no);
}
/*************************************************************//**
@@ -3179,16 +3040,16 @@ lock_update_discard(
const buf_block_t* block) /*!< in: index page
which will be discarded */
{
- const page_t* page = block->frame;
+ const page_t* page = block->page.frame;
const rec_t* rec;
ulint heap_no;
+ const page_id_t heir(heir_block->page.id());
const page_id_t page_id(block->page.id());
+ /* This would likely be too large for a memory transaction. */
+ LockMultiGuard g{lock_sys.rec_hash, heir, page_id};
- lock_mutex_enter();
-
- if (lock_sys.get_first(page_id)) {
- ut_ad(!lock_sys.get_first_prdt(page_id));
- ut_ad(!lock_sys.get_first_prdt_page(page_id));
+ if (lock_sys_t::get_first(g.cell2(), page_id)) {
+ ut_d(lock_assert_no_spatial(page_id));
/* Inherit all the locks on the page to the record and
reset all the locks on the page */
@@ -3198,11 +3059,13 @@ lock_update_discard(
do {
heap_no = rec_get_heap_no_new(rec);
- lock_rec_inherit_to_gap(heir_block, block,
+ lock_rec_inherit_to_gap(g.cell1(), heir,
+ g.cell2(), page_id,
+ heir_block->page.frame,
heir_heap_no, heap_no);
lock_rec_reset_and_release_wait(
- block, heap_no);
+ g.cell2(), page_id, heap_no);
rec = page + rec_get_next_offs(rec, TRUE);
} while (heap_no != PAGE_HEAP_NO_SUPREMUM);
@@ -3212,26 +3075,35 @@ lock_update_discard(
do {
heap_no = rec_get_heap_no_old(rec);
- lock_rec_inherit_to_gap(heir_block, block,
+ lock_rec_inherit_to_gap(g.cell1(), heir,
+ g.cell2(), page_id,
+ heir_block->page.frame,
heir_heap_no, heap_no);
lock_rec_reset_and_release_wait(
- block, heap_no);
+ g.cell2(), page_id, heap_no);
rec = page + rec_get_next_offs(rec, FALSE);
} while (heap_no != PAGE_HEAP_NO_SUPREMUM);
}
- lock_rec_free_all_from_discard_page_low(page_id,
- &lock_sys.rec_hash);
+ lock_rec_free_all_from_discard_page(page_id, g.cell2(),
+ lock_sys.rec_hash);
} else {
- lock_rec_free_all_from_discard_page_low(page_id,
- &lock_sys.prdt_hash);
- lock_rec_free_all_from_discard_page_low(
- page_id, &lock_sys.prdt_page_hash);
+ const auto fold = page_id.fold();
+ auto cell = lock_sys.prdt_hash.cell_get(fold);
+ auto latch = lock_sys_t::hash_table::latch(cell);
+ latch->acquire();
+ lock_rec_free_all_from_discard_page(page_id, *cell,
+ lock_sys.prdt_hash);
+ latch->release();
+ cell = lock_sys.prdt_page_hash.cell_get(fold);
+ latch = lock_sys_t::hash_table::latch(cell);
+ latch->acquire();
+ lock_rec_free_all_from_discard_page(page_id, *cell,
+ lock_sys.prdt_page_hash);
+ latch->release();
}
-
- lock_mutex_exit();
}
/*************************************************************//**
@@ -3245,7 +3117,7 @@ lock_update_insert(
ulint receiver_heap_no;
ulint donator_heap_no;
- ut_ad(block->frame == page_align(rec));
+ ut_ad(block->page.frame == page_align(rec));
ut_ad(!page_rec_is_metadata(rec));
/* Inherit the gap-locking locks for rec, in gap mode, from the next
@@ -3253,12 +3125,18 @@ lock_update_insert(
if (page_rec_is_comp(rec)) {
receiver_heap_no = rec_get_heap_no_new(rec);
- donator_heap_no = rec_get_heap_no_new(
- page_rec_get_next_low(rec, TRUE));
+ rec = page_rec_get_next_low(rec, TRUE);
+ if (UNIV_UNLIKELY(!rec)) {
+ return;
+ }
+ donator_heap_no = rec_get_heap_no_new(rec);
} else {
receiver_heap_no = rec_get_heap_no_old(rec);
- donator_heap_no = rec_get_heap_no_old(
- page_rec_get_next_low(rec, FALSE));
+ rec = page_rec_get_next_low(rec, FALSE);
+ if (UNIV_UNLIKELY(!rec)) {
+ return;
+ }
+ donator_heap_no = rec_get_heap_no_old(rec);
}
lock_rec_inherit_to_gap_if_gap_lock(
@@ -3273,7 +3151,7 @@ lock_update_delete(
const buf_block_t* block, /*!< in: buffer block containing rec */
const rec_t* rec) /*!< in: the record to be removed */
{
- const page_t* page = block->frame;
+ const page_t* page = block->page.frame;
ulint heap_no;
ulint next_heap_no;
@@ -3292,17 +3170,16 @@ lock_update_delete(
FALSE));
}
- lock_mutex_enter();
+ const page_id_t id{block->page.id()};
+ LockGuard g{lock_sys.rec_hash, id};
/* Let the next record inherit the locks from rec, in gap mode */
- lock_rec_inherit_to_gap(block, block, next_heap_no, heap_no);
+ lock_rec_inherit_to_gap(g.cell(), id, g.cell(), id, block->page.frame,
+ next_heap_no, heap_no);
/* Reset the lock bits on rec and release waiting transactions */
-
- lock_rec_reset_and_release_wait(block, heap_no);
-
- lock_mutex_exit();
+ lock_rec_reset_and_release_wait(g.cell(), id, heap_no);
}
/*********************************************************************//**
@@ -3322,102 +3199,89 @@ lock_rec_store_on_page_infimum(
bits are reset on the
record */
{
- ulint heap_no = page_rec_get_heap_no(rec);
-
- ut_ad(block->frame == page_align(rec));
+ const ulint heap_no= page_rec_get_heap_no(rec);
- lock_mutex_enter();
+ ut_ad(block->page.frame == page_align(rec));
+ const page_id_t id{block->page.id()};
+ ut_d(SCOPE_EXIT(
+ []() { DEBUG_SYNC_C("lock_rec_store_on_page_infimum_end"); }));
- lock_rec_move(block, block, PAGE_HEAP_NO_INFIMUM, heap_no);
-
- lock_mutex_exit();
+ LockGuard g{lock_sys.rec_hash, id};
+ lock_rec_move(g.cell(), *block, id, g.cell(), id,
+ PAGE_HEAP_NO_INFIMUM, heap_no);
}
-/*********************************************************************//**
-Restores the state of explicit lock requests on a single record, where the
-state was stored on the infimum of the page. */
-void
-lock_rec_restore_from_page_infimum(
-/*===============================*/
- const buf_block_t* block, /*!< in: buffer block containing rec */
- const rec_t* rec, /*!< in: record whose lock state
- is restored */
- const buf_block_t* donator)/*!< in: page (rec is not
- necessarily on this page)
- whose infimum stored the lock
- state; lock bits are reset on
- the infimum */
+/** Restore the explicit lock requests on a single record, where the
+state was stored on the infimum of a page.
+@param block buffer block containing rec
+@param rec record whose lock state is restored
+@param donator page (rec is not necessarily on this page)
+whose infimum stored the lock state; lock bits are reset on the infimum */
+void lock_rec_restore_from_page_infimum(const buf_block_t &block,
+ const rec_t *rec, page_id_t donator)
{
- ulint heap_no = page_rec_get_heap_no(rec);
-
- lock_mutex_enter();
-
- lock_rec_move(block, donator, heap_no, PAGE_HEAP_NO_INFIMUM);
-
- lock_mutex_exit();
+ const ulint heap_no= page_rec_get_heap_no(rec);
+ const page_id_t id{block.page.id()};
+ LockMultiGuard g{lock_sys.rec_hash, id, donator};
+ lock_rec_move(g.cell1(), block, id, g.cell2(), donator, heap_no,
+ PAGE_HEAP_NO_INFIMUM);
}
/*========================= TABLE LOCKS ==============================*/
-/** Functor for accessing the embedded node within a table lock. */
-struct TableLockGetNode {
- ut_list_node<lock_t>& operator() (lock_t& elem)
- {
- return(elem.un_member.tab_lock.locks);
- }
-};
-
-/*********************************************************************//**
-Creates a table lock object and adds it as the last in the lock queue
-of the table. Does NOT check for deadlocks or lock compatibility.
-@return own: new lock object */
-UNIV_INLINE
-lock_t*
-lock_table_create(
-/*==============*/
- dict_table_t* table, /*!< in/out: database table
- in dictionary cache */
- unsigned type_mode,/*!< in: lock mode possibly ORed with
- LOCK_WAIT */
- trx_t* trx /*!< in: trx */
-#ifdef WITH_WSREP
- , lock_t* c_lock = NULL /*!< in: conflicting lock */
-#endif
- )
+/**
+Create a table lock, without checking for deadlocks or lock compatibility.
+@param table table on which the lock is created
+@param type_mode lock type and mode
+@param trx transaction
+@param c_lock conflicting lock
+@return the created lock object */
+lock_t *lock_table_create(dict_table_t *table, unsigned type_mode, trx_t *trx,
+ lock_t *c_lock)
{
lock_t* lock;
- ut_ad(table && trx);
- ut_ad(lock_mutex_own());
- ut_ad(trx_mutex_own(trx));
- ut_ad(trx->is_recovered || trx->state == TRX_STATE_ACTIVE);
- ut_ad(!trx->auto_commit || trx->will_lock);
-
- if ((type_mode & LOCK_MODE_MASK) == LOCK_AUTO_INC) {
+ lock_sys.assert_locked(*table);
+ ut_ad(trx->mutex_is_owner());
+ ut_ad(!trx->is_wsrep() || lock_sys.is_writer());
+ ut_ad(trx->state == TRX_STATE_ACTIVE || trx->is_recovered);
+ ut_ad(!trx->is_autocommit_non_locking());
+ /* During CREATE TABLE, we will write to newly created FTS_*_CONFIG
+ on which no lock has been created yet. */
+ ut_ad(!trx->dict_operation_lock_mode
+ || (strstr(table->name.m_name, "/FTS_")
+ && strstr(table->name.m_name, "_CONFIG") + sizeof("_CONFIG")
+ == table->name.m_name + strlen(table->name.m_name) + 1));
+
+ switch (LOCK_MODE_MASK & type_mode) {
+ case LOCK_AUTO_INC:
++table->n_waiting_or_granted_auto_inc_locks;
- }
-
- /* For AUTOINC locking we reuse the lock instance only if
- there is no wait involved else we allocate the waiting lock
- from the transaction lock heap. */
- if (type_mode == LOCK_AUTO_INC) {
+ /* For AUTOINC locking we reuse the lock instance only if
+ there is no wait involved else we allocate the waiting lock
+ from the transaction lock heap. */
+ if (type_mode == LOCK_AUTO_INC) {
+ lock = table->autoinc_lock;
- lock = table->autoinc_lock;
-
- table->autoinc_trx = trx;
-
- ib_vector_push(trx->autoinc_locks, &lock);
-
- } else if (trx->lock.table_cached
- < UT_ARR_SIZE(trx->lock.table_pool)) {
- lock = &trx->lock.table_pool[trx->lock.table_cached++];
- } else {
+ ut_ad(!table->autoinc_trx);
+ table->autoinc_trx = trx;
- lock = static_cast<lock_t*>(
- mem_heap_alloc(trx->lock.lock_heap, sizeof(*lock)));
+ ib_vector_push(trx->autoinc_locks, &lock);
+ goto allocated;
+ }
+ break;
+ case LOCK_X:
+ case LOCK_S:
+ ++table->n_lock_x_or_s;
+ break;
}
+ lock = trx->lock.table_cached < array_elements(trx->lock.table_pool)
+ ? &trx->lock.table_pool[trx->lock.table_cached++]
+ : static_cast<lock_t*>(
+ mem_heap_alloc(trx->lock.lock_heap, sizeof *lock));
+
+allocated:
lock->type_mode = ib_uint32_t(type_mode | LOCK_TABLE);
lock->trx = trx;
@@ -3427,46 +3291,19 @@ lock_table_create(
UT_LIST_ADD_LAST(trx->lock.trx_locks, lock);
-#ifdef WITH_WSREP
- if (c_lock && trx->is_wsrep()) {
- if (wsrep_thd_is_BF(trx->mysql_thd, FALSE)) {
- ut_list_insert(table->locks, c_lock, lock,
- TableLockGetNode());
- if (UNIV_UNLIKELY(wsrep_debug)) {
- wsrep_report_bf_lock_wait(trx->mysql_thd, trx->id);
- wsrep_report_bf_lock_wait(c_lock->trx->mysql_thd, c_lock->trx->id);
- }
- } else {
- ut_list_append(table->locks, lock, TableLockGetNode());
- }
-
- trx_mutex_enter(c_lock->trx);
-
- if (c_lock->trx->lock.que_state == TRX_QUE_LOCK_WAIT) {
- c_lock->trx->lock.was_chosen_as_deadlock_victim = TRUE;
-
- if (UNIV_UNLIKELY(wsrep_debug)) {
- wsrep_report_bf_lock_wait(trx->mysql_thd, trx->id);
- wsrep_report_bf_lock_wait(c_lock->trx->mysql_thd, c_lock->trx->id);
- wsrep_print_wait_locks(c_lock);
- }
-
- /* The lock release will call lock_grant(),
- which would acquire trx->mutex again. */
- trx_mutex_exit(trx);
- lock_cancel_waiting_and_release(
- c_lock->trx->lock.wait_lock);
- trx_mutex_enter(trx);
- }
-
- trx_mutex_exit(c_lock->trx);
- } else
-#endif /* WITH_WSREP */
ut_list_append(table->locks, lock, TableLockGetNode());
if (type_mode & LOCK_WAIT) {
-
- lock_set_lock_and_trx_wait(lock, trx);
+ if (trx->lock.wait_trx) {
+ ut_ad(!c_lock || trx->lock.wait_trx == c_lock->trx);
+ ut_ad(trx->lock.wait_lock);
+ ut_ad((*trx->lock.wait_lock).trx == trx);
+ } else {
+ ut_ad(c_lock);
+ trx->lock.wait_trx = c_lock->trx;
+ ut_ad(!trx->lock.wait_lock);
+ }
+ trx->lock.wait_lock = lock;
}
lock->trx->lock.table_locks.push_back(lock);
@@ -3487,7 +3324,6 @@ lock_table_pop_autoinc_locks(
/*=========================*/
trx_t* trx) /*!< in/out: transaction that owns the AUTOINC locks */
{
- ut_ad(lock_mutex_own());
ut_ad(!ib_vector_is_empty(trx->autoinc_locks));
/* Skip any gaps, gaps are NULL lock entries in the
@@ -3512,21 +3348,20 @@ lock_table_remove_autoinc_lock(
lock_t* lock, /*!< in: table lock */
trx_t* trx) /*!< in/out: transaction that owns the lock */
{
- lock_t* autoinc_lock;
- lint i = ib_vector_size(trx->autoinc_locks) - 1;
+ ut_ad(lock->type_mode == (LOCK_AUTO_INC | LOCK_TABLE));
+ lock_sys.assert_locked(*lock->un_member.tab_lock.table);
+ ut_ad(trx->mutex_is_owner());
- ut_ad(lock_mutex_own());
- ut_ad(lock_get_mode(lock) == LOCK_AUTO_INC);
- ut_ad(lock_get_type_low(lock) & LOCK_TABLE);
- ut_ad(!ib_vector_is_empty(trx->autoinc_locks));
+ auto s = ib_vector_size(trx->autoinc_locks);
+ ut_ad(s);
/* With stored functions and procedures the user may drop
a table within the same "statement". This special case has
to be handled by deleting only those AUTOINC locks that were
held by the table being dropped. */
- autoinc_lock = *static_cast<lock_t**>(
- ib_vector_get(trx->autoinc_locks, i));
+ lock_t* autoinc_lock = *static_cast<lock_t**>(
+ ib_vector_get(trx->autoinc_locks, --s));
/* This is the default fast case. */
@@ -3538,13 +3373,13 @@ lock_table_remove_autoinc_lock(
/* Handle freeing the locks from within the stack. */
- while (--i >= 0) {
+ while (s) {
autoinc_lock = *static_cast<lock_t**>(
- ib_vector_get(trx->autoinc_locks, i));
+ ib_vector_get(trx->autoinc_locks, --s));
if (autoinc_lock == lock) {
void* null_var = NULL;
- ib_vector_set(trx->autoinc_locks, i, &null_var);
+ ib_vector_set(trx->autoinc_locks, s, &null_var);
return;
}
}
@@ -3559,46 +3394,50 @@ Removes a table lock request from the queue and the trx list of locks;
this is a low-level function which does NOT check if waiting requests
can now be granted. */
UNIV_INLINE
-void
+const dict_table_t*
lock_table_remove_low(
/*==================*/
lock_t* lock) /*!< in/out: table lock */
{
+ ut_ad(lock->is_table());
+
trx_t* trx;
dict_table_t* table;
- ut_ad(lock_mutex_own());
-
+ ut_ad(lock->is_table());
trx = lock->trx;
table = lock->un_member.tab_lock.table;
+ lock_sys.assert_locked(*table);
+ ut_ad(trx->mutex_is_owner());
/* Remove the table from the transaction's AUTOINC vector, if
the lock that is being released is an AUTOINC lock. */
- if (lock_get_mode(lock) == LOCK_AUTO_INC) {
+ switch (lock->mode()) {
+ case LOCK_AUTO_INC:
+ ut_ad((table->autoinc_trx == trx) == !lock->is_waiting());
- /* The table's AUTOINC lock can get transferred to
- another transaction before we get here. */
if (table->autoinc_trx == trx) {
table->autoinc_trx = NULL;
- }
-
- /* The locks must be freed in the reverse order from
- the one in which they were acquired. This is to avoid
- traversing the AUTOINC lock vector unnecessarily.
-
- We only store locks that were granted in the
- trx->autoinc_locks vector (see lock_table_create()
- and lock_grant()). Therefore it can be empty and we
- need to check for that. */
-
- if (!lock_get_wait(lock)
- && !ib_vector_is_empty(trx->autoinc_locks)) {
+ /* The locks must be freed in the reverse order from
+ the one in which they were acquired. This is to avoid
+ traversing the AUTOINC lock vector unnecessarily.
+ We only store locks that were granted in the
+ trx->autoinc_locks vector (see lock_table_create()
+ and lock_grant()). */
lock_table_remove_autoinc_lock(lock, trx);
}
- ut_a(table->n_waiting_or_granted_auto_inc_locks > 0);
- table->n_waiting_or_granted_auto_inc_locks--;
+ ut_ad(table->n_waiting_or_granted_auto_inc_locks);
+ --table->n_waiting_or_granted_auto_inc_locks;
+ break;
+ case LOCK_X:
+ case LOCK_S:
+ ut_ad(table->n_lock_x_or_s);
+ --table->n_lock_x_or_s;
+ break;
+ default:
+ break;
}
UT_LIST_REMOVE(trx->lock.trx_locks, lock);
@@ -3606,14 +3445,14 @@ lock_table_remove_low(
MONITOR_INC(MONITOR_TABLELOCK_REMOVED);
MONITOR_DEC(MONITOR_NUM_TABLELOCK);
+ return table;
}
/*********************************************************************//**
Enqueues a waiting request for a table lock which cannot be granted
immediately. Checks for deadlocks.
@retval DB_LOCK_WAIT if the waiting lock was enqueued
-@retval DB_DEADLOCK if this transaction was chosen as the victim
-@retval DB_SUCCESS if the other transaction committed or aborted */
+@retval DB_DEADLOCK if this transaction was chosen as the victim */
static
dberr_t
lock_table_enqueue_waiting(
@@ -3621,75 +3460,30 @@ lock_table_enqueue_waiting(
unsigned mode, /*!< in: lock mode this transaction is
requesting */
dict_table_t* table, /*!< in/out: table */
- que_thr_t* thr /*!< in: query thread */
-#ifdef WITH_WSREP
- , lock_t* c_lock /*!< in: conflicting lock or NULL */
-#endif
-)
+ que_thr_t* thr, /*!< in: query thread */
+ lock_t* c_lock) /*!< in: conflicting lock or NULL */
{
- trx_t* trx;
- lock_t* lock;
-
- ut_ad(lock_mutex_own());
+ lock_sys.assert_locked(*table);
ut_ad(!srv_read_only_mode);
- trx = thr_get_trx(thr);
- ut_ad(trx_mutex_own(trx));
- ut_a(!que_thr_stop(thr));
-
- switch (trx_get_dict_operation(trx)) {
- case TRX_DICT_OP_NONE:
- break;
- case TRX_DICT_OP_TABLE:
- case TRX_DICT_OP_INDEX:
- ib::error() << "A table lock wait happens in a dictionary"
- " operation. Table " << table->name
- << ". " << BUG_REPORT_MSG;
- ut_ad(0);
- }
-
-#ifdef WITH_WSREP
- if (trx->is_wsrep() && trx->lock.was_chosen_as_deadlock_victim) {
- return(DB_DEADLOCK);
- }
-#endif /* WITH_WSREP */
+ trx_t* trx = thr_get_trx(thr);
+ ut_ad(trx->mutex_is_owner());
+ ut_ad(!trx->dict_operation_lock_mode);
/* Enqueue the lock request that will wait to be granted */
- lock = lock_table_create(table, mode | LOCK_WAIT, trx
-#ifdef WITH_WSREP
- , c_lock
-#endif
- );
-
- const trx_t* victim_trx =
- DeadlockChecker::check_and_resolve(lock, trx);
-
- if (victim_trx != 0) {
- ut_ad(victim_trx == trx);
-
- /* The order here is important, we don't want to
- lose the state of the lock before calling remove. */
- lock_table_remove_low(lock);
- lock_reset_lock_and_trx_wait(lock);
-
- return(DB_DEADLOCK);
-
- } else if (trx->lock.wait_lock == NULL) {
- /* Deadlock resolution chose another transaction as a victim,
- and we accidentally got our lock granted! */
-
- return(DB_SUCCESS);
- }
-
- trx->lock.que_state = TRX_QUE_LOCK_WAIT;
-
- trx->lock.wait_started = time(NULL);
- trx->lock.was_chosen_as_deadlock_victim = false;
-
- ut_a(que_thr_stop(thr));
+ lock_table_create(table, mode | LOCK_WAIT, trx, c_lock);
+
+ trx->lock.wait_thr = thr;
+ /* Apart from Galera, only transactions that have waiting lock
+ may be chosen as deadlock victims. Only one lock can be waited for at a
+ time, and a transaction is associated with a single thread. That is why
+ there must not be waiting lock requests if the transaction is deadlock
+ victim and it is not WSREP. Galera transaction abort can be invoked
+ from MDL acquisition code when the transaction does not have waiting
+ lock, that's why we check only deadlock victim bit here. */
+ ut_ad(!(trx->lock.was_chosen_as_deadlock_victim & 1));
MONITOR_INC(MONITOR_TABLELOCK_WAIT);
-
return(DB_LOCK_WAIT);
}
@@ -3709,32 +3503,24 @@ lock_table_other_has_incompatible(
const dict_table_t* table, /*!< in: table */
lock_mode mode) /*!< in: lock mode */
{
- lock_t* lock;
+ lock_sys.assert_locked(*table);
- ut_ad(lock_mutex_own());
+ static_assert(LOCK_IS == 0, "compatibility");
+ static_assert(LOCK_IX == 1, "compatibility");
- for (lock = UT_LIST_GET_LAST(table->locks);
- lock != NULL;
- lock = UT_LIST_GET_PREV(un_member.tab_lock.locks, lock)) {
+ if (UNIV_LIKELY(mode <= LOCK_IX && !table->n_lock_x_or_s)) {
+ return(NULL);
+ }
- if (lock->trx != trx
- && !lock_mode_compatible(lock_get_mode(lock), mode)
- && (wait || !lock_get_wait(lock))) {
+ for (lock_t* lock = UT_LIST_GET_LAST(table->locks);
+ lock;
+ lock = UT_LIST_GET_PREV(un_member.tab_lock.locks, lock)) {
-#ifdef WITH_WSREP
- if (lock->trx->is_wsrep()) {
- if (UNIV_UNLIKELY(wsrep_debug)) {
- ib::info() << "WSREP: table lock abort for table:"
- << table->name;
- ib::info() << " SQL: "
- << wsrep_thd_query(lock->trx->mysql_thd);
- }
- trx_mutex_enter(lock->trx);
- wsrep_kill_victim((trx_t *)trx, (lock_t *)lock);
- trx_mutex_exit(lock->trx);
- }
-#endif /* WITH_WSREP */
+ trx_t* lock_trx = lock->trx;
+ if (lock_trx != trx
+ && !lock_mode_compatible(lock->mode(), mode)
+ && (wait || !lock->is_waiting())) {
return(lock);
}
}
@@ -3742,175 +3528,158 @@ lock_table_other_has_incompatible(
return(NULL);
}
-/*********************************************************************//**
-Locks the specified database table in the mode given. If the lock cannot
-be granted immediately, the query thread is put to wait.
-@return DB_SUCCESS, DB_LOCK_WAIT, or DB_DEADLOCK */
-dberr_t
-lock_table(
-/*=======*/
- unsigned flags, /*!< in: if BTR_NO_LOCKING_FLAG bit is set,
- does nothing */
- dict_table_t* table, /*!< in/out: database table
- in dictionary cache */
- lock_mode mode, /*!< in: lock mode */
- que_thr_t* thr) /*!< in: query thread */
+/** Acquire or enqueue a table lock */
+static dberr_t lock_table_low(dict_table_t *table, lock_mode mode,
+ que_thr_t *thr, trx_t *trx)
{
- trx_t* trx;
- dberr_t err;
- lock_t* wait_for;
-
- ut_ad(table && thr);
-
- /* Given limited visibility of temp-table we can avoid
- locking overhead */
- if ((flags & BTR_NO_LOCKING_FLAG)
- || srv_read_only_mode
- || table->is_temporary()) {
-
- return(DB_SUCCESS);
- }
-
- ut_a(flags == 0);
-
- trx = thr_get_trx(thr);
-
- /* Look for equal or stronger locks the same trx already
- has on the table. No need to acquire the lock mutex here
- because only this transacton can add/access table locks
- to/from trx_t::table_locks. */
+ DBUG_EXECUTE_IF("innodb_table_deadlock", return DB_DEADLOCK;);
+ lock_t *wait_for=
+ lock_table_other_has_incompatible(trx, LOCK_WAIT, table, mode);
+ dberr_t err= DB_SUCCESS;
- if (lock_table_has(trx, table, mode)) {
+ trx->mutex_lock();
- return(DB_SUCCESS);
- }
+ if (wait_for)
+ err= lock_table_enqueue_waiting(mode, table, thr, wait_for);
+ else
+ lock_table_create(table, mode, trx, nullptr);
- /* Read only transactions can write to temp tables, we don't want
- to promote them to RW transactions. Their updates cannot be visible
- to other transactions. Therefore we can keep them out
- of the read views. */
+ trx->mutex_unlock();
- if ((mode == LOCK_IX || mode == LOCK_X)
- && !trx->read_only
- && trx->rsegs.m_redo.rseg == 0) {
+ return err;
+}
- trx_set_rw_mode(trx);
- }
+#ifdef WITH_WSREP
+/** Acquire or enqueue a table lock in Galera replication mode. */
+ATTRIBUTE_NOINLINE
+static dberr_t lock_table_wsrep(dict_table_t *table, lock_mode mode,
+ que_thr_t *thr, trx_t *trx)
+{
+ LockMutexGuard g{SRW_LOCK_CALL};
+ return lock_table_low(table, mode, thr, trx);
+}
+#endif
- lock_mutex_enter();
+/** Acquire a table lock.
+@param table table to be locked
+@param fktable pointer to table, in case of a FOREIGN key check
+@param mode lock mode
+@param thr SQL execution thread
+@retval DB_SUCCESS if the lock was acquired
+@retval DB_DEADLOCK if a deadlock occurred, or fktable && *fktable != table
+@retval DB_LOCK_WAIT if lock_wait() must be invoked */
+dberr_t lock_table(dict_table_t *table, dict_table_t *const*fktable,
+ lock_mode mode, que_thr_t *thr)
+{
+ ut_ad(table);
- DBUG_EXECUTE_IF("fatal-semaphore-timeout",
- { os_thread_sleep(3600000000LL); });
+ if (!fktable && table->is_temporary())
+ return DB_SUCCESS;
- /* We have to check if the new lock is compatible with any locks
- other transactions have in the table lock queue. */
+ ut_ad(fktable || table->get_ref_count() || !table->can_be_evicted);
- wait_for = lock_table_other_has_incompatible(
- trx, LOCK_WAIT, table, mode);
+ trx_t *trx= thr_get_trx(thr);
- trx_mutex_enter(trx);
+ /* Look for equal or stronger locks the same trx already has on the
+ table. No need to acquire LockMutexGuard here because only the
+ thread that is executing a transaction can access trx_t::table_locks. */
+ if (lock_table_has(trx, table, mode) || srv_read_only_mode)
+ return DB_SUCCESS;
- /* Another trx has a request on the table in an incompatible
- mode: this trx may have to wait */
+ if ((mode == LOCK_IX || mode == LOCK_X) &&
+ !trx->read_only && !trx->rsegs.m_redo.rseg)
+ trx_set_rw_mode(trx);
- if (wait_for != NULL) {
- err = lock_table_enqueue_waiting(flags | mode, table,
- thr
#ifdef WITH_WSREP
- , wait_for
+ if (trx->is_wsrep())
+ return lock_table_wsrep(table, mode, thr, trx);
#endif
- );
- } else {
- lock_table_create(table, flags | mode, trx);
-
- ut_a(!flags || mode == LOCK_S || mode == LOCK_X);
-
- err = DB_SUCCESS;
- }
-
- lock_mutex_exit();
-
- trx_mutex_exit(trx);
+ lock_sys.rd_lock(SRW_LOCK_CALL);
+ dberr_t err;
+ if (fktable != nullptr && *fktable != table)
+ err= DB_DEADLOCK;
+ else
+ {
+ table->lock_mutex_lock();
+ err= lock_table_low(table, mode, thr, trx);
+ table->lock_mutex_unlock();
+ }
+ lock_sys.rd_unlock();
- return(err);
+ return err;
}
-/*********************************************************************//**
-Creates a table IX lock object for a resurrected transaction. */
-void
-lock_table_ix_resurrect(
-/*====================*/
- dict_table_t* table, /*!< in/out: table */
- trx_t* trx) /*!< in/out: transaction */
+/** Create a table lock object for a resurrected transaction.
+@param table table to be X-locked
+@param trx transaction
+@param mode LOCK_X or LOCK_IX */
+void lock_table_resurrect(dict_table_t *table, trx_t *trx, lock_mode mode)
{
- ut_ad(trx->is_recovered);
+ ut_ad(trx->is_recovered);
+ ut_ad(mode == LOCK_X || mode == LOCK_IX);
- if (lock_table_has(trx, table, LOCK_IX)) {
- return;
- }
+ if (lock_table_has(trx, table, mode))
+ return;
- lock_mutex_enter();
-
- /* We have to check if the new lock is compatible with any locks
- other transactions have in the table lock queue. */
-
- ut_ad(!lock_table_other_has_incompatible(
- trx, LOCK_WAIT, table, LOCK_IX));
+ {
+ /* This is executed at server startup while no connections
+    are allowed. Do not bother with lock elision. */
+ LockMutexGuard g{SRW_LOCK_CALL};
+ ut_ad(!lock_table_other_has_incompatible(trx, LOCK_WAIT, table, mode));
- trx_mutex_enter(trx);
- lock_table_create(table, LOCK_IX, trx);
- lock_mutex_exit();
- trx_mutex_exit(trx);
+ trx->mutex_lock();
+ lock_table_create(table, mode, trx);
+ }
+ trx->mutex_unlock();
}
-/*********************************************************************//**
-Checks if a waiting table lock request still has to wait in a queue.
-@return TRUE if still has to wait */
-static
-bool
-lock_table_has_to_wait_in_queue(
-/*============================*/
- const lock_t* wait_lock) /*!< in: waiting table lock */
+/** Find a lock that a waiting table lock request still has to wait for. */
+static const lock_t *lock_table_has_to_wait_in_queue(const lock_t *wait_lock)
{
- const dict_table_t* table;
- const lock_t* lock;
+ ut_ad(wait_lock->is_waiting());
+ ut_ad(wait_lock->is_table());
- ut_ad(lock_mutex_own());
- ut_ad(lock_get_wait(wait_lock));
+ dict_table_t *table= wait_lock->un_member.tab_lock.table;
+ lock_sys.assert_locked(*table);
- table = wait_lock->un_member.tab_lock.table;
+ static_assert(LOCK_IS == 0, "compatibility");
+ static_assert(LOCK_IX == 1, "compatibility");
- for (lock = UT_LIST_GET_FIRST(table->locks);
- lock != wait_lock;
- lock = UT_LIST_GET_NEXT(un_member.tab_lock.locks, lock)) {
+ if (UNIV_LIKELY(wait_lock->mode() <= LOCK_IX && !table->n_lock_x_or_s))
+ return nullptr;
- if (lock_has_to_wait(wait_lock, lock)) {
+ for (const lock_t *lock= UT_LIST_GET_FIRST(table->locks); lock != wait_lock;
+ lock= UT_LIST_GET_NEXT(un_member.tab_lock.locks, lock))
+ if (lock_has_to_wait(wait_lock, lock))
+ return lock;
- return(true);
- }
- }
-
- return(false);
+ return nullptr;
}
/*************************************************************//**
Removes a table lock request, waiting or granted, from the queue and grants
locks to other transactions in the queue, if they now are entitled to a
-lock. */
-static
-void
-lock_table_dequeue(
-/*===============*/
- lock_t* in_lock)/*!< in/out: table lock object; transactions waiting
- behind will get their lock requests granted, if
- they are now qualified to it */
+lock.
+@param[in,out] in_lock table lock
+@param[in] owns_wait_mutex whether lock_sys.wait_mutex is held */
+static void lock_table_dequeue(lock_t *in_lock, bool owns_wait_mutex)
{
- ut_ad(lock_mutex_own());
- ut_a(lock_get_type_low(in_lock) == LOCK_TABLE);
-
+#ifdef SAFE_MUTEX
+ ut_ad(owns_wait_mutex == mysql_mutex_is_owner(&lock_sys.wait_mutex));
+#endif
+ ut_ad(in_lock->trx->mutex_is_owner());
lock_t* lock = UT_LIST_GET_NEXT(un_member.tab_lock.locks, in_lock);
- lock_table_remove_low(in_lock);
+ const dict_table_t* table = lock_table_remove_low(in_lock);
+
+ static_assert(LOCK_IS == 0, "compatibility");
+ static_assert(LOCK_IX == 1, "compatibility");
+
+ if (UNIV_LIKELY(in_lock->mode() <= LOCK_IX && !table->n_lock_x_or_s)) {
+ return;
+ }
+
+ bool acquired = false;
/* Check if waiting locks in the queue can now be granted: grant
locks if there are no conflicting locks ahead. */
@@ -3918,135 +3687,121 @@ lock_table_dequeue(
for (/* No op */;
lock != NULL;
lock = UT_LIST_GET_NEXT(un_member.tab_lock.locks, lock)) {
+ if (!lock->is_waiting()) {
+ continue;
+ }
- if (lock_get_wait(lock)
- && !lock_table_has_to_wait_in_queue(lock)) {
+ if (!owns_wait_mutex) {
+ mysql_mutex_lock(&lock_sys.wait_mutex);
+ acquired = owns_wait_mutex = true;
+ }
+
+ ut_ad(lock->trx->lock.wait_trx);
+ ut_ad(lock->trx->lock.wait_lock);
+ if (const lock_t* c = lock_table_has_to_wait_in_queue(lock)) {
+ trx_t* c_trx = c->trx;
+ lock->trx->lock.wait_trx = c_trx;
+ if (c_trx->lock.wait_trx
+ && innodb_deadlock_detect
+ && Deadlock::to_check.emplace(c_trx).second) {
+ Deadlock::to_be_checked = true;
+ }
+ } else {
/* Grant the lock */
ut_ad(in_lock->trx != lock->trx);
+ in_lock->trx->mutex_unlock();
lock_grant(lock);
+ in_lock->trx->mutex_lock();
}
}
-}
-
-/** Sets a lock on a table based on the given mode.
-@param[in] table table to lock
-@param[in,out] trx transaction
-@param[in] mode LOCK_X or LOCK_S
-@return error code or DB_SUCCESS. */
-dberr_t
-lock_table_for_trx(
- dict_table_t* table,
- trx_t* trx,
- enum lock_mode mode)
-{
- mem_heap_t* heap;
- que_thr_t* thr;
- dberr_t err;
- sel_node_t* node;
- heap = mem_heap_create(512);
-
- node = sel_node_create(heap);
- thr = pars_complete_graph_for_exec(node, trx, heap, NULL);
- thr->graph->state = QUE_FORK_ACTIVE;
- /* We use the select query graph as the dummy graph needed
- in the lock module call */
+ if (acquired) {
+ mysql_mutex_unlock(&lock_sys.wait_mutex);
+ }
+}
- thr = static_cast<que_thr_t*>(
- que_fork_get_first_thr(
- static_cast<que_fork_t*>(que_node_get_parent(thr))));
- thr->start_running();
+/** Sets a lock on a table based on the given mode.
+@param table table to lock
+@param trx transaction
+@param mode LOCK_X or LOCK_S
+@param no_wait whether to skip handling DB_LOCK_WAIT
+@return error code */
+dberr_t lock_table_for_trx(dict_table_t *table, trx_t *trx, lock_mode mode,
+ bool no_wait)
+{
+ mem_heap_t *heap= mem_heap_create(512);
+ sel_node_t *node= sel_node_create(heap);
+ que_thr_t *thr= pars_complete_graph_for_exec(node, trx, heap, nullptr);
+ thr->graph->state= QUE_FORK_ACTIVE;
+
+ thr= static_cast<que_thr_t*>
+ (que_fork_get_first_thr(static_cast<que_fork_t*>
+ (que_node_get_parent(thr))));
run_again:
- thr->run_node = thr;
- thr->prev_node = thr->common.parent;
-
- err = lock_table(0, table, mode, thr);
-
- trx->error_state = err;
-
- if (UNIV_LIKELY(err == DB_SUCCESS)) {
- thr->stop_no_error();
- } else {
- que_thr_stop_for_mysql(thr);
-
- if (row_mysql_handle_errors(&err, trx, thr, NULL)) {
- goto run_again;
- }
- }
+ thr->run_node= thr;
+ thr->prev_node= thr->common.parent;
+ dberr_t err= lock_table(table, nullptr, mode, thr);
+
+ switch (err) {
+ case DB_SUCCESS:
+ break;
+ case DB_LOCK_WAIT:
+ if (no_wait)
+ {
+ lock_sys.cancel_lock_wait_for_trx(trx);
+ break;
+ }
+ /* fall through */
+ default:
+ trx->error_state= err;
+ if (row_mysql_handle_errors(&err, trx, thr, nullptr))
+ goto run_again;
+ }
- que_graph_free(thr->graph);
- trx->op_info = "";
+ que_graph_free(thr->graph);
+ trx->op_info= "";
- return(err);
+ return err;
}
-/*=========================== LOCK RELEASE ==============================*/
-static
-void
-lock_grant_and_move_on_rec(
- lock_t* first_lock,
- ulint heap_no)
+/** Exclusively lock the data dictionary tables.
+@param trx dictionary transaction
+@return error code
+@retval DB_SUCCESS on success */
+dberr_t lock_sys_tables(trx_t *trx)
{
- lock_t* lock;
- const page_id_t page_id(first_lock->un_member.rec_lock.page_id);
- const ulint rec_fold= page_id.fold();
- lock_t* previous = static_cast<lock_t*>(
- lock_sys.rec_hash.array[lock_sys.hash(page_id)]
- .node);
- if (previous == NULL) {
- return;
- }
- if (previous == first_lock) {
- lock = previous;
- } else {
- while (previous->hash &&
- previous->hash != first_lock) {
- previous = previous->hash;
- }
- lock = previous->hash;
- }
- /* Grant locks if there are no conflicting locks ahead.
- Move granted locks to the head of the list. */
- while (lock) {
- ut_ad(!lock->trx->is_wsrep());
- /* If the lock is a wait lock on this page, and it does not need to wait. */
- if (lock->un_member.rec_lock.page_id == page_id
- && lock_rec_get_nth_bit(lock, heap_no)
- && lock_get_wait(lock)
- && !lock_rec_has_to_wait_in_queue(lock)) {
-
- lock_grant(lock);
-
- if (previous != NULL) {
- /* Move the lock to the head of the list. */
- HASH_GET_NEXT(hash, previous) = HASH_GET_NEXT(hash, lock);
- lock_rec_insert_to_head(lock, rec_fold);
- } else {
- /* Already at the head of the list. */
- previous = lock;
- }
- /* Move on to the next lock. */
- lock = static_cast<lock_t *>(HASH_GET_NEXT(hash, previous));
- } else {
- previous = lock;
- lock = static_cast<lock_t *>(HASH_GET_NEXT(hash, lock));
- }
- }
+ dberr_t err;
+ if (!(err= lock_table_for_trx(dict_sys.sys_tables, trx, LOCK_X)) &&
+ !(err= lock_table_for_trx(dict_sys.sys_columns, trx, LOCK_X)) &&
+ !(err= lock_table_for_trx(dict_sys.sys_indexes, trx, LOCK_X)) &&
+ !(err= lock_table_for_trx(dict_sys.sys_fields, trx, LOCK_X)))
+ {
+ if (dict_sys.sys_foreign)
+ err= lock_table_for_trx(dict_sys.sys_foreign, trx, LOCK_X);
+ if (!err && dict_sys.sys_foreign_cols)
+ err= lock_table_for_trx(dict_sys.sys_foreign_cols, trx, LOCK_X);
+ if (!err && dict_sys.sys_virtual)
+ err= lock_table_for_trx(dict_sys.sys_virtual, trx, LOCK_X);
+ }
+ return err;
}
+/*=========================== LOCK RELEASE ==============================*/
+
/*************************************************************//**
Removes a granted record lock of a transaction from the queue and grants
locks to other transactions waiting in the queue if they now are entitled
to a lock. */
+TRANSACTIONAL_TARGET
void
lock_rec_unlock(
/*============*/
trx_t* trx, /*!< in/out: transaction that has
set a record lock */
- const buf_block_t* block, /*!< in: buffer block containing rec */
+ const page_id_t id, /*!< in: page containing rec */
const rec_t* rec, /*!< in: record */
lock_mode lock_mode)/*!< in: LOCK_S or LOCK_X */
{
@@ -4056,31 +3811,26 @@ lock_rec_unlock(
ut_ad(trx);
ut_ad(rec);
- ut_ad(block->frame == page_align(rec));
ut_ad(!trx->lock.wait_lock);
ut_ad(trx_state_eq(trx, TRX_STATE_ACTIVE));
ut_ad(!page_rec_is_metadata(rec));
heap_no = page_rec_get_heap_no(rec);
- lock_mutex_enter();
- trx_mutex_enter(trx);
+ LockGuard g{lock_sys.rec_hash, id};
- first_lock = lock_rec_get_first(&lock_sys.rec_hash, block, heap_no);
+ first_lock = lock_sys_t::get_first(g.cell(), id, heap_no);
/* Find the last lock with the same lock_mode and transaction
on the record. */
for (lock = first_lock; lock != NULL;
lock = lock_rec_get_next(heap_no, lock)) {
- if (lock->trx == trx && lock_get_mode(lock) == lock_mode) {
+ if (lock->trx == trx && lock->mode() == lock_mode) {
goto released;
}
}
- lock_mutex_exit();
- trx_mutex_exit(trx);
-
{
ib::error err;
err << "Unlock row could not find a " << lock_mode
@@ -4095,232 +3845,392 @@ lock_rec_unlock(
return;
released:
- ut_a(!lock_get_wait(lock));
- lock_rec_reset_nth_bit(lock, heap_no);
+ ut_a(!lock->is_waiting());
+ {
+ TMTrxGuard tg{*trx};
+ lock_rec_reset_nth_bit(lock, heap_no);
+ }
- if (innodb_lock_schedule_algorithm
- == INNODB_LOCK_SCHEDULE_ALGORITHM_FCFS ||
- thd_is_replication_slave_thread(lock->trx->mysql_thd)) {
+ /* Check if we can now grant waiting lock requests */
- /* Check if we can now grant waiting lock requests */
+ for (lock = first_lock; lock != NULL;
+ lock = lock_rec_get_next(heap_no, lock)) {
+ if (!lock->is_waiting()) {
+ continue;
+ }
+ mysql_mutex_lock(&lock_sys.wait_mutex);
+ ut_ad(lock->trx->lock.wait_trx);
+ ut_ad(lock->trx->lock.wait_lock);
- for (lock = first_lock; lock != NULL;
- lock = lock_rec_get_next(heap_no, lock)) {
- if (!lock_get_wait(lock)) {
- continue;
- }
- const lock_t* c = lock_rec_has_to_wait_in_queue(lock);
- if (!c) {
- /* Grant the lock */
- ut_ad(trx != lock->trx);
- lock_grant(lock);
- }
+ if (const lock_t* c = lock_rec_has_to_wait_in_queue(g.cell(),
+ lock)) {
+ lock->trx->lock.wait_trx = c->trx;
+ } else {
+ /* Grant the lock */
+ ut_ad(trx != lock->trx);
+ lock_grant(lock);
}
- } else {
- lock_grant_and_move_on_rec(first_lock, heap_no);
+ mysql_mutex_unlock(&lock_sys.wait_mutex);
}
-
- lock_mutex_exit();
- trx_mutex_exit(trx);
}
-#ifdef UNIV_DEBUG
-/*********************************************************************//**
-Check if a transaction that has X or IX locks has set the dict_op
-code correctly. */
-static
-void
-lock_check_dict_lock(
-/*==================*/
- const lock_t* lock) /*!< in: lock to check */
-{
- if (lock_get_type_low(lock) == LOCK_REC) {
- ut_ad(!lock->index->table->is_temporary());
-
- /* Check if the transcation locked a record
- in a system table in X mode. It should have set
- the dict_op code correctly if it did. */
- if (lock->index->table->id < DICT_HDR_FIRST_ID
- && lock_get_mode(lock) == LOCK_X) {
-
- ut_ad(lock_get_mode(lock) != LOCK_IX);
- ut_ad(lock->trx->dict_operation != TRX_DICT_OP_NONE);
- }
- } else {
- ut_ad(lock_get_type_low(lock) & LOCK_TABLE);
+/** Release the explicit locks of a committing transaction,
+and release possible other transactions waiting because of these locks.
+@return whether the operation succeeded */
+TRANSACTIONAL_TARGET static bool lock_release_try(trx_t *trx)
+{
+ /* At this point, trx->lock.trx_locks cannot be modified by other
+ threads, because our transaction has been committed.
+ See the checks and assertions in lock_rec_create_low() and
+ lock_rec_add_to_queue().
+
+ The function lock_table_create() should never be invoked on behalf
+ of a transaction running in another thread. Also there, we will
+ assert that the current transaction be active. */
+ DBUG_ASSERT(trx->state == TRX_STATE_COMMITTED_IN_MEMORY);
+ DBUG_ASSERT(!trx->is_referenced());
+
+ bool all_released= true;
+restart:
+ ulint count= 1000;
+ /* We will not attempt hardware lock elision (memory transaction)
+ here. Both lock_rec_dequeue_from_page() and lock_table_dequeue()
+ would likely lead to a memory transaction due to a system call, to
+ wake up a waiting transaction. */
+ lock_sys.rd_lock(SRW_LOCK_CALL);
+ trx->mutex_lock();
+
+ /* Note: Anywhere else, trx->mutex is not held while acquiring
+ a lock table latch, but here we are following the opposite order.
+ To avoid deadlocks, we only try to acquire the lock table latches
+ but not keep waiting for them. */
- const dict_table_t* table = lock->un_member.tab_lock.table;
- ut_ad(!table->is_temporary());
+ for (lock_t *lock= UT_LIST_GET_LAST(trx->lock.trx_locks); lock; )
+ {
+ ut_ad(lock->trx == trx);
+ lock_t *prev= UT_LIST_GET_PREV(trx_locks, lock);
+ if (!lock->is_table())
+ {
+ ut_ad(!lock->index->table->is_temporary());
+ ut_ad(lock->mode() != LOCK_X ||
+ lock->index->table->id >= DICT_HDR_FIRST_ID ||
+ trx->dict_operation || trx->was_dict_operation);
+ auto &lock_hash= lock_sys.hash_get(lock->type_mode);
+ auto cell= lock_hash.cell_get(lock->un_member.rec_lock.page_id.fold());
+ auto latch= lock_sys_t::hash_table::latch(cell);
+ if (!latch->try_acquire())
+ all_released= false;
+ else
+ {
+ lock_rec_dequeue_from_page(lock, false);
+ latch->release();
+ }
+ }
+ else
+ {
+ dict_table_t *table= lock->un_member.tab_lock.table;
+ ut_ad(!table->is_temporary());
+ ut_ad(table->id >= DICT_HDR_FIRST_ID ||
+ (lock->mode() != LOCK_IX && lock->mode() != LOCK_X) ||
+ trx->dict_operation || trx->was_dict_operation);
+ if (!table->lock_mutex_trylock())
+ all_released= false;
+ else
+ {
+ lock_table_dequeue(lock, false);
+ table->lock_mutex_unlock();
+ }
+ }
- /* Check if the transcation locked a system table
- in IX mode. It should have set the dict_op code
- correctly if it did. */
- if (table->id < DICT_HDR_FIRST_ID
- && (lock_get_mode(lock) == LOCK_X
- || lock_get_mode(lock) == LOCK_IX)) {
+ lock= all_released ? UT_LIST_GET_LAST(trx->lock.trx_locks) : prev;
+ if (!--count)
+ break;
+ }
- ut_ad(lock->trx->dict_operation != TRX_DICT_OP_NONE);
- }
- }
+ lock_sys.rd_unlock();
+ trx->mutex_unlock();
+ if (all_released && !count)
+ goto restart;
+ return all_released;
}
-#endif /* UNIV_DEBUG */
/** Release the explicit locks of a committing transaction,
and release possible other transactions waiting because of these locks. */
-void lock_release(trx_t* trx)
+void lock_release(trx_t *trx)
{
#ifdef UNIV_DEBUG
- std::set<table_id_t> to_evict;
- if (innodb_evict_tables_on_commit_debug && !trx->is_recovered)
-# if 1 /* if dict_stats_exec_sql() were not playing dirty tricks */
- if (!mutex_own(&dict_sys.mutex))
-# else /* this would be more proper way to do it */
- if (!trx->dict_operation_lock_mode && !trx->dict_operation)
-# endif
- for (const auto& p : trx->mod_tables)
- if (!p.first->is_temporary())
- to_evict.emplace(p.first->id);
+ std::set<table_id_t> to_evict;
+ if (innodb_evict_tables_on_commit_debug &&
+ !trx->is_recovered && !dict_sys.locked())
+ for (const auto& p : trx->mod_tables)
+ if (!p.first->is_temporary())
+ to_evict.emplace(p.first->id);
#endif
- ulint count = 0;
- trx_id_t max_trx_id = trx_sys.get_max_trx_id();
-
- lock_mutex_enter();
- ut_ad(!trx_mutex_own(trx));
+ ulint count;
- for (lock_t* lock = UT_LIST_GET_LAST(trx->lock.trx_locks);
- lock != NULL;
- lock = UT_LIST_GET_LAST(trx->lock.trx_locks)) {
-
- ut_d(lock_check_dict_lock(lock));
+ for (count= 5; count--; )
+ if (lock_release_try(trx))
+ goto released;
- if (lock_get_type_low(lock) == LOCK_REC) {
-
- lock_rec_dequeue_from_page(lock);
- } else {
- dict_table_t* table;
+ /* Fall back to acquiring lock_sys.latch in exclusive mode */
+restart:
+ count= 1000;
+ /* There is probably no point to try lock elision here;
+ in lock_release_try() it is different. */
+ lock_sys.wr_lock(SRW_LOCK_CALL);
+ trx->mutex_lock();
- table = lock->un_member.tab_lock.table;
-
- if (lock_get_mode(lock) != LOCK_IS
- && trx->undo_no != 0) {
-
- /* The trx may have modified the table. We
- block the use of the MySQL query cache for
- all currently active transactions. */
+ while (lock_t *lock= UT_LIST_GET_LAST(trx->lock.trx_locks))
+ {
+ ut_ad(lock->trx == trx);
+ if (!lock->is_table())
+ {
+ ut_ad(!lock->index->table->is_temporary());
+ ut_ad(lock->mode() != LOCK_X ||
+ lock->index->table->id >= DICT_HDR_FIRST_ID ||
+ trx->dict_operation || trx->was_dict_operation);
+ lock_rec_dequeue_from_page(lock, false);
+ }
+ else
+ {
+ ut_d(dict_table_t *table= lock->un_member.tab_lock.table);
+ ut_ad(!table->is_temporary());
+ ut_ad(table->id >= DICT_HDR_FIRST_ID ||
+ (lock->mode() != LOCK_IX && lock->mode() != LOCK_X) ||
+ trx->dict_operation || trx->was_dict_operation);
+ lock_table_dequeue(lock, false);
+ }
- table->query_cache_inv_trx_id = max_trx_id;
- }
+ if (!--count)
+ break;
+ }
- lock_table_dequeue(lock);
- }
+ lock_sys.wr_unlock();
+ trx->mutex_unlock();
+ if (!count)
+ goto restart;
- if (count == LOCK_RELEASE_INTERVAL) {
- /* Release the mutex for a while, so that we
- do not monopolize it */
+released:
+ if (UNIV_UNLIKELY(Deadlock::to_be_checked))
+ {
+ mysql_mutex_lock(&lock_sys.wait_mutex);
+ lock_sys.deadlock_check();
+ mysql_mutex_unlock(&lock_sys.wait_mutex);
+ }
- lock_mutex_exit();
+ trx->lock.n_rec_locks= 0;
- lock_mutex_enter();
+#ifdef UNIV_DEBUG
+ if (to_evict.empty())
+ return;
+ dict_sys.lock(SRW_LOCK_CALL);
+ LockMutexGuard g{SRW_LOCK_CALL};
+ for (const table_id_t id : to_evict)
+ if (dict_table_t *table= dict_sys.find_table(id))
+ if (!table->get_ref_count() && !UT_LIST_GET_LEN(table->locks))
+ dict_sys.remove(table, true);
+ dict_sys.unlock();
+#endif
+}
- count = 0;
- }
+/** Release the explicit locks of a committing transaction while
+dict_sys.latch is exclusively locked,
+and release possible other transactions waiting because of these locks. */
+void lock_release_on_drop(trx_t *trx)
+{
+ ut_ad(lock_sys.is_writer());
+ ut_ad(trx->mutex_is_owner());
+ ut_ad(trx->dict_operation);
- ++count;
- }
+ while (lock_t *lock= UT_LIST_GET_LAST(trx->lock.trx_locks))
+ {
+ ut_ad(lock->trx == trx);
+ if (!lock->is_table())
+ {
+ ut_ad(!lock->index->table->is_temporary());
+ ut_ad(lock->mode() != LOCK_X ||
+ lock->index->table->id >= DICT_HDR_FIRST_ID ||
+ trx->dict_operation);
+ lock_rec_dequeue_from_page(lock, false);
+ }
+ else
+ {
+ ut_d(dict_table_t *table= lock->un_member.tab_lock.table);
+ ut_ad(!table->is_temporary());
+ ut_ad(table->id >= DICT_HDR_FIRST_ID ||
+ (lock->mode() != LOCK_IX && lock->mode() != LOCK_X) ||
+ trx->dict_operation);
+ lock_table_dequeue(lock, false);
+ }
+ }
+}
- lock_mutex_exit();
+/** Release non-exclusive locks on XA PREPARE,
+and wake up possible other transactions waiting because of these locks.
+@param trx transaction in XA PREPARE state
+@return whether all locks were released */
+static bool lock_release_on_prepare_try(trx_t *trx)
+{
+ /* At this point, trx->lock.trx_locks can still be modified by other
+ threads to convert implicit exclusive locks into explicit ones.
+
+ The function lock_table_create() should never be invoked on behalf
+ of a transaction that is running in another thread. Also there, we
+ will assert that the current transaction be active. */
+ DBUG_ASSERT(trx->state == TRX_STATE_PREPARED);
+
+ bool all_released= true;
+ lock_sys.rd_lock(SRW_LOCK_CALL);
+ trx->mutex_lock();
+
+ /* Note: Normally, trx->mutex is not held while acquiring
+ a lock table latch, but here we are following the opposite order.
+ To avoid deadlocks, we only try to acquire the lock table latches
+ but not keep waiting for them. */
+
+ for (lock_t *prev, *lock= UT_LIST_GET_LAST(trx->lock.trx_locks); lock;
+ lock= prev)
+ {
+ ut_ad(lock->trx == trx);
+ prev= UT_LIST_GET_PREV(trx_locks, lock);
+ if (!lock->is_table())
+ {
+ ut_ad(!lock->index->table->is_temporary());
+ if (lock->mode() == LOCK_X && !lock->is_gap()) {
+ ut_ad(lock->trx->isolation_level > TRX_ISO_READ_COMMITTED ||
+ /* Insert-intention lock is valid for supremum for isolation
+ level > TRX_ISO_READ_COMMITTED */
+ lock->mode() == LOCK_X ||
+ !lock_rec_get_nth_bit(lock, PAGE_HEAP_NO_SUPREMUM));
+ continue;
+ }
+ auto &lock_hash= lock_sys.hash_get(lock->type_mode);
+ auto cell= lock_hash.cell_get(lock->un_member.rec_lock.page_id.fold());
+ auto latch= lock_sys_t::hash_table::latch(cell);
+ if (latch->try_acquire())
+ {
+ lock_rec_dequeue_from_page(lock, false);
+ latch->release();
+ }
+ else
+ all_released= false;
+ }
+ else
+ {
+ dict_table_t *table= lock->un_member.tab_lock.table;
+ ut_ad(!table->is_temporary());
+ switch (lock->mode()) {
+ case LOCK_IS:
+ case LOCK_S:
+ if (table->lock_mutex_trylock())
+ {
+ lock_table_dequeue(lock, false);
+ table->lock_mutex_unlock();
+ }
+ else
+ all_released= false;
+ break;
+ case LOCK_IX:
+ case LOCK_X:
+ ut_ad(table->id >= DICT_HDR_FIRST_ID || trx->dict_operation);
+ /* fall through */
+ default:
+ break;
+ }
+ }
+ }
-#ifdef UNIV_DEBUG
- if (to_evict.empty()) {
- return;
- }
- mutex_enter(&dict_sys.mutex);
- lock_mutex_enter();
- for (table_id_t id : to_evict) {
- if (dict_table_t *table = dict_table_open_on_id(
- id, TRUE, DICT_TABLE_OP_OPEN_ONLY_IF_CACHED)) {
- if (!table->get_ref_count()
- && !UT_LIST_GET_LEN(table->locks)) {
- dict_sys.remove(table, true);
- }
- }
- }
- lock_mutex_exit();
- mutex_exit(&dict_sys.mutex);
-#endif
+ lock_sys.rd_unlock();
+ trx->mutex_unlock();
+ return all_released;
}
/** Release non-exclusive locks on XA PREPARE,
and release possible other transactions waiting because of these locks. */
void lock_release_on_prepare(trx_t *trx)
{
- ulint count= 0;
- lock_mutex_enter();
- ut_ad(!trx_mutex_own(trx));
+ auto _ = make_scope_exit([trx]() { trx->set_skip_lock_inheritance(); });
- for (lock_t *lock= UT_LIST_GET_LAST(trx->lock.trx_locks); lock; )
+ for (ulint count= 5; count--; )
+ if (lock_release_on_prepare_try(trx))
+ return;
+
+ LockMutexGuard g{SRW_LOCK_CALL};
+ trx->mutex_lock();
+
+ for (lock_t *prev, *lock= UT_LIST_GET_LAST(trx->lock.trx_locks); lock;
+ lock= prev)
{
ut_ad(lock->trx == trx);
-
- if (lock_get_type_low(lock) == LOCK_REC)
+ prev= UT_LIST_GET_PREV(trx_locks, lock);
+ if (!lock->is_table())
{
ut_ad(!lock->index->table->is_temporary());
- if (lock_rec_get_gap(lock) || lock_get_mode(lock) != LOCK_X)
- lock_rec_dequeue_from_page(lock);
+ if (lock->mode() != LOCK_X || lock->is_gap())
+ lock_rec_dequeue_from_page(lock, false);
else
- {
- ut_ad(trx->dict_operation ||
- lock->index->table->id >= DICT_HDR_FIRST_ID);
ut_ad(lock->trx->isolation_level > TRX_ISO_READ_COMMITTED ||
/* Insert-intention lock is valid for supremum for isolation
level > TRX_ISO_READ_COMMITTED */
- lock_get_mode(lock) == LOCK_X ||
+ lock->mode() == LOCK_X ||
!lock_rec_get_nth_bit(lock, PAGE_HEAP_NO_SUPREMUM));
-retain_lock:
- lock= UT_LIST_GET_PREV(trx_locks, lock);
- continue;
- }
}
else
{
- ut_ad(lock_get_type_low(lock) & LOCK_TABLE);
ut_d(dict_table_t *table= lock->un_member.tab_lock.table);
ut_ad(!table->is_temporary());
-
- switch (lock_get_mode(lock)) {
+ switch (lock->mode()) {
case LOCK_IS:
case LOCK_S:
- lock_table_dequeue(lock);
+ lock_table_dequeue(lock, false);
break;
case LOCK_IX:
case LOCK_X:
ut_ad(table->id >= DICT_HDR_FIRST_ID || trx->dict_operation);
/* fall through */
default:
- goto retain_lock;
+ break;
}
}
+ }
- if (++count == LOCK_RELEASE_INTERVAL)
- {
- lock_mutex_exit();
- count= 0;
- lock_mutex_enter();
- }
+ trx->mutex_unlock();
+}
+
+/** Release locks on a table whose creation is being rolled back */
+ATTRIBUTE_COLD
+void lock_release_on_rollback(trx_t *trx, dict_table_t *table)
+{
+ trx->mod_tables.erase(table);
- lock= UT_LIST_GET_LAST(trx->lock.trx_locks);
+  /* This is very rarely executed code, in the rare case that a
+ CREATE TABLE operation is being rolled back. Theoretically,
+ we might try to remove the locks in multiple memory transactions. */
+ lock_sys.wr_lock(SRW_LOCK_CALL);
+ trx->mutex_lock();
+
+ for (lock_t *next, *lock= UT_LIST_GET_FIRST(table->locks); lock; lock= next)
+ {
+ next= UT_LIST_GET_NEXT(un_member.tab_lock.locks, lock);
+ ut_ad(lock->trx == trx);
+ UT_LIST_REMOVE(trx->lock.trx_locks, lock);
+ ut_list_remove(table->locks, lock, TableLockGetNode());
}
- lock_mutex_exit();
+ for (lock_t *p, *lock= UT_LIST_GET_LAST(trx->lock.trx_locks); lock; lock= p)
+ {
+ p= UT_LIST_GET_PREV(trx_locks, lock);
+ ut_ad(lock->trx == trx);
+ if (lock->is_table())
+ ut_ad(lock->un_member.tab_lock.table != table);
+ else if (lock->index->table == table)
+ lock_rec_dequeue_from_page(lock, false);
+ }
- trx->set_skip_lock_inheritance();
+ lock_sys.wr_unlock();
+ trx->mutex_unlock();
}
-/* True if a lock mode is S or X */
-#define IS_LOCK_S_OR_X(lock) \
- (lock_get_mode(lock) == LOCK_S \
- || lock_get_mode(lock) == LOCK_X)
-
/*********************************************************************//**
Removes table locks of the transaction on a table to be dropped. */
static
@@ -4331,38 +4241,24 @@ lock_trx_table_locks_remove(
{
trx_t* trx = lock_to_remove->trx;
- ut_ad(lock_mutex_own());
-
- /* It is safe to read this because we are holding the lock mutex */
- if (!trx->lock.cancel) {
- trx_mutex_enter(trx);
- } else {
- ut_ad(trx_mutex_own(trx));
- }
+ ut_ad(lock_to_remove->is_table());
+ lock_sys.assert_locked(*lock_to_remove->un_member.tab_lock.table);
+ ut_ad(trx->mutex_is_owner());
for (lock_list::iterator it = trx->lock.table_locks.begin(),
end = trx->lock.table_locks.end(); it != end; ++it) {
const lock_t* lock = *it;
ut_ad(!lock || trx == lock->trx);
- ut_ad(!lock || lock_get_type_low(lock) & LOCK_TABLE);
+ ut_ad(!lock || lock->is_table());
ut_ad(!lock || lock->un_member.tab_lock.table);
if (lock == lock_to_remove) {
*it = NULL;
-
- if (!trx->lock.cancel) {
- trx_mutex_exit(trx);
- }
-
return;
}
}
- if (!trx->lock.cancel) {
- trx_mutex_exit(trx);
- }
-
/* Lock must exist in the vector. */
ut_error;
}
@@ -4376,32 +4272,37 @@ static
void
lock_table_print(FILE* file, const lock_t* lock)
{
- ut_ad(lock_mutex_own());
- ut_a(lock_get_type_low(lock) == LOCK_TABLE);
+ lock_sys.assert_locked();
+ ut_a(lock->is_table());
fputs("TABLE LOCK table ", file);
ut_print_name(file, lock->trx,
lock->un_member.tab_lock.table->name.m_name);
- fprintf(file, " trx id " TRX_ID_FMT, trx_get_id_for_print(lock->trx));
+ fprintf(file, " trx id " TRX_ID_FMT, lock->trx->id);
- if (lock_get_mode(lock) == LOCK_S) {
+ switch (auto mode = lock->mode()) {
+ case LOCK_S:
fputs(" lock mode S", file);
- } else if (lock_get_mode(lock) == LOCK_X) {
+ break;
+ case LOCK_X:
ut_ad(lock->trx->id != 0);
fputs(" lock mode X", file);
- } else if (lock_get_mode(lock) == LOCK_IS) {
+ break;
+ case LOCK_IS:
fputs(" lock mode IS", file);
- } else if (lock_get_mode(lock) == LOCK_IX) {
+ break;
+ case LOCK_IX:
ut_ad(lock->trx->id != 0);
fputs(" lock mode IX", file);
- } else if (lock_get_mode(lock) == LOCK_AUTO_INC) {
+ break;
+ case LOCK_AUTO_INC:
fputs(" lock mode AUTO-INC", file);
- } else {
- fprintf(file, " unknown lock mode %lu",
- (ulong) lock_get_mode(lock));
+ break;
+ default:
+ fprintf(file, " unknown lock mode %u", mode);
}
- if (lock_get_wait(lock)) {
+ if (lock->is_waiting()) {
fputs(" waiting", file);
}
@@ -4414,10 +4315,10 @@ lock_table_print(FILE* file, const lock_t* lock)
@param[in,out] mtr mini-transaction for accessing the record */
static void lock_rec_print(FILE* file, const lock_t* lock, mtr_t& mtr)
{
- ut_ad(lock_mutex_own());
- ut_a(lock_get_type_low(lock) == LOCK_REC);
+ ut_ad(!lock->is_table());
- const page_id_t page_id(lock->un_member.rec_lock.page_id);
+ const page_id_t page_id{lock->un_member.rec_lock.page_id};
+ ut_d(lock_sys.hash_get(lock->type_mode).assert_locked(page_id));
fprintf(file, "RECORD LOCKS space id %u page no %u n bits " ULINTPF
" index %s of table ",
@@ -4425,29 +4326,32 @@ static void lock_rec_print(FILE* file, const lock_t* lock, mtr_t& mtr)
lock_rec_get_n_bits(lock),
lock->index->name());
ut_print_name(file, lock->trx, lock->index->table->name.m_name);
- fprintf(file, " trx id " TRX_ID_FMT, trx_get_id_for_print(lock->trx));
+ fprintf(file, " trx id " TRX_ID_FMT, lock->trx->id);
- if (lock_get_mode(lock) == LOCK_S) {
+ switch (lock->mode()) {
+ case LOCK_S:
fputs(" lock mode S", file);
- } else if (lock_get_mode(lock) == LOCK_X) {
+ break;
+ case LOCK_X:
fputs(" lock_mode X", file);
- } else {
+ break;
+ default:
ut_error;
}
- if (lock_rec_get_gap(lock)) {
+ if (lock->is_gap()) {
fputs(" locks gap before rec", file);
}
- if (lock_rec_get_rec_not_gap(lock)) {
+ if (lock->is_record_not_gap()) {
fputs(" locks rec but not gap", file);
}
- if (lock_rec_get_insert_intention(lock)) {
+ if (lock->is_insert_intention()) {
fputs(" insert intention", file);
}
- if (lock_get_wait(lock)) {
+ if (lock->is_waiting()) {
fputs(" waiting", file);
}
@@ -4470,7 +4374,7 @@ static void lock_rec_print(FILE* file, const lock_t* lock, mtr_t& mtr)
fprintf(file, "Record lock, heap no %lu", (ulong) i);
if (block) {
- ut_ad(page_is_leaf(block->frame));
+ ut_ad(page_is_leaf(block->page.frame));
const rec_t* rec;
rec = page_find_rec_with_heap_no(
@@ -4507,12 +4411,13 @@ http://bugs.mysql.com/36942 */
/*********************************************************************//**
Calculates the number of record lock structs in the record lock hash table.
@return number of record locks */
+TRANSACTIONAL_TARGET
static ulint lock_get_n_rec_locks()
{
ulint n_locks = 0;
ulint i;
- ut_ad(lock_mutex_own());
+ lock_sys.assert_locked();
for (i = 0; i < lock_sys.rec_hash.n_cells; i++) {
const lock_t* lock;
@@ -4533,26 +4438,25 @@ static ulint lock_get_n_rec_locks()
/*********************************************************************//**
Prints info of locks for all transactions.
-@return FALSE if not able to obtain lock mutex
-and exits without printing info */
+@return FALSE if not able to acquire lock_sys.latch (and display info)
ibool
lock_print_info_summary(
/*====================*/
FILE* file, /*!< in: file where to print */
- ibool nowait) /*!< in: whether to wait for the lock mutex */
+ ibool nowait) /*!< in: whether to wait for lock_sys.latch */
{
- /* if nowait is FALSE, wait on the lock mutex,
- otherwise return immediately if fail to obtain the
- mutex. */
+ /* Here, lock elision does not make sense, because
+ for the output we are going to invoke system calls,
+ which would interrupt a memory transaction. */
if (!nowait) {
- lock_mutex_enter();
- } else if (lock_mutex_enter_nowait()) {
+ lock_sys.wr_lock(SRW_LOCK_CALL);
+ } else if (!lock_sys.wr_lock_try()) {
fputs("FAIL TO OBTAIN LOCK MUTEX,"
" SKIP LOCK INFO PRINTING\n", file);
return(FALSE);
}
- if (lock_deadlock_found) {
+ if (lock_sys.deadlocks) {
fputs("------------------------\n"
"LATEST DETECTED DEADLOCK\n"
"------------------------\n", file);
@@ -4579,7 +4483,7 @@ lock_print_info_summary(
? (purge_sys.running() ? "running"
: purge_sys.paused() ? "stopped" : "running but idle")
: "disabled",
- size_t{trx_sys.rseg_history_len});
+ trx_sys.history_size_approx());
#ifdef PRINT_NUM_OF_LOCK_STRUCTS
fprintf(file,
@@ -4592,27 +4496,27 @@ lock_print_info_summary(
/** Prints transaction lock wait and MVCC state.
@param[in,out] file file where to print
@param[in] trx transaction
-@param[in] now current time */
-void
-lock_trx_print_wait_and_mvcc_state(FILE* file, const trx_t* trx, time_t now)
+@param[in] now current my_hrtime_coarse() */
+void lock_trx_print_wait_and_mvcc_state(FILE *file, const trx_t *trx,
+ my_hrtime_t now)
{
fprintf(file, "---");
trx_print_latched(file, trx, 600);
trx->read_view.print_limits(file);
- if (trx->lock.que_state == TRX_QUE_LOCK_WAIT) {
-
+ if (const lock_t* wait_lock = trx->lock.wait_lock) {
+ const my_hrtime_t suspend_time= trx->lock.suspend_time;
fprintf(file,
- "------- TRX HAS BEEN WAITING %lu SEC"
+ "------- TRX HAS BEEN WAITING %llu ns"
" FOR THIS LOCK TO BE GRANTED:\n",
- (ulong) difftime(now, trx->lock.wait_started));
+ now.val - suspend_time.val);
- if (lock_get_type_low(trx->lock.wait_lock) == LOCK_REC) {
+ if (!wait_lock->is_table()) {
mtr_t mtr;
- lock_rec_print(file, trx->lock.wait_lock, mtr);
+ lock_rec_print(file, wait_lock, mtr);
} else {
- lock_table_print(file, trx->lock.wait_lock);
+ lock_table_print(file, wait_lock);
}
fprintf(file, "------------------\n");
@@ -4631,15 +4535,13 @@ lock_trx_print_locks(
mtr_t mtr;
uint32_t i= 0;
/* Iterate over the transaction's locks. */
+ lock_sys.assert_locked();
for (lock_t *lock = UT_LIST_GET_FIRST(trx->lock.trx_locks);
lock != NULL;
lock = UT_LIST_GET_NEXT(trx_locks, lock)) {
- if (lock_get_type_low(lock) == LOCK_REC) {
-
+ if (!lock->is_table()) {
lock_rec_print(file, lock, mtr);
} else {
- ut_ad(lock_get_type_low(lock) & LOCK_TABLE);
-
lock_table_print(file, lock);
}
@@ -4657,9 +4559,9 @@ lock_trx_print_locks(
/** Functor to display all transactions */
struct lock_print_info
{
- lock_print_info(FILE* file, time_t now) :
+ lock_print_info(FILE* file, my_hrtime_t now) :
file(file), now(now),
- purge_trx(purge_sys.query ? purge_sys.query->trx : NULL)
+ purge_trx(purge_sys.query ? purge_sys.query->trx : nullptr)
{}
void operator()(const trx_t &trx) const
@@ -4673,27 +4575,24 @@ struct lock_print_info
}
FILE* const file;
- const time_t now;
+ const my_hrtime_t now;
const trx_t* const purge_trx;
};
/*********************************************************************//**
-Prints info of locks for each transaction. This function assumes that the
-caller holds the lock mutex and more importantly it will release the lock
-mutex on behalf of the caller. (This should be fixed in the future). */
+Prints info of locks for each transaction. This function will release
+lock_sys.latch, which the caller must be holding in exclusive mode. */
void
lock_print_info_all_transactions(
/*=============================*/
FILE* file) /*!< in/out: file where to print */
{
- ut_ad(lock_mutex_own());
-
fprintf(file, "LIST OF TRANSACTIONS FOR EACH SESSION:\n");
- trx_sys.trx_list.for_each(lock_print_info(file, time(nullptr)));
- lock_mutex_exit();
+ trx_sys.trx_list.for_each(lock_print_info(file, my_hrtime_coarse()));
+ lock_sys.wr_unlock();
- ut_ad(lock_validate());
+ ut_d(lock_validate());
}
#ifdef UNIV_DEBUG
@@ -4709,7 +4608,7 @@ lock_trx_table_locks_find(
{
bool found = false;
- ut_ad(trx_mutex_own(trx));
+ ut_ad(trx->mutex_is_owner());
for (lock_list::const_iterator it = trx->lock.table_locks.begin(),
end = trx->lock.table_locks.end(); it != end; ++it) {
@@ -4728,7 +4627,7 @@ lock_trx_table_locks_find(
}
ut_a(trx == lock->trx);
- ut_a(lock_get_type_low(lock) & LOCK_TABLE);
+ ut_a(lock->is_table());
ut_a(lock->un_member.tab_lock.table != NULL);
}
@@ -4746,29 +4645,29 @@ lock_table_queue_validate(
{
const lock_t* lock;
- ut_ad(lock_mutex_own());
+ lock_sys.assert_locked(*table);
for (lock = UT_LIST_GET_FIRST(table->locks);
lock != NULL;
lock = UT_LIST_GET_NEXT(un_member.tab_lock.locks, lock)) {
/* lock->trx->state cannot change from or to NOT_STARTED
- while we are holding the lock_sys.mutex. It may change
+ while we are holding the lock_sys.latch. It may change
from ACTIVE or PREPARED to PREPARED or COMMITTED. */
- trx_mutex_enter(lock->trx);
+ lock->trx->mutex_lock();
check_trx_state(lock->trx);
if (lock->trx->state == TRX_STATE_COMMITTED_IN_MEMORY) {
- } else if (!lock_get_wait(lock)) {
+ } else if (!lock->is_waiting()) {
ut_a(!lock_table_other_has_incompatible(
lock->trx, 0, table,
- lock_get_mode(lock)));
+ lock->mode()));
} else {
ut_a(lock_table_has_to_wait_in_queue(lock));
}
ut_a(lock_trx_table_locks_find(lock->trx, lock));
- trx_mutex_exit(lock->trx);
+ lock->trx->mutex_unlock();
}
return(TRUE);
@@ -4783,9 +4682,9 @@ lock_rec_queue_validate(
/*====================*/
bool locked_lock_trx_sys,
/*!< in: if the caller holds
- both the lock mutex and
+ both the lock_sys.latch and
trx_sys_t->lock. */
- const buf_block_t* block, /*!< in: buffer block containing rec */
+ const page_id_t id, /*!< in: page identifier */
const rec_t* rec, /*!< in: record to look at */
const dict_index_t* index, /*!< in: index, or NULL if not known */
const rec_offs* offsets)/*!< in: rec_get_offsets(rec, index) */
@@ -4794,49 +4693,48 @@ lock_rec_queue_validate(
ulint heap_no;
ut_a(rec);
- ut_a(block->frame == page_align(rec));
ut_ad(rec_offs_validate(rec, index, offsets));
ut_ad(!page_rec_is_comp(rec) == !rec_offs_comp(offsets));
ut_ad(page_rec_is_leaf(rec));
- ut_ad(lock_mutex_own() == locked_lock_trx_sys);
ut_ad(!index || dict_index_is_clust(index)
|| !dict_index_is_online_ddl(index));
heap_no = page_rec_get_heap_no(rec);
if (!locked_lock_trx_sys) {
- lock_mutex_enter();
+ lock_sys.wr_lock(SRW_LOCK_CALL);
}
+ hash_cell_t &cell= *lock_sys.rec_hash.cell_get(id.fold());
+ lock_sys.assert_locked(cell);
+
if (!page_rec_is_user_rec(rec)) {
- for (lock = lock_rec_get_first(&lock_sys.rec_hash,
- block, heap_no);
+ for (lock = lock_sys_t::get_first(cell, id, heap_no);
lock != NULL;
lock = lock_rec_get_next_const(heap_no, lock)) {
ut_ad(!index || lock->index == index);
- trx_mutex_enter(lock->trx);
+ lock->trx->mutex_lock();
ut_ad(!lock->trx->read_only
|| !lock->trx->is_autocommit_non_locking());
ut_ad(trx_state_eq(lock->trx,
TRX_STATE_COMMITTED_IN_MEMORY)
- || !lock_get_wait(lock)
- || lock_rec_has_to_wait_in_queue(lock));
- trx_mutex_exit(lock->trx);
+ || !lock->is_waiting()
+ || lock_rec_has_to_wait_in_queue(cell, lock));
+ lock->trx->mutex_unlock();
}
func_exit:
if (!locked_lock_trx_sys) {
- lock_mutex_exit();
+ lock_sys.wr_unlock();
}
return true;
}
ut_ad(page_rec_is_leaf(rec));
- ut_ad(lock_mutex_own());
const trx_id_t impl_trx_id = index && index->is_primary()
? lock_clust_rec_some_has_impl(rec, index, offsets)
@@ -4848,12 +4746,12 @@ func_exit:
/* impl_trx could have been committed before we
acquire its mutex, but not thereafter. */
- mutex_enter(&impl_trx->mutex);
+ impl_trx->mutex_lock();
ut_ad(impl_trx->state != TRX_STATE_NOT_STARTED);
if (impl_trx->state == TRX_STATE_COMMITTED_IN_MEMORY) {
} else if (const lock_t* other_lock
= lock_rec_other_has_expl_req(
- LOCK_S, block, true, heap_no,
+ LOCK_S, cell, id, true, heap_no,
impl_trx)) {
/* The impl_trx is holding an implicit lock on the
given record 'rec'. So there cannot be another
@@ -4881,28 +4779,29 @@ func_exit:
for BF abort or kill victim.
** There should not be two BF transactions waiting for same record lock
*/
- if (other_lock->trx->is_wsrep() && !lock_get_wait(other_lock)) {
+ if (other_lock->trx->is_wsrep() && !other_lock->is_waiting()) {
wsrep_report_bf_lock_wait(impl_trx->mysql_thd, impl_trx->id);
wsrep_report_bf_lock_wait(other_lock->trx->mysql_thd, other_lock->trx->id);
if (!lock_rec_has_expl(LOCK_X | LOCK_REC_NOT_GAP,
- block, heap_no,
+ cell, id, heap_no,
impl_trx)) {
ib::info() << "WSREP impl BF lock conflict";
}
} else
#endif /* WITH_WSREP */
{
- ut_ad(lock_get_wait(other_lock));
+ ut_ad(other_lock->is_waiting());
ut_ad(lock_rec_has_expl(LOCK_X | LOCK_REC_NOT_GAP,
- block, heap_no, impl_trx));
+ cell, id, heap_no,
+ impl_trx));
}
}
- mutex_exit(&impl_trx->mutex);
+ impl_trx->mutex_unlock();
}
- for (lock = lock_rec_get_first(&lock_sys.rec_hash, block, heap_no);
+ for (lock = lock_sys_t::get_first(cell, id, heap_no);
lock != NULL;
lock = lock_rec_get_next_const(heap_no, lock)) {
ut_ad(!lock->trx->read_only
@@ -4913,19 +4812,16 @@ func_exit:
ut_a(lock->index == index);
}
- if (!lock_rec_get_gap(lock) && !lock_get_wait(lock)) {
-
- lock_mode mode;
-
- if (lock_get_mode(lock) == LOCK_S) {
- mode = LOCK_X;
- } else {
- mode = LOCK_S;
- }
+ if (lock->is_waiting()) {
+ ut_a(lock->is_gap()
+ || lock_rec_has_to_wait_in_queue(cell, lock));
+ } else if (!lock->is_gap()) {
+ const lock_mode mode = lock->mode() == LOCK_S
+ ? LOCK_X : LOCK_S;
const lock_t* other_lock
= lock_rec_other_has_expl_req(
- mode, block, false, heap_no,
+ mode, cell, id, false, heap_no,
lock->trx);
#ifdef WITH_WSREP
if (UNIV_UNLIKELY(other_lock && lock->trx->is_wsrep())) {
@@ -4942,26 +4838,17 @@ func_exit:
} else
#endif /* WITH_WSREP */
ut_ad(!other_lock);
- } else if (lock_get_wait(lock) && !lock_rec_get_gap(lock)) {
-
- ut_a(lock_rec_has_to_wait_in_queue(lock));
}
}
- ut_ad(innodb_lock_schedule_algorithm == INNODB_LOCK_SCHEDULE_ALGORITHM_FCFS ||
- lock_queue_validate(lock));
-
goto func_exit;
}
-/*********************************************************************//**
-Validates the record lock queues on a page.
-@return TRUE if ok */
-static
-ibool
-lock_rec_validate_page(
-/*===================*/
- const buf_block_t* block) /*!< in: buffer block */
+/** Validate the record lock queues on a page.
+@param block buffer pool block
+@param latched whether the tablespace latch may be held
+@return true if ok */
+static bool lock_rec_validate_page(const buf_block_t *block, bool latched)
{
const lock_t* lock;
const rec_t* rec;
@@ -4973,15 +4860,17 @@ lock_rec_validate_page(
rec_offs* offsets = offsets_;
rec_offs_init(offsets_);
- lock_mutex_enter();
+ const page_id_t id{block->page.id()};
+
+ LockGuard g{lock_sys.rec_hash, id};
loop:
- lock = lock_sys.get_first(block->page.id());
+ lock = lock_sys_t::get_first(g.cell(), id);
if (!lock) {
goto function_exit;
}
- DBUG_ASSERT(block->page.status != buf_page_t::FREED);
+ DBUG_ASSERT(!block->page.is_freed());
for (i = 0; i < nth_lock; i++) {
@@ -4996,13 +4885,13 @@ loop:
|| !lock->trx->is_autocommit_non_locking());
/* Only validate the record queues when this thread is not
- holding a space->latch. */
- if (!sync_check_find(SYNC_FSP))
+ holding a tablespace latch. */
+ if (!latched)
for (i = nth_bit; i < lock_rec_get_n_bits(lock); i++) {
bool locked = lock_rec_get_nth_bit(lock, i);
if (locked || i == PAGE_HEAP_NO_SUPREMUM) {
- rec = page_find_rec_with_heap_no(block->frame, i);
+ rec = page_find_rec_with_heap_no(block->page.frame, i);
ut_a(rec);
ut_ad(!locked || page_rec_is_leaf(rec));
@@ -5015,7 +4904,7 @@ loop:
offsets = rec_get_offsets(rec, lock->index,
offsets, lock->index->n_core_fields,
ULINT_UNDEFINED, &heap);
- lock_rec_queue_validate(TRUE, block, rec,
+ lock_rec_queue_validate(true, id, rec,
lock->index, offsets);
}
@@ -5031,8 +4920,6 @@ loop:
goto loop;
function_exit:
- lock_mutex_exit();
-
if (heap != NULL) {
mem_heap_free(heap);
}
@@ -5051,7 +4938,7 @@ lock_rec_validate(
page_id_t* limit) /*!< in/out: upper limit of
(space, page_no) */
{
- ut_ad(lock_mutex_own());
+ lock_sys.assert_locked();
for (const lock_t* lock = static_cast<const lock_t*>(
HASH_GET_FIRST(&lock_sys.rec_hash, start));
@@ -5060,7 +4947,7 @@ lock_rec_validate(
ut_ad(!lock->trx->read_only
|| !lock->trx->is_autocommit_non_locking());
- ut_ad(lock_get_type(lock) == LOCK_REC);
+ ut_ad(!lock->is_table());
page_id_t current(lock->un_member.rec_lock.page_id);
@@ -5078,9 +4965,7 @@ Validate a record lock's block */
static void lock_rec_block_validate(const page_id_t page_id)
{
/* The lock and the block that it is referring to may be freed at
- this point. We pass BUF_GET_POSSIBLY_FREED to skip a debug check.
- If the lock exists in lock_rec_validate_page() we assert
- block->page.status != FREED. */
+ this point. */
buf_block_t* block;
mtr_t mtr;
@@ -5097,15 +4982,12 @@ static void lock_rec_block_validate(const page_id_t page_id)
block = buf_page_get_gen(
page_id,
space->zip_size(),
- RW_X_LATCH, NULL,
+ RW_S_LATCH, NULL,
BUF_GET_POSSIBLY_FREED,
- __FILE__, __LINE__, &mtr, &err);
+ &mtr, &err);
- if (block && block->page.status != buf_page_t::FREED) {
- buf_block_dbg_add_level(block, SYNC_NO_ORDER_CHECK);
-
- ut_ad(lock_rec_validate_page(block));
- }
+ ut_ad(!block
+ || lock_rec_validate_page(block, space->is_latched()));
mtr_commit(&mtr);
@@ -5113,65 +4995,48 @@ static void lock_rec_block_validate(const page_id_t page_id)
}
}
-
static my_bool lock_validate_table_locks(rw_trx_hash_element_t *element, void*)
{
- ut_ad(lock_mutex_own());
- mutex_enter(&element->mutex);
+ lock_sys.assert_locked();
+ element->mutex.wr_lock();
if (element->trx)
{
check_trx_state(element->trx);
for (const lock_t *lock= UT_LIST_GET_FIRST(element->trx->lock.trx_locks);
lock != NULL;
lock= UT_LIST_GET_NEXT(trx_locks, lock))
- {
- if (lock_get_type_low(lock) & LOCK_TABLE)
+ if (lock->is_table())
lock_table_queue_validate(lock->un_member.tab_lock.table);
- }
}
- mutex_exit(&element->mutex);
+ element->mutex.wr_unlock();
return 0;
}
-/*********************************************************************//**
-Validates the lock system.
-@return TRUE if ok */
-static
-bool
-lock_validate()
-/*===========*/
+/** Validate the transactional locks. */
+static void lock_validate()
{
- std::set<page_id_t> pages;
-
- lock_mutex_enter();
-
- /* Validate table locks */
- trx_sys.rw_trx_hash.iterate(lock_validate_table_locks);
-
- /* Iterate over all the record locks and validate the locks. We
- don't want to hog the lock_sys_t::mutex. Release it during the
- validation check. */
-
- for (ulint i = 0; i < lock_sys.rec_hash.n_cells; i++) {
- page_id_t limit(0, 0);
-
- while (const lock_t* lock = lock_rec_validate(i, &limit)) {
- if (lock_rec_find_set_bit(lock) == ULINT_UNDEFINED) {
- /* The lock bitmap is empty; ignore it. */
- continue;
- }
- pages.insert(lock->un_member.rec_lock.page_id);
- }
- }
-
- lock_mutex_exit();
+ std::set<page_id_t> pages;
+ {
+ LockMutexGuard g{SRW_LOCK_CALL};
+ /* Validate table locks */
+ trx_sys.rw_trx_hash.iterate(lock_validate_table_locks);
- for (page_id_t page_id : pages) {
- lock_rec_block_validate(page_id);
- }
+ for (ulint i= 0; i < lock_sys.rec_hash.n_cells; i++)
+ {
+ page_id_t limit{0, 0};
+ while (const lock_t *lock= lock_rec_validate(i, &limit))
+ {
+ if (lock_rec_find_set_bit(lock) == ULINT_UNDEFINED)
+ /* The lock bitmap is empty; ignore it. */
+ continue;
+ pages.insert(lock->un_member.rec_lock.page_id);
+ }
+ }
+ }
- return(true);
+ for (page_id_t page_id : pages)
+ lock_rec_block_validate(page_id);
}
#endif /* UNIV_DEBUG */
/*============ RECORD LOCK CHECKS FOR ROW OPERATIONS ====================*/
@@ -5183,11 +5048,10 @@ be suspended for some reason; if not, then puts the transaction and
the query thread to the lock wait state and inserts a waiting request
for a gap x-lock to the lock queue.
@return DB_SUCCESS, DB_LOCK_WAIT, or DB_DEADLOCK */
+TRANSACTIONAL_TARGET
dberr_t
lock_rec_insert_check_and_lock(
/*===========================*/
- ulint flags, /*!< in: if BTR_NO_LOCKING_FLAG bit is
- set, does nothing */
const rec_t* rec, /*!< in: record after which to insert */
buf_block_t* block, /*!< in/out: buffer block of rec */
dict_index_t* index, /*!< in: index */
@@ -5198,137 +5062,96 @@ lock_rec_insert_check_and_lock(
LOCK_GAP type locks from the successor
record */
{
- ut_ad(block->frame == page_align(rec));
- ut_ad(!dict_index_is_online_ddl(index)
- || index->is_primary()
- || (flags & BTR_CREATE_FLAG));
- ut_ad(mtr->is_named_space(index->table->space));
- ut_ad(page_rec_is_leaf(rec));
-
- if (flags & BTR_NO_LOCKING_FLAG) {
-
- return(DB_SUCCESS);
- }
+ ut_ad(block->page.frame == page_align(rec));
+ ut_ad(mtr->is_named_space(index->table->space));
+ ut_ad(page_is_leaf(block->page.frame));
+ ut_ad(!index->table->is_temporary());
- ut_ad(!index->table->is_temporary());
- ut_ad(page_is_leaf(block->frame));
-
- dberr_t err;
- lock_t* lock;
- bool inherit_in = *inherit;
- trx_t* trx = thr_get_trx(thr);
- const rec_t* next_rec = page_rec_get_next_const(rec);
- ulint heap_no = page_rec_get_heap_no(next_rec);
- ut_ad(!rec_is_metadata(next_rec, *index));
-
- lock_mutex_enter();
- /* Because this code is invoked for a running transaction by
- the thread that is serving the transaction, it is not necessary
- to hold trx->mutex here. */
-
- /* When inserting a record into an index, the table must be at
- least IX-locked. When we are building an index, we would pass
- BTR_NO_LOCKING_FLAG and skip the locking altogether. */
- ut_ad(lock_table_has(trx, index->table, LOCK_IX));
-
- lock = lock_rec_get_first(&lock_sys.rec_hash, block, heap_no);
-
- if (lock == NULL) {
- /* We optimize CPU time usage in the simplest case */
-
- lock_mutex_exit();
-
- if (inherit_in && !dict_index_is_clust(index)) {
- /* Update the page max trx id field */
- page_update_max_trx_id(block,
- buf_block_get_page_zip(block),
- trx->id, mtr);
- }
-
- *inherit = false;
-
- return(DB_SUCCESS);
- }
-
- /* Spatial index does not use GAP lock protection. It uses
- "predicate lock" to protect the "range" */
- if (dict_index_is_spatial(index)) {
- return(DB_SUCCESS);
- }
-
- *inherit = true;
-
- /* If another transaction has an explicit lock request which locks
- the gap, waiting or granted, on the successor, the insert has to wait.
-
- An exception is the case where the lock by the another transaction
- is a gap type lock which it placed to wait for its turn to insert. We
- do not consider that kind of a lock conflicting with our insert. This
- eliminates an unnecessary deadlock which resulted when 2 transactions
- had to wait for their insert. Both had waiting gap type lock requests
- on the successor, which produced an unnecessary deadlock. */
+ const rec_t *next_rec= page_rec_get_next_const(rec);
+ if (UNIV_UNLIKELY(!next_rec || rec_is_metadata(next_rec, *index)))
+ return DB_CORRUPTION;
- const unsigned type_mode = LOCK_X | LOCK_GAP | LOCK_INSERT_INTENTION;
-
- if (
-#ifdef WITH_WSREP
- lock_t* c_lock =
-#endif /* WITH_WSREP */
- lock_rec_other_has_conflicting(type_mode, block, heap_no, trx)) {
- /* Note that we may get DB_SUCCESS also here! */
- trx_mutex_enter(trx);
+ dberr_t err= DB_SUCCESS;
+ bool inherit_in= *inherit;
+ trx_t *trx= thr_get_trx(thr);
+ ulint heap_no= page_rec_get_heap_no(next_rec);
+ const page_id_t id{block->page.id()};
- err = lock_rec_enqueue_waiting(
-#ifdef WITH_WSREP
- c_lock,
-#endif /* WITH_WSREP */
- type_mode, block, heap_no, index, thr, NULL);
+ {
+ LockGuard g{lock_sys.rec_hash, id};
+ /* Because this code is invoked for a running transaction by
+ the thread that is serving the transaction, it is not necessary
+ to hold trx->mutex here. */
- trx_mutex_exit(trx);
- } else {
- err = DB_SUCCESS;
- }
+ /* When inserting a record into an index, the table must be at
+ least IX-locked. When we are building an index, we would pass
+ BTR_NO_LOCKING_FLAG and skip the locking altogether. */
+ ut_ad(lock_table_has(trx, index->table, LOCK_IX));
- lock_mutex_exit();
+ *inherit= lock_sys_t::get_first(g.cell(), id, heap_no);
- switch (err) {
- case DB_SUCCESS_LOCKED_REC:
- err = DB_SUCCESS;
- /* fall through */
- case DB_SUCCESS:
- if (!inherit_in || dict_index_is_clust(index)) {
- break;
- }
+ if (*inherit)
+ {
+ /* Spatial index does not use GAP lock protection. It uses
+ "predicate lock" to protect the "range" */
+ if (index->is_spatial())
+ return DB_SUCCESS;
+
+ /* If another transaction has an explicit lock request which locks
+ the gap, waiting or granted, on the successor, the insert has to wait.
+
+ An exception is the case where the lock by the another transaction
+ is a gap type lock which it placed to wait for its turn to insert. We
+ do not consider that kind of a lock conflicting with our insert. This
+ eliminates an unnecessary deadlock which resulted when 2 transactions
+ had to wait for their insert. Both had waiting gap type lock requests
+ on the successor, which produced an unnecessary deadlock. */
+ const unsigned type_mode= LOCK_X | LOCK_GAP | LOCK_INSERT_INTENTION;
+
+ if (lock_t *c_lock= lock_rec_other_has_conflicting(type_mode,
+ g.cell(), id,
+ heap_no, trx))
+ {
+ trx->mutex_lock();
+ err= lock_rec_enqueue_waiting(c_lock, type_mode, id, block->page.frame,
+ heap_no, index, thr, nullptr);
+ trx->mutex_unlock();
+ }
+ }
+ }
- /* Update the page max trx id field */
- page_update_max_trx_id(
- block, buf_block_get_page_zip(block), trx->id, mtr);
- default:
- /* We only care about the two return values. */
- break;
- }
+ switch (err) {
+ case DB_SUCCESS_LOCKED_REC:
+ err = DB_SUCCESS;
+ /* fall through */
+ case DB_SUCCESS:
+ if (!inherit_in || index->is_clust())
+ break;
+ /* Update the page max trx id field */
+ page_update_max_trx_id(block, buf_block_get_page_zip(block), trx->id, mtr);
+ default:
+ /* We only care about the two return values. */
+ break;
+ }
#ifdef UNIV_DEBUG
- {
- mem_heap_t* heap = NULL;
- rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
- const rec_offs* offsets;
- rec_offs_init(offsets_);
+ {
+ mem_heap_t *heap= nullptr;
+ rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
+ const rec_offs *offsets;
+ rec_offs_init(offsets_);
- offsets = rec_get_offsets(next_rec, index, offsets_,
- index->n_core_fields,
- ULINT_UNDEFINED, &heap);
+ offsets= rec_get_offsets(next_rec, index, offsets_, index->n_core_fields,
+ ULINT_UNDEFINED, &heap);
- ut_ad(lock_rec_queue_validate(
- FALSE, block, next_rec, index, offsets));
+ ut_ad(lock_rec_queue_validate(false, id, next_rec, index, offsets));
- if (heap != NULL) {
- mem_heap_free(heap);
- }
- }
+ if (UNIV_LIKELY_NULL(heap))
+ mem_heap_free(heap);
+ }
#endif /* UNIV_DEBUG */
- return(err);
+ return err;
}
/*********************************************************************//**
@@ -5337,36 +5160,41 @@ has an implicit lock on the record. The transaction instance must have a
reference count > 0 so that it can't be committed and freed before this
function has completed. */
static
-void
+bool
lock_rec_convert_impl_to_expl_for_trx(
/*==================================*/
- const buf_block_t* block, /*!< in: buffer block of rec */
- const rec_t* rec, /*!< in: user record on page */
- dict_index_t* index, /*!< in: index of record */
trx_t* trx, /*!< in/out: active transaction */
- ulint heap_no)/*!< in: rec heap number to lock */
+ const page_id_t id, /*!< in: page identifier */
+ const rec_t* rec, /*!< in: user record on page */
+ dict_index_t* index) /*!< in: index of record */
{
- ut_ad(trx->is_referenced());
- ut_ad(page_rec_is_leaf(rec));
- ut_ad(!rec_is_metadata(rec, *index));
+ if (!trx)
+ return false;
- DEBUG_SYNC_C("before_lock_rec_convert_impl_to_expl_for_trx");
- lock_mutex_enter();
- trx_mutex_enter(trx);
- ut_ad(!trx_state_eq(trx, TRX_STATE_NOT_STARTED));
+ ut_ad(trx->is_referenced());
+ ut_ad(page_rec_is_leaf(rec));
+ ut_ad(!rec_is_metadata(rec, *index));
- if (!trx_state_eq(trx, TRX_STATE_COMMITTED_IN_MEMORY)
- && !lock_rec_has_expl(LOCK_X | LOCK_REC_NOT_GAP,
- block, heap_no, trx)) {
- lock_rec_add_to_queue(LOCK_REC | LOCK_X | LOCK_REC_NOT_GAP,
- block, heap_no, index, trx, true);
- }
+ DEBUG_SYNC_C("before_lock_rec_convert_impl_to_expl_for_trx");
+ ulint heap_no= page_rec_get_heap_no(rec);
- lock_mutex_exit();
- trx_mutex_exit(trx);
- trx->release_reference();
+ {
+ LockGuard g{lock_sys.rec_hash, id};
+ trx->mutex_lock();
+ ut_ad(!trx_state_eq(trx, TRX_STATE_NOT_STARTED));
+
+ if (!trx_state_eq(trx, TRX_STATE_COMMITTED_IN_MEMORY) &&
+ !lock_rec_has_expl(LOCK_X | LOCK_REC_NOT_GAP, g.cell(), id, heap_no,
+ trx))
+ lock_rec_add_to_queue(LOCK_X | LOCK_REC_NOT_GAP, g.cell(), id,
+ page_align(rec), heap_no, index, trx, true);
+ }
- DEBUG_SYNC_C("after_lock_rec_convert_impl_to_expl_for_trx");
+ trx->mutex_unlock();
+ trx->release_reference();
+
+ DEBUG_SYNC_C("after_lock_rec_convert_impl_to_expl_for_trx");
+ return false;
}
@@ -5374,8 +5202,9 @@ lock_rec_convert_impl_to_expl_for_trx(
struct lock_rec_other_trx_holds_expl_arg
{
const ulint heap_no;
- const buf_block_t * const block;
- const trx_t *impl_trx;
+ const hash_cell_t &cell;
+ const page_id_t id;
+ const trx_t &impl_trx;
};
@@ -5383,22 +5212,23 @@ static my_bool lock_rec_other_trx_holds_expl_callback(
rw_trx_hash_element_t *element,
lock_rec_other_trx_holds_expl_arg *arg)
{
- mutex_enter(&element->mutex);
+ element->mutex.wr_lock();
if (element->trx)
{
- trx_mutex_enter(element->trx);
+ element->trx->mutex_lock();
ut_ad(element->trx->state != TRX_STATE_NOT_STARTED);
lock_t *expl_lock= element->trx->state == TRX_STATE_COMMITTED_IN_MEMORY
- ? NULL : lock_rec_has_expl(LOCK_S | LOCK_REC_NOT_GAP, arg->block,
- arg->heap_no, element->trx);
+ ? nullptr
+ : lock_rec_has_expl(LOCK_S | LOCK_REC_NOT_GAP,
+ arg->cell, arg->id, arg->heap_no, element->trx);
/*
An explicit lock is held by trx other than the trx holding the implicit
lock.
*/
- ut_ad(!expl_lock || expl_lock->trx == arg->impl_trx);
- trx_mutex_exit(element->trx);
+ ut_ad(!expl_lock || expl_lock->trx == &arg->impl_trx);
+ element->trx->mutex_unlock();
}
- mutex_exit(&element->mutex);
+ element->mutex.wr_unlock();
return 0;
}
@@ -5414,38 +5244,30 @@ static my_bool lock_rec_other_trx_holds_expl_callback(
@param caller_trx trx of current thread
@param[in] trx trx holding implicit lock on rec
@param[in] rec user record
- @param[in] block buffer block containing the record
+ @param[in] id page identifier
*/
-
static void lock_rec_other_trx_holds_expl(trx_t *caller_trx, trx_t *trx,
const rec_t *rec,
- const buf_block_t *block)
+ const page_id_t id)
{
if (trx)
{
ut_ad(!page_rec_is_metadata(rec));
- lock_mutex_enter();
+ LockGuard g{lock_sys.rec_hash, id};
ut_ad(trx->is_referenced());
- trx_mutex_enter(trx);
- const trx_state_t state = trx->state;
- trx_mutex_exit(trx);
+ const trx_state_t state{trx->state};
ut_ad(state != TRX_STATE_NOT_STARTED);
if (state == TRX_STATE_COMMITTED_IN_MEMORY)
- {
- /* The transaction was committed before our lock_mutex_enter(). */
- lock_mutex_exit();
+ /* The transaction was committed before we acquired LockGuard. */
return;
- }
- lock_rec_other_trx_holds_expl_arg arg= { page_rec_get_heap_no(rec), block,
- trx };
+ lock_rec_other_trx_holds_expl_arg arg=
+ { page_rec_get_heap_no(rec), g.cell(), id, *trx };
trx_sys.rw_trx_hash.iterate(caller_trx,
lock_rec_other_trx_holds_expl_callback, &arg);
- lock_mutex_exit();
}
}
#endif /* UNIV_DEBUG */
-
/** If an implicit x-lock exists on a record, convert it to an explicit one.
Often, this is called by a transaction that is about to enter a lock wait
@@ -5457,31 +5279,34 @@ This may also be called by the same transaction that is already holding
an implicit exclusive lock on the record. In this case, no explicit lock
should be created.
+@tparam is_primary whether the index is the primary key
@param[in,out] caller_trx current transaction
-@param[in] block index tree leaf page
+@param[in] id index tree leaf page identifier
@param[in] rec record on the leaf page
@param[in] index the index of the record
@param[in] offsets rec_get_offsets(rec,index)
@return whether caller_trx already holds an exclusive lock on rec */
+template<bool is_primary>
static
bool
lock_rec_convert_impl_to_expl(
trx_t* caller_trx,
- const buf_block_t* block,
+ page_id_t id,
const rec_t* rec,
dict_index_t* index,
const rec_offs* offsets)
{
trx_t* trx;
- ut_ad(!lock_mutex_own());
+ lock_sys.assert_unlocked();
ut_ad(page_rec_is_user_rec(rec));
ut_ad(rec_offs_validate(rec, index, offsets));
ut_ad(!page_rec_is_comp(rec) == !rec_offs_comp(offsets));
ut_ad(page_rec_is_leaf(rec));
ut_ad(!rec_is_metadata(rec, *index));
+ ut_ad(index->is_primary() == is_primary);
- if (dict_index_is_clust(index)) {
+ if (is_primary) {
trx_id_t trx_id;
trx_id = lock_clust_rec_some_has_impl(rec, index, offsets);
@@ -5504,24 +5329,10 @@ lock_rec_convert_impl_to_expl(
return true;
}
- ut_d(lock_rec_other_trx_holds_expl(caller_trx, trx, rec,
- block));
+ ut_d(lock_rec_other_trx_holds_expl(caller_trx, trx, rec, id));
}
- if (trx != 0) {
- ulint heap_no = page_rec_get_heap_no(rec);
-
- ut_ad(trx->is_referenced());
-
- /* If the transaction is still active and has no
- explicit x-lock set on the record, set one for it.
- trx cannot be committed until the ref count is zero. */
-
- lock_rec_convert_impl_to_expl_for_trx(
- block, rec, index, trx, heap_no);
- }
-
- return false;
+ return lock_rec_convert_impl_to_expl_for_trx(trx, id, rec, index);
}
/*********************************************************************//**
@@ -5535,8 +5346,6 @@ lock queue.
dberr_t
lock_clust_rec_modify_check_and_lock(
/*=================================*/
- ulint flags, /*!< in: if BTR_NO_LOCKING_FLAG
- bit is set, does nothing */
const buf_block_t* block, /*!< in: buffer block of rec */
const rec_t* rec, /*!< in: record which should be
modified */
@@ -5550,12 +5359,8 @@ lock_clust_rec_modify_check_and_lock(
ut_ad(rec_offs_validate(rec, index, offsets));
ut_ad(page_rec_is_leaf(rec));
ut_ad(dict_index_is_clust(index));
- ut_ad(block->frame == page_align(rec));
+ ut_ad(block->page.frame == page_align(rec));
- if (flags & BTR_NO_LOCKING_FLAG) {
-
- return(DB_SUCCESS);
- }
ut_ad(!rec_is_metadata(rec, *index));
ut_ad(!index->table->is_temporary());
@@ -5566,16 +5371,18 @@ lock_clust_rec_modify_check_and_lock(
/* If a transaction has no explicit x-lock set on the record, set one
for it */
- if (lock_rec_convert_impl_to_expl(thr_get_trx(thr), block, rec, index,
- offsets)) {
+ if (lock_rec_convert_impl_to_expl<true>(thr_get_trx(thr),
+ block->page.id(),
+ rec, index, offsets)) {
/* We already hold an implicit exclusive lock. */
return DB_SUCCESS;
}
- err = lock_rec_lock(TRUE, LOCK_X | LOCK_REC_NOT_GAP,
+ err = lock_rec_lock(true, LOCK_X | LOCK_REC_NOT_GAP,
block, heap_no, index, thr);
- ut_ad(lock_rec_queue_validate(FALSE, block, rec, index, offsets));
+ ut_ad(lock_rec_queue_validate(false, block->page.id(),
+ rec, index, offsets));
if (err == DB_SUCCESS_LOCKED_REC) {
err = DB_SUCCESS;
@@ -5609,7 +5416,7 @@ lock_sec_rec_modify_check_and_lock(
ut_ad(!dict_index_is_clust(index));
ut_ad(!dict_index_is_online_ddl(index) || (flags & BTR_CREATE_FLAG));
- ut_ad(block->frame == page_align(rec));
+ ut_ad(block->page.frame == page_align(rec));
ut_ad(mtr->is_named_space(index->table->space));
ut_ad(page_rec_is_leaf(rec));
ut_ad(!rec_is_metadata(rec, *index));
@@ -5632,7 +5439,7 @@ lock_sec_rec_modify_check_and_lock(
high priority threads. To avoid this GAP-locking we mark that
this transaction is using unique key scan here. */
if (trx->is_wsrep() && wsrep_thd_is_BF(trx->mysql_thd, false))
- trx->wsrep_UK_scan= true;
+ trx->wsrep = 3;
#endif /* WITH_WSREP */
/* Another transaction cannot have an implicit lock on the record,
@@ -5640,11 +5447,11 @@ lock_sec_rec_modify_check_and_lock(
index record, and this would not have been possible if another active
transaction had modified this secondary index record. */
- err = lock_rec_lock(TRUE, LOCK_X | LOCK_REC_NOT_GAP,
+ err = lock_rec_lock(true, LOCK_X | LOCK_REC_NOT_GAP,
block, heap_no, index, thr);
#ifdef WITH_WSREP
- trx->wsrep_UK_scan= false;
+ if (trx->wsrep == 3) trx->wsrep = 1;
#endif /* WITH_WSREP */
#ifdef UNIV_DEBUG
@@ -5659,7 +5466,7 @@ lock_sec_rec_modify_check_and_lock(
ULINT_UNDEFINED, &heap);
ut_ad(lock_rec_queue_validate(
- FALSE, block, rec, index, offsets));
+ false, block->page.id(), rec, index, offsets));
if (heap != NULL) {
mem_heap_free(heap);
@@ -5707,11 +5514,10 @@ lock_sec_rec_read_check_and_lock(
que_thr_t* thr) /*!< in: query thread */
{
dberr_t err;
- ulint heap_no;
ut_ad(!dict_index_is_clust(index));
ut_ad(!dict_index_is_online_ddl(index));
- ut_ad(block->frame == page_align(rec));
+ ut_ad(block->page.frame == page_align(rec));
ut_ad(page_rec_is_user_rec(rec) || page_rec_is_supremum(rec));
ut_ad(rec_offs_validate(rec, index, offsets));
ut_ad(page_rec_is_leaf(rec));
@@ -5725,18 +5531,16 @@ lock_sec_rec_read_check_and_lock(
}
ut_ad(!rec_is_metadata(rec, *index));
- heap_no = page_rec_get_heap_no(rec);
-
- /* Some transaction may have an implicit x-lock on the record only
- if the max trx id for the page >= min trx id for the trx list or a
- database recovery is running. */
trx_t *trx = thr_get_trx(thr);
- if (!lock_table_has(trx, index->table, LOCK_X)
- && !page_rec_is_supremum(rec)
- && page_get_max_trx_id(block->frame) >= trx_sys.get_min_trx_id()
- && lock_rec_convert_impl_to_expl(thr_get_trx(thr), block, rec,
- index, offsets)
+
+ if (lock_table_has(trx, index->table, mode)) {
+ return DB_SUCCESS;
+ }
+
+ if (!page_rec_is_supremum(rec)
+ && lock_rec_convert_impl_to_expl<false>(
+ trx, block->page.id(), rec, index, offsets)
&& gap_mode == LOCK_REC_NOT_GAP) {
/* We already hold an implicit exclusive lock. */
return DB_SUCCESS;
@@ -5751,17 +5555,18 @@ lock_sec_rec_read_check_and_lock(
high priority threads. To avoid this GAP-locking we mark that
this transaction is using unique key scan here. */
if (trx->is_wsrep() && wsrep_thd_is_BF(trx->mysql_thd, false))
- trx->wsrep_UK_scan= true;
+ trx->wsrep = 3;
#endif /* WITH_WSREP */
- err = lock_rec_lock(FALSE, gap_mode | mode,
- block, heap_no, index, thr);
+ err = lock_rec_lock(false, gap_mode | mode,
+ block, page_rec_get_heap_no(rec), index, thr);
#ifdef WITH_WSREP
- trx->wsrep_UK_scan= false;
+ if (trx->wsrep == 3) trx->wsrep = 1;
#endif /* WITH_WSREP */
- ut_ad(lock_rec_queue_validate(FALSE, block, rec, index, offsets));
+ ut_ad(lock_rec_queue_validate(false, block->page.id(),
+ rec, index, offsets));
return(err);
}
@@ -5795,11 +5600,8 @@ lock_clust_rec_read_check_and_lock(
LOCK_REC_NOT_GAP */
que_thr_t* thr) /*!< in: query thread */
{
- dberr_t err;
- ulint heap_no;
-
ut_ad(dict_index_is_clust(index));
- ut_ad(block->frame == page_align(rec));
+ ut_ad(block->page.frame == page_align(rec));
ut_ad(page_rec_is_user_rec(rec) || page_rec_is_supremum(rec));
ut_ad(gap_mode == LOCK_ORDINARY || gap_mode == LOCK_GAP
|| gap_mode == LOCK_REC_NOT_GAP);
@@ -5814,22 +5616,24 @@ lock_clust_rec_read_check_and_lock(
return(DB_SUCCESS);
}
- heap_no = page_rec_get_heap_no(rec);
+ const page_id_t id{block->page.id()};
+
+ ulint heap_no = page_rec_get_heap_no(rec);
trx_t *trx = thr_get_trx(thr);
if (!lock_table_has(trx, index->table, LOCK_X)
&& heap_no != PAGE_HEAP_NO_SUPREMUM
- && lock_rec_convert_impl_to_expl(trx, block, rec,
- index, offsets)
+ && lock_rec_convert_impl_to_expl<true>(trx, id,
+ rec, index, offsets)
&& gap_mode == LOCK_REC_NOT_GAP) {
/* We already hold an implicit exclusive lock. */
return DB_SUCCESS;
}
- err = lock_rec_lock(FALSE, gap_mode | mode,
- block, heap_no, index, thr);
+ dberr_t err = lock_rec_lock(false, gap_mode | mode,
+ block, heap_no, index, thr);
- ut_ad(lock_rec_queue_validate(FALSE, block, rec, index, offsets));
+ ut_ad(lock_rec_queue_validate(false, id, rec, index, offsets));
DEBUG_SYNC_C("after_lock_clust_rec_read_check_and_lock");
@@ -5888,37 +5692,6 @@ lock_clust_rec_read_check_and_lock_alt(
}
/*******************************************************************//**
-Release the last lock from the transaction's autoinc locks. */
-UNIV_INLINE
-void
-lock_release_autoinc_last_lock(
-/*===========================*/
- ib_vector_t* autoinc_locks) /*!< in/out: vector of AUTOINC locks */
-{
- ulint last;
- lock_t* lock;
-
- ut_ad(lock_mutex_own());
- ut_a(!ib_vector_is_empty(autoinc_locks));
-
- /* The lock to be release must be the last lock acquired. */
- last = ib_vector_size(autoinc_locks) - 1;
- lock = *static_cast<lock_t**>(ib_vector_get(autoinc_locks, last));
-
- /* Should have only AUTOINC locks in the vector. */
- ut_a(lock_get_mode(lock) == LOCK_AUTO_INC);
- ut_a(lock_get_type(lock) == LOCK_TABLE);
-
- ut_a(lock->un_member.tab_lock.table != NULL);
-
- /* This will remove the lock from the trx autoinc_locks too. */
- lock_table_dequeue(lock);
-
- /* Remove from the table vector too. */
- lock_trx_table_locks_remove(lock);
-}
-
-/*******************************************************************//**
Check if a transaction holds any autoinc locks.
@return TRUE if the transaction holds any AUTOINC locks. */
static
@@ -5932,179 +5705,233 @@ lock_trx_holds_autoinc_locks(
return(!ib_vector_is_empty(trx->autoinc_locks));
}
-/*******************************************************************//**
-Release all the transaction's autoinc locks. */
-static
-void
-lock_release_autoinc_locks(
-/*=======================*/
- trx_t* trx) /*!< in/out: transaction */
-{
- ut_ad(lock_mutex_own());
- /* If this is invoked for a running transaction by the thread
- that is serving the transaction, then it is not necessary to
- hold trx->mutex here. */
-
- ut_a(trx->autoinc_locks != NULL);
-
- /* We release the locks in the reverse order. This is to
- avoid searching the vector for the element to delete at
- the lower level. See (lock_table_remove_low()) for details. */
- while (!ib_vector_is_empty(trx->autoinc_locks)) {
-
- /* lock_table_remove_low() will also remove the lock from
- the transaction's autoinc_locks vector. */
- lock_release_autoinc_last_lock(trx->autoinc_locks);
- }
-
- /* Should release all locks. */
- ut_a(ib_vector_is_empty(trx->autoinc_locks));
-}
-
-/*******************************************************************//**
-Gets the type of a lock. Non-inline version for using outside of the
-lock module.
-@return LOCK_TABLE or LOCK_REC */
-ulint
-lock_get_type(
-/*==========*/
- const lock_t* lock) /*!< in: lock */
+/** Release all AUTO_INCREMENT locks of the transaction. */
+static void lock_release_autoinc_locks(trx_t *trx)
{
- return(lock_get_type_low(lock));
+ {
+ LockMutexGuard g{SRW_LOCK_CALL};
+ mysql_mutex_lock(&lock_sys.wait_mutex);
+ trx->mutex_lock();
+ auto autoinc_locks= trx->autoinc_locks;
+ ut_a(autoinc_locks);
+
+ /* We release the locks in the reverse order. This is to avoid
+ searching the vector for the element to delete at the lower level.
+ See (lock_table_remove_low()) for details. */
+ while (ulint size= ib_vector_size(autoinc_locks))
+ {
+ lock_t *lock= *static_cast<lock_t**>
+ (ib_vector_get(autoinc_locks, size - 1));
+ ut_ad(lock->type_mode == (LOCK_AUTO_INC | LOCK_TABLE));
+ lock_table_dequeue(lock, true);
+ lock_trx_table_locks_remove(lock);
+ }
+ }
+ mysql_mutex_unlock(&lock_sys.wait_mutex);
+ trx->mutex_unlock();
}
-/*******************************************************************//**
-Gets the id of the transaction owning a lock.
-@return transaction id */
-trx_id_t
-lock_get_trx_id(
-/*============*/
- const lock_t* lock) /*!< in: lock */
+/** Cancel a waiting lock request and release possibly waiting transactions */
+template <bool from_deadlock= false>
+void lock_cancel_waiting_and_release(lock_t *lock)
{
- return(trx_get_id_for_print(lock->trx));
-}
-
-/*******************************************************************//**
-Gets the table on which the lock is.
-@return table */
-UNIV_INLINE
-dict_table_t*
-lock_get_table(
-/*===========*/
- const lock_t* lock) /*!< in: lock */
-{
- switch (lock_get_type_low(lock)) {
- case LOCK_REC:
- ut_ad(dict_index_is_clust(lock->index)
- || !dict_index_is_online_ddl(lock->index));
- return(lock->index->table);
- case LOCK_TABLE:
- return(lock->un_member.tab_lock.table);
- default:
- ut_error;
- return(NULL);
- }
-}
+ lock_sys.assert_locked(*lock);
+ mysql_mutex_assert_owner(&lock_sys.wait_mutex);
+ trx_t *trx= lock->trx;
+ trx->mutex_lock();
+ ut_d(const auto trx_state= trx->state);
+ ut_ad(trx_state == TRX_STATE_COMMITTED_IN_MEMORY ||
+ trx_state == TRX_STATE_ACTIVE);
-/*******************************************************************//**
-Gets the id of the table on which the lock is.
-@return id of the table */
-table_id_t
-lock_get_table_id(
-/*==============*/
- const lock_t* lock) /*!< in: lock */
-{
- dict_table_t* table = lock_get_table(lock);
- ut_ad(!table->is_temporary());
- return(table->id);
-}
+ if (!lock->is_table())
+ lock_rec_dequeue_from_page(lock, true);
+ else
+ {
+ if (lock->type_mode == (LOCK_AUTO_INC | LOCK_TABLE))
+ {
+ ut_ad(trx->autoinc_locks);
+ ib_vector_remove(trx->autoinc_locks, lock);
+ }
+ lock_table_dequeue(lock, true);
+ /* Remove the lock from table lock vector too. */
+ lock_trx_table_locks_remove(lock);
+ }
-/** Determine which table a lock is associated with.
-@param[in] lock the lock
-@return name of the table */
-const table_name_t&
-lock_get_table_name(
- const lock_t* lock)
-{
- return(lock_get_table(lock)->name);
-}
+ /* Reset the wait flag and the back pointer to lock in trx. */
+ lock_reset_lock_and_trx_wait(lock);
-/*******************************************************************//**
-For a record lock, gets the index on which the lock is.
-@return index */
-const dict_index_t*
-lock_rec_get_index(
-/*===============*/
- const lock_t* lock) /*!< in: lock */
-{
- ut_a(lock_get_type_low(lock) == LOCK_REC);
- ut_ad(dict_index_is_clust(lock->index)
- || !dict_index_is_online_ddl(lock->index));
+ lock_wait_end<from_deadlock>(trx);
- return(lock->index);
+ trx->mutex_unlock();
}
-/*******************************************************************//**
-For a record lock, gets the name of the index on which the lock is.
-The string should not be free()'d or modified.
-@return name of the index */
-const char*
-lock_rec_get_index_name(
-/*====================*/
- const lock_t* lock) /*!< in: lock */
+void lock_sys_t::cancel_lock_wait_for_trx(trx_t *trx)
{
- ut_a(lock_get_type_low(lock) == LOCK_REC);
- ut_ad(dict_index_is_clust(lock->index)
- || !dict_index_is_online_ddl(lock->index));
+ lock_sys.wr_lock(SRW_LOCK_CALL);
+ mysql_mutex_lock(&lock_sys.wait_mutex);
+ if (lock_t *lock= trx->lock.wait_lock)
+ {
+ /* check if victim is still waiting */
+ if (lock->is_waiting())
+ lock_cancel_waiting_and_release(lock);
+ }
+ lock_sys.wr_unlock();
+ mysql_mutex_unlock(&lock_sys.wait_mutex);
+}
+
+/** Cancel a waiting lock request.
+@tparam check_victim whether to check for DB_DEADLOCK
+@param trx active transaction
+@param lock waiting lock request
+@retval DB_SUCCESS if no lock existed
+@retval DB_DEADLOCK if trx->lock.was_chosen_as_deadlock_victim was set
+@retval DB_LOCK_WAIT if the lock was canceled */
+template<bool check_victim>
+dberr_t lock_sys_t::cancel(trx_t *trx, lock_t *lock)
+{
+ DEBUG_SYNC_C("lock_sys_t_cancel_enter");
+ mysql_mutex_assert_owner(&lock_sys.wait_mutex);
+ ut_ad(trx->state == TRX_STATE_ACTIVE);
+ /* trx->lock.wait_lock may be changed by other threads as long as
+ we are not holding lock_sys.latch.
+
+ So, trx->lock.wait_lock==lock does not necessarily hold, but both
+ pointers should be valid, because other threads cannot assign
+ trx->lock.wait_lock=nullptr (or invalidate *lock) while we are
+ holding lock_sys.wait_mutex. Also, the type of trx->lock.wait_lock
+ (record or table lock) cannot be changed by other threads. So, it is
+ safe to call lock->is_table() while not holding lock_sys.latch. If
+ we have to release and reacquire lock_sys.wait_mutex, we must reread
+ trx->lock.wait_lock. We must also reread trx->lock.wait_lock after
+ acquiring lock_sys.latch, as it can be changed to non-null by lock
+ moving functions even if we hold lock_sys.wait_mutex. */
+ dberr_t err= DB_SUCCESS;
+ /* This would be too large for a memory transaction, except in the
+ DB_DEADLOCK case, which was already tested in lock_trx_handle_wait(). */
+ if (lock->is_table())
+ {
+ if (!lock_sys.rd_lock_try())
+ {
+ mysql_mutex_unlock(&lock_sys.wait_mutex);
+ lock_sys.rd_lock(SRW_LOCK_CALL);
+ mysql_mutex_lock(&lock_sys.wait_mutex);
+ lock= trx->lock.wait_lock;
+ /* Even if the waiting lock was cancelled while lock_sys.wait_mutex
+ was unlocked, we must return a deadlock error if the transaction was
+ chosen as a deadlock victim, so that it will be rolled back */
+ if (check_victim && trx->lock.was_chosen_as_deadlock_victim)
+ err= DB_DEADLOCK;
+ else if (lock)
+ goto resolve_table_lock;
+ }
+ else
+ {
+ /* This function is invoked from the thread which executes the
+ transaction. Table locks are requested before record locks. Some other
+ transaction can't change trx->lock.wait_lock from table to record for the
+ current transaction at this point, because the current transaction has not
+ requested record locks yet. There is no need to move any table locks by
+ other threads. And trx->lock.wait_lock can't be set to null while we are
+ holding lock_sys.wait_mutex. That's why there is no need to reload
+ trx->lock.wait_lock here. */
+ ut_ad(lock == trx->lock.wait_lock);
+resolve_table_lock:
+ dict_table_t *table= lock->un_member.tab_lock.table;
+ if (!table->lock_mutex_trylock())
+ {
+ /* The correct latching order is:
+ lock_sys.latch, table->lock_mutex_lock(), lock_sys.wait_mutex.
+ Thus, we must release lock_sys.wait_mutex for a blocking wait. */
+ mysql_mutex_unlock(&lock_sys.wait_mutex);
+ table->lock_mutex_lock();
+ mysql_mutex_lock(&lock_sys.wait_mutex);
+ /* Cache trx->lock.wait_lock under the corresponding latches. */
+ lock= trx->lock.wait_lock;
+ if (!lock)
+ goto retreat;
+ else if (check_victim && trx->lock.was_chosen_as_deadlock_victim)
+ {
+ err= DB_DEADLOCK;
+ goto retreat;
+ }
+ }
+ else
+ /* Cache trx->lock.wait_lock under the corresponding latches if
+ it was not cached yet */
+ lock= trx->lock.wait_lock;
+ if (lock->is_waiting())
+ lock_cancel_waiting_and_release(lock);
+ /* Even if lock->is_waiting() did not hold above, we must return
+ DB_LOCK_WAIT, or otherwise optimistic parallel replication could
+ occasionally hang. Potentially affected tests:
+ rpl.rpl_parallel_optimistic
+ rpl.rpl_parallel_optimistic_nobinlog
+ rpl.rpl_parallel_optimistic_xa_lsu_off */
+ err= DB_LOCK_WAIT;
+retreat:
+ table->lock_mutex_unlock();
+ }
+ lock_sys.rd_unlock();
+ }
+ else
+ {
+ /* To prevent the record lock from being moved between pages
+ during a page split or merge, we must hold exclusive lock_sys.latch. */
+ if (!lock_sys.wr_lock_try())
+ {
+ mysql_mutex_unlock(&lock_sys.wait_mutex);
+ lock_sys.wr_lock(SRW_LOCK_CALL);
+ mysql_mutex_lock(&lock_sys.wait_mutex);
+ /* Cache trx->lock.wait_lock under the corresponding latches. */
+ lock= trx->lock.wait_lock;
+ /* Even if the waiting lock was cancelled while lock_sys.wait_mutex
+ was unlocked, we must return a deadlock error if the transaction was
+ chosen as a deadlock victim, so that it will be rolled back */
+ if (check_victim && trx->lock.was_chosen_as_deadlock_victim)
+ err= DB_DEADLOCK;
+ else if (lock)
+ goto resolve_record_lock;
+ }
+ else
+ {
+ /* Cache trx->lock.wait_lock under the corresponding latches if
+ it was not cached yet */
+ lock= trx->lock.wait_lock;
+resolve_record_lock:
+ if (lock->is_waiting())
+ lock_cancel_waiting_and_release(lock);
+ /* Even if lock->is_waiting() did not hold above, we must return
+ DB_LOCK_WAIT, or otherwise optimistic parallel replication could
+ occasionally hang. Potentially affected tests:
+ rpl.rpl_parallel_optimistic
+ rpl.rpl_parallel_optimistic_nobinlog
+ rpl.rpl_parallel_optimistic_xa_lsu_off */
+ err= DB_LOCK_WAIT;
+ }
+ lock_sys.wr_unlock();
+ }
- return(lock->index->name);
+ return err;
}
-/*********************************************************************//**
-Cancels a waiting lock request and releases possible other transactions
-waiting behind it. */
-void
-lock_cancel_waiting_and_release(
-/*============================*/
- lock_t* lock) /*!< in/out: waiting lock request */
+/** Cancel a waiting lock request (if any) when killing a transaction */
+void lock_sys_t::cancel(trx_t *trx)
{
- que_thr_t* thr;
-
- ut_ad(lock_mutex_own());
- ut_ad(trx_mutex_own(lock->trx));
- ut_ad(lock->trx->state == TRX_STATE_ACTIVE);
-
- lock->trx->lock.cancel = true;
-
- if (lock_get_type_low(lock) == LOCK_REC) {
-
- lock_rec_dequeue_from_page(lock);
- } else {
- ut_ad(lock_get_type_low(lock) & LOCK_TABLE);
-
- if (lock->trx->autoinc_locks != NULL) {
- /* Release the transaction's AUTOINC locks. */
- lock_release_autoinc_locks(lock->trx);
- }
-
- lock_table_dequeue(lock);
- /* Remove the lock from table lock vector too. */
- lock_trx_table_locks_remove(lock);
- }
-
- /* Reset the wait flag and the back pointer to lock in trx. */
-
- lock_reset_lock_and_trx_wait(lock);
-
- /* The following function releases the trx from lock wait. */
-
- thr = que_thr_end_lock_wait(lock->trx);
-
- if (thr != NULL) {
- lock_wait_release_thread_if_suspended(thr);
- }
-
- lock->trx->lock.cancel = false;
+ mysql_mutex_lock(&lock_sys.wait_mutex);
+ /* Cache trx->lock.wait_lock to avoid unnecessary atomic variable load */
+ if (lock_t *lock= trx->lock.wait_lock)
+ {
+ /* Dictionary transactions must be immune to KILL, because they
+ may be executed as part of a multi-transaction DDL operation, such
+ as rollback_inplace_alter_table() or ha_innobase::delete_table(). */
+ if (!trx->dict_operation)
+ {
+ trx->error_state= DB_INTERRUPTED;
+ cancel<false>(trx, lock);
+ }
+ }
+ lock_sys.deadlock_check();
+ mysql_mutex_unlock(&lock_sys.wait_mutex);
}
/*********************************************************************//**
@@ -6116,8 +5943,8 @@ lock_unlock_table_autoinc(
/*======================*/
trx_t* trx) /*!< in/out: transaction */
{
- ut_ad(!lock_mutex_own());
- ut_ad(!trx_mutex_own(trx));
+ lock_sys.assert_unlocked();
+ ut_ad(!trx->mutex_is_owner());
ut_ad(!trx->lock.wait_lock);
/* This can be invoked on NOT_STARTED, ACTIVE, PREPARED,
@@ -6131,72 +5958,40 @@ lock_unlock_table_autoinc(
necessary to hold trx->mutex here. */
if (lock_trx_holds_autoinc_locks(trx)) {
- lock_mutex_enter();
-
lock_release_autoinc_locks(trx);
-
- lock_mutex_exit();
}
}
-static inline dberr_t lock_trx_handle_wait_low(trx_t* trx)
-{
- ut_ad(lock_mutex_own());
- ut_ad(trx_mutex_own(trx));
+/** Handle a pending lock wait (DB_LOCK_WAIT) in a semi-consistent read
+while holding a clustered index leaf page latch.
- if (trx->lock.was_chosen_as_deadlock_victim) {
- return DB_DEADLOCK;
- }
- if (!trx->lock.wait_lock) {
- /* The lock was probably granted before we got here. */
- return DB_SUCCESS;
- }
-
- lock_cancel_waiting_and_release(trx->lock.wait_lock);
- return DB_LOCK_WAIT;
-}
-
-/*********************************************************************//**
-Check whether the transaction has already been rolled back because it
-was selected as a deadlock victim, or if it has to wait then cancel
-the wait lock.
-@return DB_DEADLOCK, DB_LOCK_WAIT or DB_SUCCESS */
-dberr_t
-lock_trx_handle_wait(
-/*=================*/
- trx_t* trx) /*!< in/out: trx lock state */
-{
-#ifdef WITH_WSREP
- /* We already own mutexes */
- if (trx->lock.was_chosen_as_wsrep_victim) {
- return lock_trx_handle_wait_low(trx);
- }
-#endif /* WITH_WSREP */
- lock_mutex_enter();
- trx_mutex_enter(trx);
- dberr_t err = lock_trx_handle_wait_low(trx);
- lock_mutex_exit();
- trx_mutex_exit(trx);
- return err;
-}
-
-/*********************************************************************//**
-Get the number of locks on a table.
-@return number of locks */
-ulint
-lock_table_get_n_locks(
-/*===================*/
- const dict_table_t* table) /*!< in: table */
+@param trx transaction that is or was waiting for a lock
+@retval DB_SUCCESS if the lock was granted
+@retval DB_DEADLOCK if the transaction must be aborted due to a deadlock
+@retval DB_LOCK_WAIT if a lock wait would be necessary; the pending
+ lock request was released */
+dberr_t lock_trx_handle_wait(trx_t *trx)
{
- ulint n_table_locks;
-
- lock_mutex_enter();
-
- n_table_locks = UT_LIST_GET_LEN(table->locks);
-
- lock_mutex_exit();
-
- return(n_table_locks);
+ DEBUG_SYNC_C("lock_trx_handle_wait_enter");
+ if (trx->lock.was_chosen_as_deadlock_victim)
+ return DB_DEADLOCK;
+ DEBUG_SYNC_C("lock_trx_handle_wait_before_unlocked_wait_lock_check");
+ /* trx->lock.was_chosen_as_deadlock_victim must always be set before
+ trx->lock.wait_lock if the transaction was chosen as a deadlock
+ victim; the function must not return DB_SUCCESS if
+ trx->lock.was_chosen_as_deadlock_victim is set. */
+ if (!trx->lock.wait_lock)
+ return trx->lock.was_chosen_as_deadlock_victim ? DB_DEADLOCK : DB_SUCCESS;
+ dberr_t err= DB_SUCCESS;
+ mysql_mutex_lock(&lock_sys.wait_mutex);
+ if (trx->lock.was_chosen_as_deadlock_victim)
+ err= DB_DEADLOCK;
+ /* Cache trx->lock.wait_lock to avoid unnecessary atomic variable load */
+ else if (lock_t *wait_lock= trx->lock.wait_lock)
+ err= lock_sys_t::cancel<true>(trx, wait_lock);
+ lock_sys.deadlock_check();
+ mysql_mutex_unlock(&lock_sys.wait_mutex);
+ return err;
}
#ifdef UNIV_DEBUG
@@ -6210,11 +6005,11 @@ lock_table_get_n_locks(
static my_bool lock_table_locks_lookup(rw_trx_hash_element_t *element,
const dict_table_t *table)
{
- ut_ad(lock_mutex_own());
- mutex_enter(&element->mutex);
+ lock_sys.assert_locked();
+ element->mutex.wr_lock();
if (element->trx)
{
- trx_mutex_enter(element->trx);
+ element->trx->mutex_lock();
check_trx_state(element->trx);
if (element->trx->state != TRX_STATE_COMMITTED_IN_MEMORY)
{
@@ -6223,7 +6018,7 @@ static my_bool lock_table_locks_lookup(rw_trx_hash_element_t *element,
lock= UT_LIST_GET_NEXT(trx_locks, lock))
{
ut_ad(lock->trx == element->trx);
- if (lock_get_type_low(lock) == LOCK_REC)
+ if (!lock->is_table())
{
ut_ad(lock->index->online_status != ONLINE_INDEX_CREATION ||
lock->index->is_primary());
@@ -6233,39 +6028,46 @@ static my_bool lock_table_locks_lookup(rw_trx_hash_element_t *element,
ut_ad(lock->un_member.tab_lock.table != table);
}
}
- trx_mutex_exit(element->trx);
+ element->trx->mutex_unlock();
}
- mutex_exit(&element->mutex);
+ element->mutex.wr_unlock();
return 0;
}
#endif /* UNIV_DEBUG */
-/*******************************************************************//**
-Check if there are any locks (table or rec) against table.
+/** Check if there are any locks on a table.
@return true if table has either table or record locks. */
-bool
-lock_table_has_locks(
-/*=================*/
- const dict_table_t* table) /*!< in: check if there are any locks
- held on records in this table or on the
- table itself */
-{
- ibool has_locks;
-
- ut_ad(table != NULL);
- lock_mutex_enter();
-
- has_locks = UT_LIST_GET_LEN(table->locks) > 0 || table->n_rec_locks > 0;
-
+TRANSACTIONAL_TARGET
+bool lock_table_has_locks(dict_table_t *table)
+{
+ if (table->n_rec_locks)
+ return true;
+ ulint len;
+#if !defined NO_ELISION && !defined SUX_LOCK_GENERIC
+ if (xbegin())
+ {
+ if (table->lock_mutex_is_locked())
+ xabort();
+ len= UT_LIST_GET_LEN(table->locks);
+ xend();
+ }
+ else
+#endif
+ {
+ table->lock_mutex_lock();
+ len= UT_LIST_GET_LEN(table->locks);
+ table->lock_mutex_unlock();
+ }
+ if (len)
+ return true;
#ifdef UNIV_DEBUG
- if (!has_locks) {
- trx_sys.rw_trx_hash.iterate(lock_table_locks_lookup, table);
- }
+ {
+ LockMutexGuard g{SRW_LOCK_CALL};
+ trx_sys.rw_trx_hash.iterate(lock_table_locks_lookup,
+ const_cast<const dict_table_t*>(table));
+ }
#endif /* UNIV_DEBUG */
-
- lock_mutex_exit();
-
- return(has_locks);
+ return false;
}
/*******************************************************************//**
@@ -6278,17 +6080,6 @@ lock_table_lock_list_init(
UT_LIST_INIT(*lock_list, &lock_table_t::locks);
}
-/*******************************************************************//**
-Initialise the trx lock list. */
-void
-lock_trx_lock_list_init(
-/*====================*/
- trx_lock_list_t* lock_list) /*!< List to initialise */
-{
- UT_LIST_INIT(*lock_list, &lock_t::trx_locks);
-}
-
-
#ifdef UNIV_DEBUG
/*******************************************************************//**
Check if the transaction holds any locks on the sys tables
@@ -6302,7 +6093,7 @@ lock_trx_has_sys_table_locks(
const lock_t* strongest_lock = 0;
lock_mode strongest = LOCK_NONE;
- lock_mutex_enter();
+ LockMutexGuard g{SRW_LOCK_CALL};
const lock_list::const_iterator end = trx->lock.table_locks.end();
lock_list::const_iterator it = trx->lock.table_locks.begin();
@@ -6315,7 +6106,7 @@ lock_trx_has_sys_table_locks(
if (lock != NULL
&& dict_is_sys_table(lock->un_member.tab_lock.table->id)) {
- strongest = lock_get_mode(lock);
+ strongest = lock->mode();
ut_ad(strongest != LOCK_NONE);
strongest_lock = lock;
break;
@@ -6323,7 +6114,6 @@ lock_trx_has_sys_table_locks(
}
if (strongest == LOCK_NONE) {
- lock_mutex_exit();
return(NULL);
}
@@ -6335,10 +6125,10 @@ lock_trx_has_sys_table_locks(
}
ut_ad(trx == lock->trx);
- ut_ad(lock_get_type_low(lock) & LOCK_TABLE);
- ut_ad(lock->un_member.tab_lock.table != NULL);
+ ut_ad(lock->is_table());
+ ut_ad(lock->un_member.tab_lock.table);
- lock_mode mode = lock_get_mode(lock);
+ lock_mode mode = lock->mode();
if (dict_is_sys_table(lock->un_member.tab_lock.table->id)
&& lock_mode_stronger_or_eq(mode, strongest)) {
@@ -6348,567 +6138,392 @@ lock_trx_has_sys_table_locks(
}
}
- lock_mutex_exit();
-
return(strongest_lock);
}
/** Check if the transaction holds an explicit exclusive lock on a record.
@param[in] trx transaction
@param[in] table table
-@param[in] block leaf page
+@param[in] id leaf page identifier
@param[in] heap_no heap number identifying the record
@return whether an explicit X-lock is held */
-bool
-lock_trx_has_expl_x_lock(
- const trx_t* trx, /*!< in: transaction to check */
- const dict_table_t* table, /*!< in: table to check */
- const buf_block_t* block, /*!< in: buffer block of the record */
- ulint heap_no)/*!< in: record heap number */
-{
- ut_ad(heap_no > PAGE_HEAP_NO_SUPREMUM);
-
- lock_mutex_enter();
- ut_ad(lock_table_has(trx, table, LOCK_IX));
- ut_ad(lock_table_has(trx, table, LOCK_X)
- || lock_rec_has_expl(LOCK_X | LOCK_REC_NOT_GAP, block, heap_no,
- trx));
- lock_mutex_exit();
- return(true);
-}
-#endif /* UNIV_DEBUG */
-
-/** rewind(3) the file used for storing the latest detected deadlock and
-print a heading message to stderr if printing of all deadlocks to stderr
-is enabled. */
-void
-DeadlockChecker::start_print()
-{
- ut_ad(lock_mutex_own());
-
- rewind(lock_latest_err_file);
- ut_print_timestamp(lock_latest_err_file);
-
- if (srv_print_all_deadlocks) {
- ib::info() << "Transactions deadlock detected, dumping"
- " detailed information.";
- }
-}
-
-/** Print a message to the deadlock file and possibly to stderr.
-@param msg message to print */
-void
-DeadlockChecker::print(const char* msg)
-{
- fputs(msg, lock_latest_err_file);
-
- if (srv_print_all_deadlocks) {
- ib::info() << msg;
- }
-}
-
-/** Print transaction data to the deadlock file and possibly to stderr.
-@param trx transaction
-@param max_query_len max query length to print */
-void
-DeadlockChecker::print(const trx_t* trx, ulint max_query_len)
-{
- ut_ad(lock_mutex_own());
-
- ulint n_rec_locks = lock_number_of_rows_locked(&trx->lock);
- ulint n_trx_locks = UT_LIST_GET_LEN(trx->lock.trx_locks);
- ulint heap_size = mem_heap_get_size(trx->lock.lock_heap);
-
- trx_print_low(lock_latest_err_file, trx, max_query_len,
- n_rec_locks, n_trx_locks, heap_size);
-
- if (srv_print_all_deadlocks) {
- trx_print_low(stderr, trx, max_query_len,
- n_rec_locks, n_trx_locks, heap_size);
- }
-}
-
-/** Print lock data to the deadlock file and possibly to stderr.
-@param lock record or table type lock */
-void
-DeadlockChecker::print(const lock_t* lock)
-{
- ut_ad(lock_mutex_own());
-
- if (lock_get_type_low(lock) == LOCK_REC) {
- mtr_t mtr;
- lock_rec_print(lock_latest_err_file, lock, mtr);
-
- if (srv_print_all_deadlocks) {
- lock_rec_print(stderr, lock, mtr);
- }
- } else {
- lock_table_print(lock_latest_err_file, lock);
-
- if (srv_print_all_deadlocks) {
- lock_table_print(stderr, lock);
- }
- }
-}
-
-/** Get the next lock in the queue that is owned by a transaction whose
-sub-tree has not already been searched.
-Note: "next" here means PREV for table locks.
-
-@param lock Lock in queue
-@param heap_no heap_no if lock is a record lock else ULINT_UNDEFINED
-
-@return next lock or NULL if at end of queue */
-const lock_t*
-DeadlockChecker::get_next_lock(const lock_t* lock, ulint heap_no) const
+bool lock_trx_has_expl_x_lock(const trx_t &trx, const dict_table_t &table,
+ page_id_t id, ulint heap_no)
{
- ut_ad(lock_mutex_own());
-
- do {
- if (lock_get_type_low(lock) == LOCK_REC) {
- ut_ad(heap_no != ULINT_UNDEFINED);
- lock = lock_rec_get_next_const(heap_no, lock);
- } else {
- ut_ad(heap_no == ULINT_UNDEFINED);
- ut_ad(lock_get_type_low(lock) == LOCK_TABLE);
-
- lock = UT_LIST_GET_NEXT(
- un_member.tab_lock.locks, lock);
- }
-
- } while (lock != NULL && is_visited(lock));
-
- ut_ad(lock == NULL
- || lock_get_type_low(lock) == lock_get_type_low(m_wait_lock));
-
- return(lock);
-}
-
-/** Get the first lock to search. The search starts from the current
-wait_lock. What we are really interested in is an edge from the
-current wait_lock's owning transaction to another transaction that has
-a lock ahead in the queue. We skip locks where the owning transaction's
-sub-tree has already been searched.
-
-Note: The record locks are traversed from the oldest lock to the
-latest. For table locks we go from latest to oldest.
-
-For record locks, we first position the "iterator" on the first lock on
-the page and then reposition on the actual heap_no. This is required
-due to the way the record lock has is implemented.
-
-@param[out] heap_no if rec lock, else ULINT_UNDEFINED.
-@return first lock or NULL */
-const lock_t*
-DeadlockChecker::get_first_lock(ulint* heap_no) const
-{
- ut_ad(lock_mutex_own());
-
- const lock_t* lock = m_wait_lock;
-
- if (lock_get_type_low(lock) == LOCK_REC) {
- /* We are only interested in records that match the heap_no. */
- *heap_no = lock_rec_find_set_bit(lock);
-
- ut_ad(*heap_no <= 0xffff);
- ut_ad(*heap_no != ULINT_UNDEFINED);
-
- /* Find the locks on the page. */
- lock = lock_sys.get_first(
- lock->type_mode & LOCK_PREDICATE
- ? lock_sys.prdt_hash
- : lock_sys.rec_hash,
- lock->un_member.rec_lock.page_id);
-
- /* Position on the first lock on the physical record.*/
- if (!lock_rec_get_nth_bit(lock, *heap_no)) {
- lock = lock_rec_get_next_const(*heap_no, lock);
- }
-
- ut_a(!lock_get_wait(lock));
- } else {
- /* Table locks don't care about the heap_no. */
- *heap_no = ULINT_UNDEFINED;
- ut_ad(lock_get_type_low(lock) == LOCK_TABLE);
- dict_table_t* table = lock->un_member.tab_lock.table;
- lock = UT_LIST_GET_FIRST(table->locks);
- }
-
- /* Must find at least two locks, otherwise there cannot be a
- waiting lock, secondly the first lock cannot be the wait_lock. */
- ut_a(lock != NULL);
- ut_a(lock != m_wait_lock ||
- (innodb_lock_schedule_algorithm
- == INNODB_LOCK_SCHEDULE_ALGORITHM_VATS
- && !thd_is_replication_slave_thread(lock->trx->mysql_thd)));
-
- /* Check that the lock type doesn't change. */
- ut_ad(lock_get_type_low(lock) == lock_get_type_low(m_wait_lock));
-
- return(lock);
+ ut_ad(heap_no > PAGE_HEAP_NO_SUPREMUM);
+ ut_ad(lock_table_has(&trx, &table, LOCK_IX));
+ if (!lock_table_has(&trx, &table, LOCK_X))
+ {
+ LockGuard g{lock_sys.rec_hash, id};
+ ut_ad(lock_rec_has_expl(LOCK_X | LOCK_REC_NOT_GAP,
+ g.cell(), id, heap_no, &trx));
+ }
+ return true;
}
+#endif /* UNIV_DEBUG */
-/** Notify that a deadlock has been detected and print the conflicting
-transaction info.
-@param lock lock causing deadlock */
-void
-DeadlockChecker::notify(const lock_t* lock) const
+namespace Deadlock
{
- ut_ad(lock_mutex_own());
-
- start_print();
-
- print("\n*** (1) TRANSACTION:\n");
-
- print(m_wait_lock->trx, 3000);
-
- print("*** (1) WAITING FOR THIS LOCK TO BE GRANTED:\n");
+ /** rewind(3) the file used for storing the latest detected deadlock and
+ print a heading message to stderr if printing of all deadlocks to stderr
+ is enabled. */
+ static void start_print()
+ {
+ lock_sys.assert_locked();
- print(m_wait_lock);
+ rewind(lock_latest_err_file);
+ ut_print_timestamp(lock_latest_err_file);
- print("*** (2) TRANSACTION:\n");
+ if (srv_print_all_deadlocks)
+ ib::info() << "Transactions deadlock detected,"
+ " dumping detailed information.";
+ }
- print(lock->trx, 3000);
+ /** Print a message to the deadlock file and possibly to stderr.
+ @param msg message to print */
+ static void print(const char *msg)
+ {
+ fputs(msg, lock_latest_err_file);
+ if (srv_print_all_deadlocks)
+ ib::info() << msg;
+ }
- print("*** (2) HOLDS THE LOCK(S):\n");
+ /** Print transaction data to the deadlock file and possibly to stderr.
+ @param trx transaction */
+ static void print(const trx_t &trx)
+ {
+ lock_sys.assert_locked();
- print(lock);
+ ulint n_rec_locks= trx.lock.n_rec_locks;
+ ulint n_trx_locks= UT_LIST_GET_LEN(trx.lock.trx_locks);
+ ulint heap_size= mem_heap_get_size(trx.lock.lock_heap);
- /* It is possible that the joining transaction was granted its
- lock when we rolled back some other waiting transaction. */
+ trx_print_low(lock_latest_err_file, &trx, 3000,
+ n_rec_locks, n_trx_locks, heap_size);
- if (m_start->lock.wait_lock != 0) {
- print("*** (2) WAITING FOR THIS LOCK TO BE GRANTED:\n");
+ if (srv_print_all_deadlocks)
+ trx_print_low(stderr, &trx, 3000, n_rec_locks, n_trx_locks, heap_size);
+ }
- print(m_start->lock.wait_lock);
- }
+ /** Print lock data to the deadlock file and possibly to stderr.
+ @param lock record or table type lock */
+ static void print(const lock_t &lock)
+ {
+ lock_sys.assert_locked();
- DBUG_PRINT("ib_lock", ("deadlock detected"));
-}
+ if (!lock.is_table())
+ {
+ mtr_t mtr;
+ lock_rec_print(lock_latest_err_file, &lock, mtr);
-/** Select the victim transaction that should be rolledback.
-@return victim transaction */
-const trx_t*
-DeadlockChecker::select_victim() const
-{
- ut_ad(lock_mutex_own());
- ut_ad(m_start->lock.wait_lock != 0);
- ut_ad(m_wait_lock->trx != m_start);
+ if (srv_print_all_deadlocks)
+ lock_rec_print(stderr, &lock, mtr);
+ }
+ else
+ {
+ lock_table_print(lock_latest_err_file, &lock);
- if (trx_weight_ge(m_wait_lock->trx, m_start)) {
- /* The joining transaction is 'smaller',
- choose it as the victim and roll it back. */
-#ifdef WITH_WSREP
- if (wsrep_thd_is_BF(m_start->mysql_thd, FALSE)) {
- return(m_wait_lock->trx);
- }
-#endif /* WITH_WSREP */
- return(m_start);
- }
+ if (srv_print_all_deadlocks)
+ lock_table_print(stderr, &lock);
+ }
+ }
+ ATTRIBUTE_COLD
+ /** Report a deadlock (cycle in the waits-for graph).
+ @param trx transaction waiting for a lock in this thread
+ @param current_trx whether trx belongs to the current thread
+ @return the transaction to be rolled back (unless one was committed already)
+ @return nullptr if no deadlock */
+ static trx_t *report(trx_t *const trx, bool current_trx)
+ {
+ mysql_mutex_assert_owner(&lock_sys.wait_mutex);
+ ut_ad(xtest() || lock_sys.is_writer() == !current_trx);
+
+ /* Normally, trx should be a direct part of the deadlock
+ cycle. However, if innodb_deadlock_detect had been OFF in the
+ past, or if current_trx=false, trx may be waiting for a lock that
+ is held by a participant of a pre-existing deadlock, without being
+ part of the deadlock itself. That is, the path to the deadlock may be
+ P-shaped instead of O-shaped, with trx being at the foot of the P.
+
+ We will process the entire path leading to a cycle, and we will
+ choose the victim (to be aborted) among the cycle. */
+
+ static const char rollback_msg[]= "*** WE ROLL BACK TRANSACTION (%u)\n";
+ char buf[9 + sizeof rollback_msg];
+
+ /* If current_trx=true, trx is owned by this thread, and we can
+ safely invoke these without holding trx->mutex or lock_sys.latch.
+ If current_trx=false, a concurrent commit is protected by both
+ lock_sys.latch and lock_sys.wait_mutex. */
+ const undo_no_t trx_weight= TRX_WEIGHT(trx) |
+ (trx->mysql_thd &&
#ifdef WITH_WSREP
- if (wsrep_thd_is_BF(m_wait_lock->trx->mysql_thd, FALSE)) {
- return(m_start);
- }
+ (thd_has_edited_nontrans_tables(trx->mysql_thd) ||
+ (trx->is_wsrep() && wsrep_thd_is_BF(trx->mysql_thd, false)))
+#else
+ thd_has_edited_nontrans_tables(trx->mysql_thd)
#endif /* WITH_WSREP */
+ ? 1ULL << 63 : 0);
- return(m_wait_lock->trx);
-}
-
-/** Looks iteratively for a deadlock. Note: the joining transaction may
-have been granted its lock by the deadlock checks.
-@return 0 if no deadlock else the victim transaction instance.*/
-const trx_t*
-DeadlockChecker::search()
-{
- ut_ad(lock_mutex_own());
- ut_ad(!trx_mutex_own(m_start));
-
- ut_ad(m_start != NULL);
- ut_ad(m_wait_lock != NULL);
- ut_ad(!m_wait_lock->trx->auto_commit || m_wait_lock->trx->will_lock);
- ut_d(check_trx_state(m_wait_lock->trx));
- ut_ad(m_mark_start <= s_lock_mark_counter);
-
- /* Look at the locks ahead of wait_lock in the lock queue. */
- ulint heap_no;
- const lock_t* lock = get_first_lock(&heap_no);
-
- for (;;) {
- /* We should never visit the same sub-tree more than once. */
- ut_ad(lock == NULL || !is_visited(lock));
-
- while (m_n_elems > 0 && lock == NULL) {
-
- /* Restore previous search state. */
-
- pop(lock, heap_no);
-
- lock = get_next_lock(lock, heap_no);
- }
-
- if (lock == NULL) {
- break;
- }
-
- if (lock == m_wait_lock) {
+ trx_t *victim= nullptr;
+ undo_no_t victim_weight= ~0ULL;
+ unsigned victim_pos= 0, trx_pos= 0;
- /* We can mark this subtree as searched */
- ut_ad(lock->trx->lock.deadlock_mark <= m_mark_start);
-
- lock->trx->lock.deadlock_mark = ++s_lock_mark_counter;
-
- /* We are not prepared for an overflow. This 64-bit
- counter should never wrap around. At 10^9 increments
- per second, it would take 10^3 years of uptime. */
-
- ut_ad(s_lock_mark_counter > 0);
-
- /* Backtrack */
- lock = NULL;
- continue;
- }
-
- if (!lock_has_to_wait(m_wait_lock, lock)) {
- /* No conflict, next lock */
- lock = get_next_lock(lock, heap_no);
- continue;
- }
-
- if (lock->trx == m_start) {
- /* Found a cycle. */
- notify(lock);
- return select_victim();
- }
-
- if (is_too_deep()) {
- /* Search too deep to continue. */
- m_too_deep = true;
- return m_start;
- }
-
- /* We do not need to report autoinc locks to the upper
- layer. These locks are released before commit, so they
- can not cause deadlocks with binlog-fixed commit
- order. */
- if (m_report_waiters
- && (lock_get_type_low(lock) != LOCK_TABLE
- || lock_get_mode(lock) != LOCK_AUTO_INC)) {
- thd_rpl_deadlock_check(m_start->mysql_thd,
- lock->trx->mysql_thd);
- }
-
- if (lock->trx->lock.que_state == TRX_QUE_LOCK_WAIT) {
- /* Another trx ahead has requested a lock in an
- incompatible mode, and is itself waiting for a lock. */
-
- ++m_cost;
-
- if (!push(lock, heap_no)) {
- m_too_deep = true;
- return m_start;
- }
-
- m_wait_lock = lock->trx->lock.wait_lock;
-
- lock = get_first_lock(&heap_no);
-
- if (is_visited(lock)) {
- lock = get_next_lock(lock, heap_no);
- }
- } else {
- lock = get_next_lock(lock, heap_no);
- }
- }
-
- ut_a(lock == NULL && m_n_elems == 0);
-
- /* No deadlock found. */
- return(0);
-}
-
-/** Print info about transaction that was rolled back.
-@param trx transaction rolled back
-@param lock lock trx wants */
-void
-DeadlockChecker::rollback_print(const trx_t* trx, const lock_t* lock)
-{
- ut_ad(lock_mutex_own());
-
- /* If the lock search exceeds the max step
- or the max depth, the current trx will be
- the victim. Print its information. */
- start_print();
-
- print("TOO DEEP OR LONG SEARCH IN THE LOCK TABLE"
- " WAITS-FOR GRAPH, WE WILL ROLL BACK"
- " FOLLOWING TRANSACTION \n\n"
- "*** TRANSACTION:\n");
+ /* Here, lock elision does not make sense, because
+ for the output we are going to invoke system calls,
+ which would interrupt a memory transaction. */
+ if (current_trx && !lock_sys.wr_lock_try())
+ {
+ mysql_mutex_unlock(&lock_sys.wait_mutex);
+ lock_sys.wr_lock(SRW_LOCK_CALL);
+ mysql_mutex_lock(&lock_sys.wait_mutex);
+ }
- print(trx, 3000);
+ {
+ unsigned l= 0;
+ /* Now that we are holding lock_sys.wait_mutex again, check
+ whether a cycle still exists. */
+ trx_t *cycle= find_cycle(trx);
+ if (!cycle)
+ goto func_exit; /* One of the transactions was already aborted. */
+ for (trx_t *next= cycle;;)
+ {
+ next= next->lock.wait_trx;
+ l++;
+ const undo_no_t next_weight= TRX_WEIGHT(next) |
+ (next->mysql_thd &&
+#ifdef WITH_WSREP
+ (thd_has_edited_nontrans_tables(next->mysql_thd) ||
+ (next->is_wsrep() && wsrep_thd_is_BF(next->mysql_thd, false)))
+#else
+ thd_has_edited_nontrans_tables(next->mysql_thd)
+#endif /* WITH_WSREP */
+ ? 1ULL << 63 : 0);
+ if (next_weight < victim_weight)
+ {
+ victim_weight= next_weight;
+ victim= next;
+ victim_pos= l;
+ }
+ if (next == victim)
+ trx_pos= l;
+ if (next == cycle)
+ break;
+ }
- print("*** WAITING FOR THIS LOCK TO BE GRANTED:\n");
+ if (trx_pos && trx_weight == victim_weight)
+ {
+ victim= trx;
+ victim_pos= trx_pos;
+ }
- print(lock);
-}
+ /* Finally, display the deadlock */
+ switch (const auto r= static_cast<enum report>(innodb_deadlock_report)) {
+ case REPORT_OFF:
+ break;
+ case REPORT_BASIC:
+ case REPORT_FULL:
+ start_print();
+ l= 0;
-/** Rollback transaction selected as the victim. */
-void
-DeadlockChecker::trx_rollback()
-{
- ut_ad(lock_mutex_own());
+ for (trx_t *next= cycle;;)
+ {
+ next= next->lock.wait_trx;
+ ut_ad(next);
+ ut_ad(next->state == TRX_STATE_ACTIVE);
+ const lock_t *wait_lock= next->lock.wait_lock;
+ ut_ad(wait_lock);
+ snprintf(buf, sizeof buf, "\n*** (%u) TRANSACTION:\n", ++l);
+ print(buf);
+ print(*next);
+ print("*** WAITING FOR THIS LOCK TO BE GRANTED:\n");
+ print(*wait_lock);
+ if (r == REPORT_BASIC);
+ else if (wait_lock->is_table())
+ {
+ if (const lock_t *lock=
+ UT_LIST_GET_FIRST(wait_lock->un_member.tab_lock.table->locks))
+ {
+ ut_ad(!lock->is_waiting());
+ print("*** CONFLICTING WITH:\n");
+ do
+ print(*lock);
+ while ((lock= UT_LIST_GET_NEXT(un_member.tab_lock.locks, lock)) &&
+ !lock->is_waiting());
+ }
+ else
+ ut_ad("no conflicting table lock found" == 0);
+ }
+ else
+ {
+ const page_id_t id{wait_lock->un_member.rec_lock.page_id};
+ hash_cell_t &cell= *(wait_lock->type_mode & LOCK_PREDICATE
+ ? lock_sys.prdt_hash : lock_sys.rec_hash).
+ cell_get(id.fold());
+ if (const lock_t *lock= lock_sys_t::get_first(cell, id))
+ {
+ const ulint heap_no= lock_rec_find_set_bit(wait_lock);
+ if (!lock_rec_get_nth_bit(lock, heap_no))
+ lock= lock_rec_get_next_const(heap_no, lock);
+ ut_ad(!lock->is_waiting());
+ print("*** CONFLICTING WITH:\n");
+ do
+ print(*lock);
+ while ((lock= lock_rec_get_next_const(heap_no, lock)) &&
+ !lock->is_waiting());
+ }
+ else
+ ut_ad("no conflicting record lock found" == 0);
+ }
+ if (next == cycle)
+ break;
+ }
+ snprintf(buf, sizeof buf, rollback_msg, victim_pos);
+ print(buf);
+ }
- trx_t* trx = m_wait_lock->trx;
+ ut_ad(victim->state == TRX_STATE_ACTIVE);
- print("*** WE ROLL BACK TRANSACTION (1)\n");
+ /* victim->lock.was_chosen_as_deadlock_victim must always be set before
+ releasing waiting locks and reseting trx->lock.wait_lock */
+ victim->lock.was_chosen_as_deadlock_victim= true;
+ DEBUG_SYNC_C("deadlock_report_before_lock_releasing");
+ lock_cancel_waiting_and_release<true>(victim->lock.wait_lock);
#ifdef WITH_WSREP
- if (trx->is_wsrep() && wsrep_thd_is_SR(trx->mysql_thd)) {
- wsrep_handle_SR_rollback(m_start->mysql_thd, trx->mysql_thd);
- }
+ if (victim->is_wsrep() && wsrep_thd_is_SR(victim->mysql_thd))
+ wsrep_handle_SR_rollback(trx->mysql_thd, victim->mysql_thd);
#endif
+ }
- trx_mutex_enter(trx);
-
- trx->lock.was_chosen_as_deadlock_victim = true;
-
- lock_cancel_waiting_and_release(trx->lock.wait_lock);
-
- trx_mutex_exit(trx);
+func_exit:
+ if (current_trx)
+ lock_sys.wr_unlock();
+ return victim;
+ }
}
-/** Check if a joining lock request results in a deadlock.
-If a deadlock is found, we will resolve the deadlock by
-choosing a victim transaction and rolling it back.
-We will attempt to resolve all deadlocks.
-
-@param[in] lock the lock request
-@param[in,out] trx transaction requesting the lock
-
-@return trx if it was chosen as victim
-@retval NULL if another victim was chosen,
-or there is no deadlock (any more) */
-const trx_t*
-DeadlockChecker::check_and_resolve(const lock_t* lock, trx_t* trx)
+/** Check if a lock request results in a deadlock.
+Resolve a deadlock by choosing a transaction that will be rolled back.
+@param trx transaction requesting a lock
+@return whether trx must report DB_DEADLOCK */
+static bool Deadlock::check_and_resolve(trx_t *trx)
{
- ut_ad(lock_mutex_own());
- ut_ad(trx_mutex_own(trx));
- ut_ad(trx->state == TRX_STATE_ACTIVE);
- ut_ad(!trx->auto_commit || trx->will_lock);
- ut_ad(!srv_read_only_mode);
-
- if (!innobase_deadlock_detect) {
- return(NULL);
- }
-
- /* Release the mutex to obey the latching order.
- This is safe, because DeadlockChecker::check_and_resolve()
- is invoked when a lock wait is enqueued for the currently
- running transaction. Because m_trx is a running transaction
- (it is not currently suspended because of a lock wait),
- its state can only be changed by this thread, which is
- currently associated with the transaction. */
-
- trx_mutex_exit(trx);
-
- const trx_t* victim_trx;
- const bool report_waiters = trx->mysql_thd
- && thd_need_wait_reports(trx->mysql_thd);
-
- /* Try and resolve as many deadlocks as possible. */
- do {
- DeadlockChecker checker(trx, lock, s_lock_mark_counter,
- report_waiters);
-
- victim_trx = checker.search();
+ mysql_mutex_assert_owner(&lock_sys.wait_mutex);
- /* Search too deep, we rollback the joining transaction only
- if it is possible to rollback. Otherwise we rollback the
- transaction that is holding the lock that the joining
- transaction wants. */
- if (checker.is_too_deep()) {
-
- ut_ad(trx == checker.m_start);
- ut_ad(trx == victim_trx);
-
- rollback_print(victim_trx, lock);
-
- MONITOR_INC(MONITOR_DEADLOCK);
- srv_stats.lock_deadlock_count.inc();
-
- break;
-
- } else if (victim_trx != NULL && victim_trx != trx) {
-
- ut_ad(victim_trx == checker.m_wait_lock->trx);
+ ut_ad(!trx->mutex_is_owner());
+ ut_ad(trx->state == TRX_STATE_ACTIVE);
+ ut_ad(!srv_read_only_mode);
- checker.trx_rollback();
+ if (!innodb_deadlock_detect)
+ return false;
- lock_deadlock_found = true;
+ if (UNIV_LIKELY_NULL(find_cycle(trx)) && report(trx, true) == trx)
+ return true;
- MONITOR_INC(MONITOR_DEADLOCK);
- srv_stats.lock_deadlock_count.inc();
- }
+ if (UNIV_LIKELY(!trx->lock.was_chosen_as_deadlock_victim))
+ return false;
- } while (victim_trx != NULL && victim_trx != trx);
+ if (lock_t *wait_lock= trx->lock.wait_lock)
+ lock_sys_t::cancel<false>(trx, wait_lock);
- /* If the joining transaction was selected as the victim. */
- if (victim_trx != NULL) {
+ lock_sys.deadlock_check();
+ return true;
+}
- print("*** WE ROLL BACK TRANSACTION (2)\n");
-#ifdef WITH_WSREP
- if (trx->is_wsrep() && wsrep_thd_is_SR(trx->mysql_thd)) {
- wsrep_handle_SR_rollback(trx->mysql_thd,
- victim_trx->mysql_thd);
- }
+/** Check for deadlocks while holding only lock_sys.wait_mutex. */
+TRANSACTIONAL_TARGET
+void lock_sys_t::deadlock_check()
+{
+ ut_ad(!is_writer());
+ mysql_mutex_assert_owner(&wait_mutex);
+ bool acquired= false;
+#if !defined NO_ELISION && !defined SUX_LOCK_GENERIC
+ bool elided= false;
#endif
- lock_deadlock_found = true;
- }
-
- trx_mutex_enter(trx);
-
- return(victim_trx);
+ if (Deadlock::to_be_checked)
+ {
+ for (;;)
+ {
+ auto i= Deadlock::to_check.begin();
+ if (i == Deadlock::to_check.end())
+ break;
+ if (acquired);
+#if !defined NO_ELISION && !defined SUX_LOCK_GENERIC
+ else if (xbegin())
+ {
+ if (latch.is_locked_or_waiting())
+ xabort();
+ acquired= elided= true;
+ }
+#endif
+ else
+ {
+ acquired= wr_lock_try();
+ if (!acquired)
+ {
+ acquired= true;
+ mysql_mutex_unlock(&wait_mutex);
+ lock_sys.wr_lock(SRW_LOCK_CALL);
+ mysql_mutex_lock(&wait_mutex);
+ continue;
+ }
+ }
+ trx_t *trx= *i;
+ Deadlock::to_check.erase(i);
+ if (Deadlock::find_cycle(trx))
+ Deadlock::report(trx, false);
+ }
+ Deadlock::to_be_checked= false;
+ }
+ ut_ad(Deadlock::to_check.empty());
+#if !defined NO_ELISION && !defined SUX_LOCK_GENERIC
+ if (elided)
+ return;
+#endif
+ if (acquired)
+ wr_unlock();
}
-/*************************************************************//**
-Updates the lock table when a page is split and merged to
-two pages. */
-UNIV_INTERN
-void
-lock_update_split_and_merge(
+/** Update the locks when a page is split and merged to two pages,
+in defragmentation. */
+void lock_update_split_and_merge(
const buf_block_t* left_block, /*!< in: left page to which merged */
const rec_t* orig_pred, /*!< in: original predecessor of
supremum on the left page before merge*/
const buf_block_t* right_block) /*!< in: right page from which merged */
{
- const rec_t* left_next_rec;
-
- ut_ad(page_is_leaf(left_block->frame));
- ut_ad(page_is_leaf(right_block->frame));
- ut_ad(page_align(orig_pred) == left_block->frame);
+ ut_ad(page_is_leaf(left_block->page.frame));
+ ut_ad(page_is_leaf(right_block->page.frame));
+ ut_ad(page_align(orig_pred) == left_block->page.frame);
- lock_mutex_enter();
-
- left_next_rec = page_rec_get_next_const(orig_pred);
- ut_ad(!page_rec_is_metadata(left_next_rec));
-
- /* Inherit the locks on the supremum of the left page to the
- first record which was moved from the right page */
- lock_rec_inherit_to_gap(
- left_block, left_block,
- page_rec_get_heap_no(left_next_rec),
- PAGE_HEAP_NO_SUPREMUM);
-
- /* Reset the locks on the supremum of the left page,
- releasing waiting transactions */
- lock_rec_reset_and_release_wait(left_block,
- PAGE_HEAP_NO_SUPREMUM);
-
- /* Inherit the locks to the supremum of the left page from the
- successor of the infimum on the right page */
- lock_rec_inherit_to_gap(left_block, right_block,
- PAGE_HEAP_NO_SUPREMUM,
- lock_get_min_heap_no(right_block));
-
- lock_mutex_exit();
+ const page_id_t l{left_block->page.id()};
+ const page_id_t r{right_block->page.id()};
+ const rec_t *left_next_rec= page_rec_get_next_const(orig_pred);
+ if (UNIV_UNLIKELY(!left_next_rec))
+ {
+ ut_ad("corrupted page" == 0);
+ return;
+ }
+ ut_ad(!page_rec_is_metadata(left_next_rec));
+
+ /* This would likely be too large for a memory transaction. */
+ LockMultiGuard g{lock_sys.rec_hash, l, r};
+
+ /* Inherit the locks on the supremum of the left page to the
+ first record which was moved from the right page */
+ lock_rec_inherit_to_gap(g.cell1(), l, g.cell1(), l, left_block->page.frame,
+ page_rec_get_heap_no(left_next_rec),
+ PAGE_HEAP_NO_SUPREMUM);
+
+ /* Reset the locks on the supremum of the left page,
+ releasing waiting transactions */
+ lock_rec_reset_and_release_wait(g.cell1(), l, PAGE_HEAP_NO_SUPREMUM);
+
+ /* Inherit the locks to the supremum of the left page from the
+ successor of the infimum on the right page */
+ lock_rec_inherit_to_gap(g.cell1(), l, g.cell2(), r, left_block->page.frame,
+ PAGE_HEAP_NO_SUPREMUM,
+ lock_get_min_heap_no(right_block));
}
diff --git a/storage/innobase/lock/lock0prdt.cc b/storage/innobase/lock/lock0prdt.cc
index dc1d8df4be1..2975659138d 100644
--- a/storage/innobase/lock/lock0prdt.cc
+++ b/storage/innobase/lock/lock0prdt.cc
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 2014, 2016, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2018, 2020, MariaDB Corporation.
+Copyright (c) 2018, 2022, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -175,7 +175,7 @@ lock_prdt_has_to_wait(
if (trx != lock2->trx
&& !lock_mode_compatible(static_cast<lock_mode>(
LOCK_MODE_MASK & type_mode),
- lock_get_mode(lock2))) {
+ lock2->mode())) {
/* If it is a page lock, then return true (conflict) */
if (type_mode & LOCK_PRDT_PAGE) {
@@ -228,38 +228,31 @@ lock_t*
lock_prdt_has_lock(
/*===============*/
ulint precise_mode, /*!< in: LOCK_S or LOCK_X */
- unsigned type_mode, /*!< in: LOCK_PREDICATE etc. */
- const buf_block_t* block, /*!< in: buffer block
- containing the record */
+ hash_cell_t& cell, /*!< hash table cell of id */
+ const page_id_t id, /*!< in: page identifier */
lock_prdt_t* prdt, /*!< in: The predicate to be
attached to the new lock */
const trx_t* trx) /*!< in: transaction */
{
- lock_t* lock;
-
- ut_ad(lock_mutex_own());
ut_ad((precise_mode & LOCK_MODE_MASK) == LOCK_S
|| (precise_mode & LOCK_MODE_MASK) == LOCK_X);
ut_ad(!(precise_mode & LOCK_INSERT_INTENTION));
- for (lock = lock_rec_get_first(
- lock_hash_get(type_mode), block, PRDT_HEAPNO);
- lock != NULL;
+ for (lock_t*lock= lock_sys_t::get_first(cell, id, PRDT_HEAPNO);
+ lock;
lock = lock_rec_get_next(PRDT_HEAPNO, lock)) {
ut_ad(lock->type_mode & (LOCK_PREDICATE | LOCK_PRDT_PAGE));
if (lock->trx == trx
- && !(lock->type_mode & LOCK_INSERT_INTENTION)
- && !lock_get_wait(lock)
+ && !(lock->type_mode & (LOCK_INSERT_INTENTION | LOCK_WAIT))
&& lock_mode_stronger_or_eq(
- lock_get_mode(lock),
+ lock->mode(),
static_cast<lock_mode>(
precise_mode & LOCK_MODE_MASK))) {
if (lock->type_mode & LOCK_PRDT_PAGE) {
return(lock);
}
- ut_ad(lock->type_mode & LOCK_PREDICATE);
lock_prdt_t* cur_prdt = lock_get_prdt_from_lock(
lock);
@@ -288,17 +281,14 @@ lock_prdt_other_has_conflicting(
unsigned mode, /*!< in: LOCK_S or LOCK_X,
possibly ORed to LOCK_PREDICATE or
LOCK_PRDT_PAGE, LOCK_INSERT_INTENTION */
- const buf_block_t* block, /*!< in: buffer block containing
- the record */
+ const hash_cell_t& cell, /*!< in: hash table cell */
+ const page_id_t id, /*!< in: page identifier */
lock_prdt_t* prdt, /*!< in: Predicates (currently)
the Minimum Bounding Rectangle)
the new lock will be on */
const trx_t* trx) /*!< in: our transaction */
{
- ut_ad(lock_mutex_own());
-
- for (lock_t* lock = lock_rec_get_first(
- lock_hash_get(mode), block, PRDT_HEAPNO);
+ for (lock_t* lock = lock_sys_t::get_first(cell, id, PRDT_HEAPNO);
lock != NULL;
lock = lock_rec_get_next(PRDT_HEAPNO, lock)) {
@@ -390,12 +380,10 @@ lock_prdt_find_on_page(
lock_prdt_t* prdt, /*!< in: MBR with the lock */
const trx_t* trx) /*!< in: transaction */
{
- lock_t* lock;
+ const page_id_t id{block->page.id()};
+ hash_cell_t& cell = *lock_sys.hash_get(type_mode).cell_get(id.fold());
- ut_ad(lock_mutex_own());
-
- for (lock = lock_sys.get_first(*lock_hash_get(type_mode),
- block->page.id());
+ for (lock_t *lock = lock_sys_t::get_first(cell, id);
lock != NULL;
lock = lock_rec_get_next_on_page(lock)) {
@@ -425,8 +413,7 @@ lock_t*
lock_prdt_add_to_queue(
/*===================*/
unsigned type_mode,/*!< in: lock mode, wait, predicate
- etc. flags; type is ignored
- and replaced by LOCK_REC */
+ etc. flags */
const buf_block_t* block, /*!< in: buffer block containing
the record */
dict_index_t* index, /*!< in: index of record */
@@ -437,9 +424,9 @@ lock_prdt_add_to_queue(
/*!< in: TRUE if caller owns the
transaction mutex */
{
- ut_ad(lock_mutex_own());
- ut_ad(caller_owns_trx_mutex == trx_mutex_own(trx));
- ut_ad(!dict_index_is_clust(index) && !dict_index_is_online_ddl(index));
+ ut_ad(caller_owns_trx_mutex == trx->mutex_is_owner());
+ ut_ad(index->is_spatial());
+ ut_ad(!dict_index_is_online_ddl(index));
ut_ad(type_mode & (LOCK_PREDICATE | LOCK_PRDT_PAGE));
#ifdef UNIV_DEBUG
@@ -452,49 +439,40 @@ lock_prdt_add_to_queue(
}
#endif /* UNIV_DEBUG */
- type_mode |= LOCK_REC;
-
- /* Look for a waiting lock request on the same record or on a gap */
-
- lock_t* lock;
-
- for (lock = lock_sys.get_first(*lock_hash_get(type_mode),
- block->page.id());
- lock != NULL;
- lock = lock_rec_get_next_on_page(lock)) {
-
- if (lock_get_wait(lock)
- && lock_rec_get_nth_bit(lock, PRDT_HEAPNO)
- && lock->type_mode & (LOCK_PREDICATE | LOCK_PRDT_PAGE)) {
-
- break;
+ /* Try to extend a similar non-waiting lock on the same page */
+ if (!(type_mode & LOCK_WAIT)) {
+ const page_id_t id{block->page.id()};
+ hash_cell_t& cell = *lock_sys.hash_get(type_mode).
+ cell_get(id.fold());
+
+ for (lock_t* lock = lock_sys_t::get_first(cell, id);
+ lock; lock = lock_rec_get_next_on_page(lock)) {
+ if (lock->is_waiting()
+ && lock->type_mode
+ & (LOCK_PREDICATE | LOCK_PRDT_PAGE)
+ && lock_rec_get_nth_bit(lock, PRDT_HEAPNO)) {
+ goto create;
+ }
}
- }
-
- if (lock == NULL && !(type_mode & LOCK_WAIT)) {
-
- /* Look for a similar record lock on the same page:
- if one is found and there are no waiting lock requests,
- we can just set the bit */
-
- lock = lock_prdt_find_on_page(type_mode, block, prdt, trx);
-
- if (lock != NULL) {
+ if (lock_t* lock = lock_prdt_find_on_page(type_mode, block,
+ prdt, trx)) {
if (lock->type_mode & LOCK_PREDICATE) {
lock_prdt_enlarge_prdt(lock, prdt);
}
- return(lock);
+ return lock;
}
}
- lock = lock_rec_create(
-#ifdef WITH_WSREP
- NULL, NULL, /* FIXME: replicate SPATIAL INDEX locks */
-#endif
- type_mode, block, PRDT_HEAPNO, index, trx,
- caller_owns_trx_mutex);
+create:
+ /* Note: We will not pass any conflicting lock to lock_rec_create(),
+ because we should be moving an existing waiting lock request. */
+ ut_ad(!(type_mode & LOCK_WAIT) || trx->lock.wait_trx);
+
+ lock_t* lock = lock_rec_create(nullptr,
+ type_mode, block, PRDT_HEAPNO, index,
+ trx, caller_owns_trx_mutex);
if (lock->type_mode & LOCK_PREDICATE) {
lock_prdt_set_prdt(lock, prdt);
@@ -510,8 +488,6 @@ a predicate record.
dberr_t
lock_prdt_insert_check_and_lock(
/*============================*/
- ulint flags, /*!< in: if BTR_NO_LOCKING_FLAG bit is
- set, does nothing */
const rec_t* rec, /*!< in: record after which to insert */
buf_block_t* block, /*!< in/out: buffer block of rec */
dict_index_t* index, /*!< in: index */
@@ -520,95 +496,55 @@ lock_prdt_insert_check_and_lock(
lock_prdt_t* prdt) /*!< in: Predicates with Minimum Bound
Rectangle */
{
- ut_ad(block->frame == page_align(rec));
-
- if (flags & BTR_NO_LOCKING_FLAG) {
-
- return(DB_SUCCESS);
- }
-
- ut_ad(!index->table->is_temporary());
- ut_ad(!dict_index_is_clust(index));
-
- trx_t* trx = thr_get_trx(thr);
-
- lock_mutex_enter();
-
- /* Because this code is invoked for a running transaction by
- the thread that is serving the transaction, it is not necessary
- to hold trx->mutex here. */
-
- ut_ad(lock_table_has(trx, index->table, LOCK_IX));
-
- lock_t* lock;
-
- /* Only need to check locks on prdt_hash */
- lock = lock_rec_get_first(&lock_sys.prdt_hash, block, PRDT_HEAPNO);
-
- if (lock == NULL) {
- lock_mutex_exit();
-
- /* Update the page max trx id field */
- page_update_max_trx_id(block, buf_block_get_page_zip(block),
- trx->id, mtr);
-
- return(DB_SUCCESS);
- }
-
- ut_ad(lock->type_mode & LOCK_PREDICATE);
-
- dberr_t err;
-
- /* If another transaction has an explicit lock request which locks
- the predicate, waiting or granted, on the successor, the insert
- has to wait.
-
- Similar to GAP lock, we do not consider lock from inserts conflicts
- with each other */
-
- const ulint mode = LOCK_X | LOCK_PREDICATE | LOCK_INSERT_INTENTION;
-
- const lock_t* wait_for = lock_prdt_other_has_conflicting(
- mode, block, prdt, trx);
-
- if (wait_for != NULL) {
- rtr_mbr_t* mbr = prdt_get_mbr_from_prdt(prdt);
-
- /* Allocate MBR on the lock heap */
- lock_init_prdt_from_mbr(prdt, mbr, 0, trx->lock.lock_heap);
-
- /* Note that we may get DB_SUCCESS also here! */
- trx_mutex_enter(trx);
-
- err = lock_rec_enqueue_waiting(
-#ifdef WITH_WSREP
- NULL, /* FIXME: replicate SPATIAL INDEX locks */
-#endif
- LOCK_X | LOCK_PREDICATE | LOCK_INSERT_INTENTION,
- block, PRDT_HEAPNO, index, thr, prdt);
-
- trx_mutex_exit(trx);
- } else {
- err = DB_SUCCESS;
- }
-
- lock_mutex_exit();
-
- switch (err) {
- case DB_SUCCESS_LOCKED_REC:
- err = DB_SUCCESS;
- /* fall through */
- case DB_SUCCESS:
- /* Update the page max trx id field */
- page_update_max_trx_id(block,
- buf_block_get_page_zip(block),
- trx->id, mtr);
- default:
- /* We only care about the two return values. */
- break;
- }
-
- return(err);
+ ut_ad(block->page.frame == page_align(rec));
+ ut_ad(!index->table->is_temporary());
+ ut_ad(index->is_spatial());
+
+ trx_t *trx= thr_get_trx(thr);
+ const page_id_t id{block->page.id()};
+ dberr_t err= DB_SUCCESS;
+
+ {
+ LockGuard g{lock_sys.prdt_hash, id};
+ /* Because this code is invoked for a running transaction by
+ the thread that is serving the transaction, it is not necessary
+ to hold trx->mutex here. */
+ ut_ad(lock_table_has(trx, index->table, LOCK_IX));
+
+ /* Only need to check locks on prdt_hash */
+ if (ut_d(lock_t *lock=) lock_sys_t::get_first(g.cell(), id, PRDT_HEAPNO))
+ {
+ ut_ad(lock->type_mode & LOCK_PREDICATE);
+
+ /* If another transaction has an explicit lock request which locks
+ the predicate, waiting or granted, on the successor, the insert
+ has to wait.
+
+ Similar to GAP lock, we do not consider lock from inserts conflicts
+ with each other */
+
+ const ulint mode= LOCK_X | LOCK_PREDICATE | LOCK_INSERT_INTENTION;
+ lock_t *c_lock= lock_prdt_other_has_conflicting(mode, g.cell(), id,
+ prdt, trx);
+
+ if (c_lock)
+ {
+ rtr_mbr_t *mbr= prdt_get_mbr_from_prdt(prdt);
+ trx->mutex_lock();
+ /* Allocate MBR on the lock heap */
+ lock_init_prdt_from_mbr(prdt, mbr, 0, trx->lock.lock_heap);
+ err= lock_rec_enqueue_waiting(c_lock, mode, id, block->page.frame,
+ PRDT_HEAPNO, index, thr, prdt);
+ trx->mutex_unlock();
+ }
+ }
+ }
+
+ if (err == DB_SUCCESS)
+ /* Update the page max trx id field */
+ page_update_max_trx_id(block, buf_block_get_page_zip(block), trx->id, mtr);
+
+ return err;
}
/**************************************************************//**
@@ -623,10 +559,12 @@ lock_prdt_update_parent(
lock_prdt_t* right_prdt, /*!< in: MBR on the new page */
const page_id_t page_id) /*!< in: parent page */
{
- lock_mutex_enter();
+ auto fold= page_id.fold();
+ LockMutexGuard g{SRW_LOCK_CALL};
+ hash_cell_t& cell = *lock_sys.prdt_hash.cell_get(fold);
/* Get all locks in parent */
- for (lock_t *lock = lock_sys.get_first_prdt(page_id);
+ for (lock_t *lock = lock_sys_t::get_first(cell, page_id);
lock;
lock = lock_rec_get_next_on_page(lock)) {
lock_prdt_t* lock_prdt;
@@ -649,7 +587,7 @@ lock_prdt_update_parent(
lock_prdt_add_to_queue(lock->type_mode,
left_block, lock->index,
lock->trx, lock_prdt,
- FALSE);
+ false);
}
if (!lock_prdt_consistent(lock_prdt, right_prdt, op)
@@ -657,11 +595,9 @@ lock_prdt_update_parent(
lock_prdt, lock->trx)) {
lock_prdt_add_to_queue(lock->type_mode, right_block,
lock->index, lock->trx,
- lock_prdt, FALSE);
+ lock_prdt, false);
}
}
-
- lock_mutex_exit();
}
/**************************************************************//**
@@ -673,25 +609,22 @@ lock_prdt_update_split_low(
buf_block_t* new_block, /*!< in/out: the new half page */
lock_prdt_t* prdt, /*!< in: MBR on the old page */
lock_prdt_t* new_prdt, /*!< in: MBR on the new page */
- const page_id_t page_id, /*!< in: page number */
+ const page_id_t id, /*!< in: page number */
unsigned type_mode) /*!< in: LOCK_PREDICATE or
LOCK_PRDT_PAGE */
{
- lock_t* lock;
+ hash_cell_t& cell = *lock_sys.hash_get(type_mode).cell_get(id.fold());
- for (lock = lock_sys.get_first(*lock_hash_get(type_mode), page_id);
+ for (lock_t* lock = lock_sys_t::get_first(cell, id);
lock;
lock = lock_rec_get_next_on_page(lock)) {
/* First dealing with Page Lock */
if (lock->type_mode & LOCK_PRDT_PAGE) {
/* Duplicate the lock to new page */
- trx_mutex_enter(lock->trx);
lock_prdt_add_to_queue(lock->type_mode,
new_block,
lock->index,
- lock->trx, NULL, TRUE);
-
- trx_mutex_exit(lock->trx);
+ lock->trx, nullptr, false);
continue;
}
@@ -708,27 +641,11 @@ lock_prdt_update_split_low(
lock_prdt = lock_get_prdt_from_lock(lock);
- if (lock_prdt_consistent(lock_prdt, prdt, op)) {
-
- if (!lock_prdt_consistent(lock_prdt, new_prdt, op)) {
- /* Move the lock to new page */
- trx_mutex_enter(lock->trx);
- lock_prdt_add_to_queue(lock->type_mode,
- new_block,
- lock->index,
- lock->trx, lock_prdt,
- TRUE);
- trx_mutex_exit(lock->trx);
- }
- } else if (!lock_prdt_consistent(lock_prdt, new_prdt, op)) {
- /* Duplicate the lock to new page */
- trx_mutex_enter(lock->trx);
- lock_prdt_add_to_queue(lock->type_mode,
- new_block,
- lock->index,
- lock->trx, lock_prdt, TRUE);
-
- trx_mutex_exit(lock->trx);
+ if (!lock_prdt_consistent(lock_prdt, new_prdt, op)) {
+ /* Move the lock to new page */
+ lock_prdt_add_to_queue(lock->type_mode, new_block,
+ lock->index, lock->trx,
+ lock_prdt, false);
}
}
}
@@ -743,15 +660,12 @@ lock_prdt_update_split(
lock_prdt_t* new_prdt, /*!< in: MBR on the new page */
const page_id_t page_id) /*!< in: page number */
{
- lock_mutex_enter();
-
+ LockMutexGuard g{SRW_LOCK_CALL};
lock_prdt_update_split_low(new_block, prdt, new_prdt,
page_id, LOCK_PREDICATE);
lock_prdt_update_split_low(new_block, NULL, NULL,
page_id, LOCK_PRDT_PAGE);
-
- lock_mutex_exit();
}
/*********************************************************************//**
@@ -767,8 +681,7 @@ lock_init_prdt_from_mbr(
memset(prdt, 0, sizeof(*prdt));
if (heap != NULL) {
- prdt->data = mem_heap_alloc(heap, sizeof(*mbr));
- memcpy(prdt->data, mbr, sizeof(*mbr));
+ prdt->data = mem_heap_dup(heap, mbr, sizeof *mbr);
} else {
prdt->data = static_cast<void*>(mbr);
}
@@ -807,75 +720,56 @@ lock_prdt_lock(
ut_ad(!dict_index_is_online_ddl(index));
ut_ad(type_mode & (LOCK_PREDICATE | LOCK_PRDT_PAGE));
- const hash_table_t& hash = type_mode == LOCK_PREDICATE
- ? lock_sys.prdt_hash
- : lock_sys.prdt_page_hash;
+ auto& hash = lock_sys.prdt_hash_get(type_mode != LOCK_PREDICATE);
+ const page_id_t id{block->page.id()};
/* Another transaction cannot have an implicit lock on the record,
because when we come here, we already have modified the clustered
index record, and this would not have been possible if another active
transaction had modified this secondary index record. */
- lock_mutex_enter();
+ LockGuard g{hash, id};
const unsigned prdt_mode = type_mode | mode;
- lock_t* lock = lock_sys.get_first(hash, block->page.id());
+ lock_t* lock = lock_sys_t::get_first(g.cell(), id);
if (lock == NULL) {
lock = lock_rec_create(
-#ifdef WITH_WSREP
- NULL, NULL, /* FIXME: replicate SPATIAL INDEX locks */
-#endif
+ NULL,
prdt_mode, block, PRDT_HEAPNO,
index, trx, FALSE);
status = LOCK_REC_SUCCESS_CREATED;
} else {
- trx_mutex_enter(trx);
-
if (lock_rec_get_next_on_page(lock)
|| lock->trx != trx
- || lock->type_mode != (LOCK_REC | prdt_mode)
+ || lock->type_mode != prdt_mode
|| lock_rec_get_n_bits(lock) == 0
|| ((type_mode & LOCK_PREDICATE)
&& (!lock_prdt_consistent(
lock_get_prdt_from_lock(lock), prdt, 0)))) {
+ trx->mutex_lock();
lock = lock_prdt_has_lock(
- mode, type_mode, block, prdt, trx);
-
- if (lock == NULL) {
-
- lock_t* wait_for;
-
- wait_for = lock_prdt_other_has_conflicting(
- prdt_mode, block, prdt, trx);
-
- if (wait_for != NULL) {
-
- err = lock_rec_enqueue_waiting(
-#ifdef WITH_WSREP
- NULL, /* FIXME: replicate
- SPATIAL INDEX locks */
-#endif
- prdt_mode,
- block, PRDT_HEAPNO,
- index, thr, prdt);
- } else {
-
- lock_prdt_add_to_queue(
- prdt_mode, block, index, trx,
- prdt, true);
-
- status = LOCK_REC_SUCCESS;
- }
+ mode, g.cell(), id, prdt, trx);
+
+ if (lock) {
+ } else if (lock_t* wait_for
+ = lock_prdt_other_has_conflicting(
+ prdt_mode, g.cell(), id, prdt,
+ trx)) {
+ err = lock_rec_enqueue_waiting(
+ wait_for, prdt_mode, id,
+ block->page.frame, PRDT_HEAPNO,
+ index, thr, prdt);
+ } else {
+ lock_prdt_add_to_queue(
+ prdt_mode, block, index, trx,
+ prdt, true);
}
- trx_mutex_exit(trx);
-
+ trx->mutex_unlock();
} else {
- trx_mutex_exit(trx);
-
if (!lock_rec_get_nth_bit(lock, PRDT_HEAPNO)) {
lock_rec_set_nth_bit(lock, PRDT_HEAPNO);
status = LOCK_REC_SUCCESS_CREATED;
@@ -883,8 +777,6 @@ lock_prdt_lock(
}
}
- lock_mutex_exit();
-
if (status == LOCK_REC_SUCCESS_CREATED && type_mode == LOCK_PREDICATE) {
/* Append the predicate in the lock record */
lock_prdt_set_prdt(lock, prdt);
@@ -903,9 +795,9 @@ lock_place_prdt_page_lock(
que_thr_t* thr) /*!< in: query thread */
{
ut_ad(thr != NULL);
- ut_ad(!srv_read_only_mode);
+ ut_ad(!high_level_read_only);
- ut_ad(!dict_index_is_clust(index));
+ ut_ad(index->is_spatial());
ut_ad(!dict_index_is_online_ddl(index));
if (index->table->is_temporary()) {
return DB_SUCCESS;
@@ -916,34 +808,26 @@ lock_place_prdt_page_lock(
index record, and this would not have been possible if another active
transaction had modified this secondary index record. */
- lock_mutex_enter();
+ LockGuard g{lock_sys.prdt_page_hash, page_id};
- const lock_t* lock = lock_sys.get_first_prdt_page(page_id);
+ const lock_t* lock = lock_sys_t::get_first(g.cell(), page_id);
const ulint mode = LOCK_S | LOCK_PRDT_PAGE;
trx_t* trx = thr_get_trx(thr);
if (lock != NULL) {
-
- trx_mutex_enter(trx);
-
/* Find a matching record lock owned by this transaction. */
while (lock != NULL && lock->trx != trx) {
-
lock = lock_rec_get_next_on_page_const(lock);
}
- ut_ad(lock == NULL || lock->type_mode == (mode | LOCK_REC));
+ ut_ad(lock == NULL || lock->type_mode == mode);
ut_ad(lock == NULL || lock_rec_get_n_bits(lock) != 0);
-
- trx_mutex_exit(trx);
}
if (lock == NULL) {
lock = lock_rec_create_low(
-#ifdef WITH_WSREP
- NULL, NULL, /* FIXME: replicate SPATIAL INDEX locks */
-#endif
+ NULL,
mode, page_id, NULL, PRDT_HEAPNO,
index, trx, FALSE);
@@ -952,8 +836,6 @@ lock_place_prdt_page_lock(
#endif /* PRDT_DIAG */
}
- lock_mutex_exit();
-
return(DB_SUCCESS);
}
@@ -963,15 +845,9 @@ lock_place_prdt_page_lock(
@return true if there is none */
bool lock_test_prdt_page_lock(const trx_t *trx, const page_id_t page_id)
{
- lock_t* lock;
-
- lock_mutex_enter();
-
- lock = lock_sys.get_first_prdt_page(page_id);
-
- lock_mutex_exit();
-
- return(!lock || trx == lock->trx);
+ LockGuard g{lock_sys.prdt_page_hash, page_id};
+ lock_t *lock= lock_sys_t::get_first(g.cell(), page_id);
+ return !lock || trx == lock->trx;
}
/*************************************************************//**
@@ -982,50 +858,71 @@ lock_prdt_rec_move(
/*===============*/
const buf_block_t* receiver, /*!< in: buffer block containing
the receiving record */
- const buf_block_t* donator) /*!< in: buffer block containing
- the donating record */
+ const page_id_t donator) /*!< in: target page */
{
- lock_mutex_enter();
+ LockMultiGuard g{lock_sys.prdt_hash, receiver->page.id(), donator};
- for (lock_t *lock = lock_rec_get_first(&lock_sys.prdt_hash,
- donator, PRDT_HEAPNO);
- lock != NULL;
+ for (lock_t *lock = lock_sys_t::get_first(g.cell2(), donator,
+ PRDT_HEAPNO);
+ lock;
lock = lock_rec_get_next(PRDT_HEAPNO, lock)) {
const auto type_mode = lock->type_mode;
lock_prdt_t* lock_prdt = lock_get_prdt_from_lock(lock);
lock_rec_reset_nth_bit(lock, PRDT_HEAPNO);
- lock_reset_lock_and_trx_wait(lock);
-
+ if (type_mode & LOCK_WAIT) {
+ ut_ad(lock->trx->lock.wait_lock == lock);
+ lock->type_mode &= ~LOCK_WAIT;
+ }
lock_prdt_add_to_queue(
type_mode, receiver, lock->index, lock->trx,
- lock_prdt, FALSE);
+ lock_prdt, false);
}
-
- lock_mutex_exit();
}
-/** Removes predicate lock objects set on an index page which is discarded.
-@param[in] block page to be discarded
-@param[in] lock_hash lock hash */
-void
-lock_prdt_page_free_from_discard(
- const buf_block_t* block,
- hash_table_t* lock_hash)
+/** Remove locks on a discarded SPATIAL INDEX page.
+@param id page to be discarded
+@param page whether to discard also from lock_sys.prdt_hash */
+void lock_sys_t::prdt_page_free_from_discard(const page_id_t id, bool all)
{
- lock_t* lock;
- lock_t* next_lock;
-
- ut_ad(lock_mutex_own());
-
- lock = lock_sys.get_first(*lock_hash, block->page.id());
-
- while (lock != NULL) {
- next_lock = lock_rec_get_next_on_page(lock);
-
- lock_rec_discard(lock);
-
- lock = next_lock;
- }
+ const auto id_fold= id.fold();
+ rd_lock(SRW_LOCK_CALL);
+ auto cell= prdt_page_hash.cell_get(id_fold);
+ auto latch= hash_table::latch(cell);
+ latch->acquire();
+
+ for (lock_t *lock= get_first(*cell, id), *next; lock; lock= next)
+ {
+ next= lock_rec_get_next_on_page(lock);
+ lock_rec_discard(prdt_page_hash, lock);
+ }
+
+ if (all)
+ {
+ latch->release();
+ cell= prdt_hash.cell_get(id_fold);
+ latch= hash_table::latch(cell);
+ latch->acquire();
+ for (lock_t *lock= get_first(*cell, id), *next; lock; lock= next)
+ {
+ next= lock_rec_get_next_on_page(lock);
+ lock_rec_discard(prdt_hash, lock);
+ }
+ }
+
+ latch->release();
+ cell= rec_hash.cell_get(id_fold);
+ latch= hash_table::latch(cell);
+ latch->acquire();
+
+ for (lock_t *lock= get_first(*cell, id), *next; lock; lock= next)
+ {
+ next= lock_rec_get_next_on_page(lock);
+ lock_rec_discard(rec_hash, lock);
+ }
+
+ latch->release();
+ /* Must be last, to avoid a race with lock_sys_t::hash_table::resize() */
+ rd_unlock();
}
diff --git a/storage/innobase/log/log0crypt.cc b/storage/innobase/log/log0crypt.cc
index dbf41c7dc3f..d035808c6b9 100644
--- a/storage/innobase/log/log0crypt.cc
+++ b/storage/innobase/log/log0crypt.cc
@@ -347,9 +347,7 @@ found:
/** Add the encryption information to a redo log checkpoint buffer.
@param[in,out] buf checkpoint buffer */
-UNIV_INTERN
-void
-log_crypt_write_checkpoint_buf(byte* buf)
+void log_crypt_write_checkpoint_buf(byte *buf)
{
ut_ad(info.key_version);
compile_time_assert(16 == sizeof info.crypt_msg);
@@ -397,9 +395,7 @@ bool log_crypt_read_checkpoint_buf(const byte* buf)
@param[in] offs offset to block
@param[in] encrypt true=encrypt; false=decrypt
@return whether the operation succeeded */
-UNIV_INTERN
-bool
-log_tmp_block_encrypt(
+bool log_tmp_block_encrypt(
const byte* src,
ulint size,
byte* dst,
diff --git a/storage/innobase/log/log0log.cc b/storage/innobase/log/log0log.cc
index 61b0d30fec2..c53e2fd5074 100644
--- a/storage/innobase/log/log0log.cc
+++ b/storage/innobase/log/log0log.cc
@@ -50,7 +50,6 @@ Created 12/9/1995 Heikki Tuuri
#include "trx0trx.h"
#include "trx0roll.h"
#include "srv0mon.h"
-#include "sync0sync.h"
#include "buf0dump.h"
#include "log0sync.h"
#include "log.h"
@@ -148,9 +147,7 @@ log_set_capacity(ulonglong file_size)
free = LOG_CHECKPOINT_FREE_PER_THREAD * 10
+ LOG_CHECKPOINT_EXTRA_FREE;
if (free >= smallest_capacity / 2) {
- ib::error() << "Cannot continue operation because log file is "
- "too small. Increase innodb_log_file_size "
- "or decrease innodb_thread_concurrency. "
+ ib::error() << "innodb_log_file_size is too small. "
<< INNODB_PARAMETERS_MSG;
return false;
}
@@ -177,8 +174,14 @@ void log_t::create()
ut_ad(!is_initialised());
m_initialised= true;
+#if defined(__aarch64__)
+ mysql_mutex_init(log_sys_mutex_key, &mutex, MY_MUTEX_INIT_FAST);
+ mysql_mutex_init(
+ log_flush_order_mutex_key, &flush_order_mutex, MY_MUTEX_INIT_FAST);
+#else
mysql_mutex_init(log_sys_mutex_key, &mutex, nullptr);
mysql_mutex_init(log_flush_order_mutex_key, &flush_order_mutex, nullptr);
+#endif
/* Start the lsn from one log block from zero: this way every
log record has a non-zero start lsn, a fact which we will use */
@@ -272,7 +275,8 @@ dberr_t file_os_io::close() noexcept
dberr_t file_os_io::read(os_offset_t offset, span<byte> buf) noexcept
{
- return os_file_read(IORequestRead, m_fd, buf.data(), offset, buf.size());
+ return os_file_read(IORequestRead, m_fd, buf.data(), offset, buf.size(),
+ nullptr);
}
dberr_t file_os_io::write(const char *path, os_offset_t offset,
@@ -794,6 +798,7 @@ bool log_write_lock_own()
}
#endif
+
/** Ensure that the log has been written to the log file up to a given
log entry (such as that of a transaction commit). Start a new write, or
wait and check if an already running write is covering the request.
@@ -802,7 +807,8 @@ included in the redo log file write
@param[in] flush_to_disk whether the written log should also
be flushed to the file system
@param[in] rotate_key whether to rotate the encryption key */
-void log_write_up_to(lsn_t lsn, bool flush_to_disk, bool rotate_key)
+void log_write_up_to(lsn_t lsn, bool flush_to_disk, bool rotate_key,
+ const completion_callback *callback)
{
ut_ad(!srv_read_only_mode);
ut_ad(!rotate_key || flush_to_disk);
@@ -812,39 +818,57 @@ void log_write_up_to(lsn_t lsn, bool flush_to_disk, bool rotate_key)
{
/* Recovery is running and no operations on the log files are
allowed yet (the variable name .._no_ibuf_.. is misleading) */
+ ut_a(!callback);
return;
}
- if (flush_to_disk &&
- flush_lock.acquire(lsn) != group_commit_lock::ACQUIRED)
+repeat:
+ lsn_t ret_lsn1= 0, ret_lsn2= 0;
+
+ if (flush_to_disk)
{
- return;
+ if (flush_lock.acquire(lsn, callback) != group_commit_lock::ACQUIRED)
+ return;
+ flush_lock.set_pending(log_sys.get_lsn());
}
- if (write_lock.acquire(lsn) == group_commit_lock::ACQUIRED)
+ if (write_lock.acquire(lsn, flush_to_disk ? nullptr : callback) ==
+ group_commit_lock::ACQUIRED)
{
mysql_mutex_lock(&log_sys.mutex);
lsn_t write_lsn= log_sys.get_lsn();
write_lock.set_pending(write_lsn);
-
+ if (flush_to_disk)
+ flush_lock.set_pending(write_lsn);
log_write(rotate_key);
ut_a(log_sys.write_lsn == write_lsn);
- write_lock.release(write_lsn);
+ ret_lsn1= write_lock.release(write_lsn);
}
- if (!flush_to_disk)
+ if (flush_to_disk)
{
- return;
+ /* Flush the highest written lsn.*/
+ auto flush_lsn = write_lock.value();
+ flush_lock.set_pending(flush_lsn);
+ log_write_flush_to_disk_low(flush_lsn);
+ ret_lsn2= flush_lock.release(flush_lsn);
+
+ log_flush_notify(flush_lsn);
+ DBUG_EXECUTE_IF("crash_after_log_write_upto", DBUG_SUICIDE(););
}
- /* Flush the highest written lsn.*/
- auto flush_lsn = write_lock.value();
- flush_lock.set_pending(flush_lsn);
- log_write_flush_to_disk_low(flush_lsn);
- flush_lock.release(flush_lsn);
-
- log_flush_notify(flush_lsn);
+ if (ret_lsn1 || ret_lsn2)
+ {
+ /*
+ There is no new group commit lead, some async waiters could stall.
+ Rerun log_write_up_to(), to prevent that.
+ */
+ lsn= std::max(ret_lsn1, ret_lsn2);
+ static const completion_callback dummy{[](void *) {},nullptr};
+ callback= &dummy;
+ goto repeat;
+ }
}
/** Write to the log file up to the last log entry.
@@ -860,9 +884,9 @@ ATTRIBUTE_COLD void log_write_and_flush_prepare()
{
mysql_mutex_assert_not_owner(&log_sys.mutex);
- while (flush_lock.acquire(log_sys.get_lsn() + 1) !=
+ while (flush_lock.acquire(log_sys.get_lsn() + 1, nullptr) !=
group_commit_lock::ACQUIRED);
- while (write_lock.acquire(log_sys.get_lsn() + 1) !=
+ while (write_lock.acquire(log_sys.get_lsn() + 1, nullptr) !=
group_commit_lock::ACQUIRED);
}
@@ -1020,7 +1044,8 @@ func_exit:
/* We must wait to prevent the tail of the log overwriting the head. */
buf_flush_wait_flushed(std::min(sync_lsn, checkpoint + (1U << 20)));
- os_thread_sleep(10000); /* Sleep 10ms to avoid a thundering herd */
+ /* Sleep to avoid a thundering herd */
+ std::this_thread::sleep_for(std::chrono::milliseconds(10));
}
}
@@ -1070,7 +1095,7 @@ ATTRIBUTE_COLD void logs_empty_and_mark_files_at_shutdown()
buf_dump_start();
}
srv_monitor_timer.reset();
- lock_sys.timeout_timer.reset();
+
if (do_srv_shutdown) {
srv_shutdown(srv_fast_shutdown == 0);
}
@@ -1083,7 +1108,7 @@ loop:
#define COUNT_INTERVAL 600U
#define CHECK_INTERVAL 100000U
- os_thread_sleep(CHECK_INTERVAL);
+ std::this_thread::sleep_for(std::chrono::microseconds(CHECK_INTERVAL));
count++;
@@ -1133,7 +1158,7 @@ wait_suspend_loop:
ut_ad(!srv_any_background_activity());
if (srv_n_fil_crypt_threads_started) {
- os_event_set(fil_crypt_threads_event);
+ fil_crypt_threads_signal(true);
thread_name = "fil_crypt_thread";
goto wait_suspend_loop;
}
@@ -1148,14 +1173,6 @@ wait_suspend_loop:
if (!buf_pool.is_initialised()) {
ut_ad(!srv_was_started);
- } else if (ulint pending_io = buf_pool.io_pending()) {
- if (srv_print_verbose_log && count > 600) {
- ib::info() << "Waiting for " << pending_io << " buffer"
- " page I/Os to complete";
- count = 0;
- }
-
- goto loop;
} else {
buf_flush_buffer_pool();
}
@@ -1181,11 +1198,8 @@ wait_suspend_loop:
if (srv_fast_shutdown == 2 || !srv_was_started) {
if (!srv_read_only_mode && srv_was_started) {
sql_print_information(
- "InnoDB: Executing innodb_fast_shutdown=2 "
- "(without flushing the InnoDB buffer pool"
- " to data files)."
- " The next mariadbd"
- " invocation will perform crash recovery!");
+ "InnoDB: Executing innodb_fast_shutdown=2."
+ " Next startup will execute crash recovery!");
/* In this fastest shutdown we do not flush the
buffer pool:
@@ -1193,10 +1207,7 @@ wait_suspend_loop:
it is essentially a 'crash' of the InnoDB server.
Make sure that the log is all flushed to disk, so
that we can recover all committed transactions in
- a crash recovery. We must not write the lsn stamps
- to the data files, since at a startup InnoDB deduces
- from the stamps if the previous shutdown was clean. */
-
+ a crash recovery. */
log_buffer_flush_to_disk();
}
@@ -1358,11 +1369,15 @@ std::string get_log_file_path(const char *filename)
path.reserve(size);
path.assign(srv_log_group_home_dir);
- std::replace(path.begin(), path.end(), OS_PATH_SEPARATOR_ALT,
- OS_PATH_SEPARATOR);
-
- if (path.back() != OS_PATH_SEPARATOR)
- path.push_back(OS_PATH_SEPARATOR);
+ switch (path.back()) {
+#ifdef _WIN32
+ case '\\':
+#endif
+ case '/':
+ break;
+ default:
+ path.push_back('/');
+ }
path.append(filename);
return path;
diff --git a/storage/innobase/log/log0recv.cc b/storage/innobase/log/log0recv.cc
index 86e7f43015c..3b6e3008a95 100644
--- a/storage/innobase/log/log0recv.cc
+++ b/storage/innobase/log/log0recv.cc
@@ -87,7 +87,7 @@ is bigger than the lsn we are able to scan up to, that is an indication that
the recovery failed and the database may be corrupt. */
static lsn_t recv_max_page_lsn;
-/** Stored physical log record with logical LSN (@see log_t::FORMAT_10_5) */
+/** Stored physical log record */
struct log_phys_t : public log_rec_t
{
/** start LSN of the mini-transaction (not necessarily of this record) */
@@ -159,7 +159,7 @@ public:
{
ut_ad(len > 2);
byte *free_p= my_assume_aligned<2>
- (TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_FREE + block.frame);
+ (TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_FREE + block.page.frame);
const uint16_t free= mach_read_from_2(free_p);
if (UNIV_UNLIKELY(free < TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_HDR_SIZE ||
free + len + 6 >= srv_page_size - FIL_PAGE_DATA_END))
@@ -169,7 +169,7 @@ public:
return true;
}
- byte *p= block.frame + free;
+ byte *p= block.page.frame + free;
mach_write_to_2(free_p, free + 4 + len);
memcpy(p, free_p, 2);
p+= 2;
@@ -179,6 +179,35 @@ public:
return false;
}
+ /** Check an OPT_PAGE_CHECKSUM record.
+ @see mtr_t::page_checksum()
+ @param block buffer page
+ @param l pointer to checksum
+ @return whether an unrecoverable mismatch was found */
+ static bool page_checksum(const buf_block_t &block, const byte *l)
+ {
+ size_t size;
+ const byte *page= block.page.zip.data;
+ if (UNIV_LIKELY_NULL(page))
+ size= (UNIV_ZIP_SIZE_MIN >> 1) << block.page.zip.ssize;
+ else
+ {
+ page= block.page.frame;
+ size= srv_page_size;
+ }
+ if (UNIV_LIKELY(my_crc32c(my_crc32c(my_crc32c(0, page + FIL_PAGE_OFFSET,
+ FIL_PAGE_LSN -
+ FIL_PAGE_OFFSET),
+ page + FIL_PAGE_TYPE, 2),
+ page + FIL_PAGE_SPACE_ID,
+ size - (FIL_PAGE_SPACE_ID + 8)) ==
+ mach_read_from_4(l)))
+ return false;
+
+ ib::error() << "OPT_PAGE_CHECKSUM mismatch on " << block.page.id();
+ return !srv_force_recovery;
+ }
+
/** The status of apply() */
enum apply_status {
/** The page was not affected */
@@ -200,8 +229,8 @@ public:
apply_status apply(const buf_block_t &block, uint16_t &last_offset) const
{
const byte * const recs= begin();
- byte *const frame= block.page.zip.ssize
- ? block.page.zip.data : block.frame;
+ byte *const frame= block.page.zip.data
+ ? block.page.zip.data : block.page.frame;
const size_t size= block.physical_size();
apply_status applied= APPLIED_NO;
@@ -259,15 +288,25 @@ public:
record_corrupted:
if (!srv_force_recovery)
{
- recv_sys.found_corrupt_log= true;
+ recv_sys.set_corrupt_log();
return applied;
}
next_not_same_page:
last_offset= 1; /* the next record must not be same_page */
}
- next:
l+= rlen;
continue;
+ case OPTION:
+ ut_ad(rlen == 5);
+ ut_ad(*l == OPT_PAGE_CHECKSUM);
+ if (page_checksum(block, l + 1))
+ {
+page_corrupted:
+ sql_print_error("InnoDB: Set innodb_force_recovery=1"
+ " to ignore corruption.");
+ return APPLIED_CORRUPTED;
+ }
+ goto next_after_applying;
}
ut_ad(mach_read_from_4(frame + FIL_PAGE_OFFSET) ==
@@ -278,8 +317,6 @@ public:
ut_ad(last_offset <= size);
switch (b & 0x70) {
- case OPTION:
- goto next;
case EXTENDED:
if (UNIV_UNLIKELY(block.page.id().page_no() < 3 ||
block.page.zip.ssize))
@@ -308,11 +345,7 @@ public:
if (UNIV_UNLIKELY(rlen <= 3))
goto record_corrupted;
if (undo_append(block, ++l, --rlen) && !srv_force_recovery)
- {
-page_corrupted:
- ib::error() << "Set innodb_force_recovery=1 to ignore corruption.";
- return APPLIED_CORRUPTED;
- }
+ goto page_corrupted;
break;
case INSERT_HEAP_REDUNDANT:
case INSERT_REUSE_REDUNDANT:
@@ -585,19 +618,431 @@ static recv_spaces_t recv_spaces;
/** The last parsed FILE_RENAME records */
static std::map<uint32_t,std::string> renamed_spaces;
+/** Files for which fil_ibd_load() returned FIL_LOAD_DEFER */
+static struct
+{
+ /** Maintains the last opened defer file name along with lsn */
+ struct item
+ {
+ /** Log sequence number of latest add() called by fil_name_process() */
+ lsn_t lsn;
+ /** File name from the FILE_ record */
+ std::string file_name;
+ /** whether a FILE_DELETE record was encountered */
+ mutable bool deleted;
+ };
+
+ using map= std::map<const uint32_t, item, std::less<const uint32_t>,
+ ut_allocator<std::pair<const uint32_t, item> > >;
+
+ /** Map of defer tablespaces */
+ map defers;
+
+ /** Add the deferred space only if it is latest one
+ @param space space identifier
+ @param f_name file name
+ @param lsn log sequence number of the FILE_ record */
+ void add(uint32_t space, const std::string &f_name, lsn_t lsn)
+ {
+ mysql_mutex_assert_owner(&recv_sys.mutex);
+ const char *filename= f_name.c_str();
+
+ if (srv_operation == SRV_OPERATION_RESTORE)
+ {
+ /* Replace absolute DATA DIRECTORY file paths with
+ short names relative to the backup directory. */
+ const char *name= strrchr(filename, '/');
+#ifdef _WIN32
+ if (const char *last= strrchr(filename, '\\'))
+ if (last > name)
+ name= last;
+#endif
+ if (name)
+ {
+ while (--name > filename &&
+#ifdef _WIN32
+ *name != '\\' &&
+#endif
+ *name != '/');
+ if (name > filename)
+ filename= name + 1;
+ }
+ }
+
+ char *fil_path= fil_make_filepath(nullptr, {filename, strlen(filename)},
+ IBD, false);
+ const item defer{lsn, fil_path, false};
+ ut_free(fil_path);
+
+ /* The file name must be unique. Keep the one with the latest LSN. */
+ auto d= defers.begin();
+
+ while (d != defers.end())
+ {
+ if (d->second.file_name != defer.file_name)
+ ++d;
+ else if (d->first == space)
+ {
+ /* Neither the file name nor the tablespace ID changed.
+ Update the LSN if needed. */
+ if (d->second.lsn < lsn)
+ d->second.lsn= lsn;
+ return;
+ }
+ else if (d->second.lsn < lsn)
+ {
+ /* Reset the old tablespace name in recovered spaces list */
+ recv_spaces_t::iterator it{recv_spaces.find(d->first)};
+ if (it != recv_spaces.end() &&
+ it->second.name == d->second.file_name)
+ it->second.name = "";
+ defers.erase(d++);
+ }
+ else
+ {
+ ut_ad(d->second.lsn != lsn);
+ return; /* A later tablespace already has this name. */
+ }
+ }
+
+ auto p= defers.emplace(space, defer);
+ if (!p.second && p.first->second.lsn <= lsn)
+ {
+ p.first->second.lsn= lsn;
+ p.first->second.file_name= defer.file_name;
+ }
+ /* Add the newly added defered space and change the file name */
+ recv_spaces_t::iterator it{recv_spaces.find(space)};
+ if (it != recv_spaces.end())
+ it->second.name = defer.file_name;
+ }
+
+ void remove(uint32_t space)
+ {
+ mysql_mutex_assert_owner(&recv_sys.mutex);
+ defers.erase(space);
+ }
+
+ /** Look up a tablespace that was found corrupted during recovery.
+ @param id tablespace id
+ @return tablespace whose creation was deferred
+ @retval nullptr if no such tablespace was found */
+ item *find(uint32_t id)
+ {
+ mysql_mutex_assert_owner(&recv_sys.mutex);
+ auto it= defers.find(id);
+ if (it != defers.end())
+ return &it->second;
+ return nullptr;
+ }
+
+ void clear()
+ {
+ mysql_mutex_assert_owner(&recv_sys.mutex);
+ defers.clear();
+ }
+
+ /** Initialize all deferred tablespaces.
+ @return whether any deferred initialization failed */
+ bool reinit_all()
+ {
+retry:
+ mysql_mutex_unlock(&log_sys.mutex);
+ bool fail= false;
+ buf_block_t *free_block= buf_LRU_get_free_block(false);
+ mysql_mutex_lock(&log_sys.mutex);
+ mysql_mutex_lock(&recv_sys.mutex);
+
+ for (auto d= defers.begin(); d != defers.end(); )
+ {
+ const uint32_t space_id{d->first};
+ recv_sys_t::map::iterator p{recv_sys.pages.lower_bound({space_id,0})};
+
+ if (d->second.deleted ||
+ p == recv_sys.pages.end() || p->first.space() != space_id)
+ {
+ /* We found a FILE_DELETE record for the tablespace, or
+ there were no buffered records. Either way, we must create a
+ dummy tablespace with the latest known name,
+ for dict_drop_index_tree(). */
+ while (p != recv_sys.pages.end() && p->first.space() == space_id)
+ {
+ recv_sys_t::map::iterator r= p++;
+ r->second.log.clear();
+ recv_sys.pages.erase(r);
+ }
+ recv_spaces_t::iterator it{recv_spaces.find(space_id)};
+ if (it != recv_spaces.end())
+ {
+ const std::string *name= &d->second.file_name;
+ if (d->second.deleted)
+ {
+ const auto r= renamed_spaces.find(space_id);
+ if (r != renamed_spaces.end())
+ name= &r->second;
+ bool exists;
+ os_file_type_t ftype;
+ if (!os_file_status(name->c_str(), &exists, &ftype) || !exists)
+ goto processed;
+ }
+ create(it, *name, static_cast<uint32_t>
+ (1U << FSP_FLAGS_FCRC32_POS_MARKER |
+ FSP_FLAGS_FCRC32_PAGE_SSIZE()), nullptr, 0);
+ }
+ }
+ else
+ fail= recv_sys.recover_deferred(p, d->second.file_name, free_block);
+processed:
+ defers.erase(d++);
+ if (fail)
+ break;
+ if (free_block)
+ continue;
+ mysql_mutex_unlock(&recv_sys.mutex);
+ goto retry;
+ }
+
+ clear();
+ mysql_mutex_unlock(&recv_sys.mutex);
+ if (free_block)
+ buf_pool.free_block(free_block);
+ return fail;
+ }
+
+ /** Create tablespace metadata for a data file that was initially
+ found corrupted during recovery.
+ @param it tablespace iterator
+ @param name latest file name
+ @param flags FSP_SPACE_FLAGS
+ @param crypt_data encryption metadata
+ @param size tablespace size in pages
+ @return tablespace
+ @retval nullptr if crypt_data is invalid */
+ static fil_space_t *create(const recv_spaces_t::const_iterator &it,
+ const std::string &name, uint32_t flags,
+ fil_space_crypt_t *crypt_data, uint32_t size)
+ {
+ if (crypt_data && !crypt_data->is_key_found())
+ {
+ crypt_data->~fil_space_crypt_t();
+ ut_free(crypt_data);
+ return nullptr;
+ }
+ fil_space_t *space= fil_space_t::create(it->first, flags,
+ FIL_TYPE_TABLESPACE, crypt_data);
+ ut_ad(space);
+ const char *filename= name.c_str();
+ if (srv_operation == SRV_OPERATION_RESTORE)
+ {
+ const char* tbl_name = strrchr(filename, '/');
+#ifdef _WIN32
+ if (const char *last = strrchr(filename, '\\'))
+ {
+ if (last > tbl_name)
+ tbl_name = last;
+ }
+#endif
+ if (tbl_name)
+ {
+ while (--tbl_name > filename &&
+#ifdef _WIN32
+ *tbl_name != '\\' &&
+#endif
+ *tbl_name != '/');
+ if (tbl_name > filename)
+ filename= tbl_name + 1;
+ }
+ }
+ space->add(filename, OS_FILE_CLOSED, size, false, false);
+ space->recv_size= it->second.size;
+ space->size_in_header= size;
+ return space;
+ }
+
+ /** Attempt to recover pages from the doublewrite buffer.
+ This is invoked if we found neither a valid first page in the
+ data file nor redo log records that would initialize the first
+ page. */
+ void deferred_dblwr()
+ {
+ for (auto d= defers.begin(); d != defers.end(); )
+ {
+ if (d->second.deleted)
+ {
+ next_item:
+ d++;
+ continue;
+ }
+ const page_id_t page_id{d->first, 0};
+ const byte *page= recv_sys.dblwr.find_page(page_id);
+ if (!page)
+ goto next_item;
+ const uint32_t space_id= mach_read_from_4(page + FIL_PAGE_SPACE_ID);
+ const uint32_t flags= fsp_header_get_flags(page);
+ const uint32_t page_no= mach_read_from_4(page + FIL_PAGE_OFFSET);
+ const uint32_t size= fsp_header_get_field(page, FSP_SIZE);
+
+ if (page_no == 0 && space_id == d->first && size >= 4 &&
+ fil_space_t::is_valid_flags(flags, space_id) &&
+ fil_space_t::logical_size(flags) == srv_page_size)
+ {
+ recv_spaces_t::iterator it {recv_spaces.find(d->first)};
+ ut_ad(it != recv_spaces.end());
+
+ fil_space_t *space= create(
+ it, d->second.file_name.c_str(), flags,
+ fil_space_read_crypt_data(fil_space_t::zip_size(flags), page),
+ size);
+
+ if (!space)
+ goto next_item;
+
+ space->free_limit= fsp_header_get_field(page, FSP_FREE_LIMIT);
+ space->free_len= flst_get_len(FSP_HEADER_OFFSET + FSP_FREE + page);
+ fil_node_t *node= UT_LIST_GET_FIRST(space->chain);
+ if (!space->acquire())
+ {
+free_space:
+ fil_space_free(it->first, false);
+ goto next_item;
+ }
+ if (os_file_write(IORequestWrite, node->name, node->handle,
+ page, 0, fil_space_t::physical_size(flags)) !=
+ DB_SUCCESS)
+ {
+ space->release();
+ goto free_space;
+ }
+ space->release();
+ it->second.space= space;
+ defers.erase(d++);
+ continue;
+ }
+ goto next_item;
+ }
+ }
+}
+deferred_spaces;
+
+/** Try to recover a tablespace that was not readable earlier
+@param p iterator, initially pointing to page_id_t{space_id,0};
+ the records will be freed and the iterator advanced
+@param name tablespace file name
+@param free_block spare buffer block
+@return whether recovery failed */
+bool recv_sys_t::recover_deferred(recv_sys_t::map::iterator &p,
+ const std::string &name,
+ buf_block_t *&free_block)
+{
+ mysql_mutex_assert_owner(&mutex);
+
+ const page_id_t first{p->first};
+ ut_ad(first.space());
+
+ recv_spaces_t::iterator it{recv_spaces.find(first.space())};
+ ut_ad(it != recv_spaces.end());
+
+ if (!first.page_no() && p->second.state == page_recv_t::RECV_WILL_NOT_READ)
+ {
+ mtr_t mtr;
+ buf_block_t *block= recover_low(first, p, mtr, free_block);
+ ut_ad(block == free_block || block == reinterpret_cast<buf_block_t*>(-1));
+ free_block= nullptr;
+ if (UNIV_UNLIKELY(!block || block == reinterpret_cast<buf_block_t*>(-1)))
+ goto fail;
+ const byte *page= UNIV_LIKELY_NULL(block->page.zip.data)
+ ? block->page.zip.data
+ : block->page.frame;
+ const uint32_t space_id= mach_read_from_4(page + FIL_PAGE_SPACE_ID);
+ const uint32_t flags= fsp_header_get_flags(page);
+ const uint32_t page_no= mach_read_from_4(page + FIL_PAGE_OFFSET);
+ const uint32_t size= fsp_header_get_field(page, FSP_SIZE);
+
+ ut_ad(it != recv_spaces.end());
+
+ if (page_id_t{space_id, page_no} == first && size >= 4 &&
+ it != recv_spaces.end() &&
+ fil_space_t::is_valid_flags(flags, space_id) &&
+ fil_space_t::logical_size(flags) == srv_page_size)
+ {
+ fil_space_t *space= deferred_spaces.create(it, name, flags,
+ fil_space_read_crypt_data
+ (fil_space_t::zip_size(flags),
+ page), size);
+ if (!space)
+ goto release_and_fail;
+ space->free_limit= fsp_header_get_field(page, FSP_FREE_LIMIT);
+ space->free_len= flst_get_len(FSP_HEADER_OFFSET + FSP_FREE + page);
+ fil_node_t *node= UT_LIST_GET_FIRST(space->chain);
+ node->deferred= true;
+ if (!space->acquire())
+ goto release_and_fail;
+ fil_names_dirty(space);
+ const bool is_compressed= fil_space_t::is_compressed(flags);
+#ifdef _WIN32
+ const bool is_sparse= is_compressed;
+ if (is_compressed)
+ os_file_set_sparse_win32(node->handle);
+#else
+ const bool is_sparse= is_compressed &&
+ DB_SUCCESS == os_file_punch_hole(node->handle, 0, 4096) &&
+ !my_test_if_thinly_provisioned(node->handle);
+#endif
+ /* Mimic fil_node_t::read_page0() in case the file exists and
+ has already been extended to a larger size. */
+ ut_ad(node->size == size);
+ const os_offset_t file_size= os_file_get_size(node->handle);
+ if (file_size != os_offset_t(-1))
+ {
+ const uint32_t n_pages=
+ uint32_t(file_size / fil_space_t::physical_size(flags));
+ if (n_pages > size)
+ {
+ space->size= node->size= n_pages;
+ space->set_committed_size();
+ goto size_set;
+ }
+ }
+ if (!os_file_set_size(node->name, node->handle,
+ (size * fil_space_t::physical_size(flags)) &
+ ~4095ULL, is_sparse))
+ {
+ space->release();
+ goto release_and_fail;
+ }
+ size_set:
+ node->deferred= false;
+ space->release();
+ it->second.space= space;
+ block->page.lock.x_unlock();
+ return false;
+ }
+
+ release_and_fail:
+ block->page.lock.x_unlock();
+ }
+
+fail:
+ ib::error() << "Cannot apply log to " << first
+ << " of corrupted file '" << name << "'";
+ return true;
+}
+
/** Report an operation to create, delete, or rename a file during backup.
@param[in] space_id tablespace identifier
-@param[in] create whether the file is being created
+@param[in] type redo log type
@param[in] name file name (not NUL-terminated)
@param[in] len length of name, in bytes
@param[in] new_name new file name (NULL if not rename)
@param[in] new_len length of new_name, in bytes (0 if NULL) */
-void (*log_file_op)(ulint space_id, bool create,
+void (*log_file_op)(ulint space_id, int type,
const byte* name, ulint len,
const byte* new_name, ulint new_len);
void (*undo_space_trunc)(uint32_t space_id);
+void (*first_page_init)(ulint space_id);
+
/** Information about initializing page contents during redo log processing.
FIXME: Rely on recv_sys.pages! */
class mlog_init_t
@@ -630,7 +1075,7 @@ public:
@return whether the state was changed */
bool add(const page_id_t page_id, lsn_t lsn)
{
- ut_ad(mutex_own(&recv_sys.mutex));
+ mysql_mutex_assert_owner(&recv_sys.mutex);
const init init = { lsn, false };
std::pair<map::iterator, bool> p = inits.insert(
map::value_type(page_id, init));
@@ -649,7 +1094,7 @@ public:
not valid after releasing recv_sys.mutex. */
init& last(page_id_t page_id)
{
- ut_ad(mutex_own(&recv_sys.mutex));
+ mysql_mutex_assert_owner(&recv_sys.mutex);
return inits.find(page_id)->second;
}
@@ -659,7 +1104,7 @@ public:
@return whether page_id will be freed or initialized after lsn */
bool will_avoid_read(page_id_t page_id, lsn_t lsn) const
{
- ut_ad(mutex_own(&recv_sys.mutex));
+ mysql_mutex_assert_owner(&recv_sys.mutex);
auto i= inits.find(page_id);
return i != inits.end() && i->second.lsn > lsn;
}
@@ -667,7 +1112,7 @@ public:
/** At the end of each recovery batch, reset the 'created' flags. */
void reset()
{
- ut_ad(mutex_own(&recv_sys.mutex));
+ mysql_mutex_assert_owner(&recv_sys.mutex);
ut_ad(recv_no_ibuf_operations);
for (map::value_type& i : inits) {
i.second.created = false;
@@ -680,7 +1125,7 @@ public:
@param[in,out] mtr dummy mini-transaction */
void mark_ibuf_exist(mtr_t& mtr)
{
- ut_ad(mutex_own(&recv_sys.mutex));
+ mysql_mutex_assert_owner(&recv_sys.mutex);
mtr.start();
for (const map::value_type& i : inits) {
@@ -689,7 +1134,7 @@ public:
}
if (buf_block_t* block = buf_page_get_low(
i.first, 0, RW_X_LATCH, nullptr,
- BUF_GET_IF_IN_POOL, __FILE__, __LINE__,
+ BUF_GET_IF_IN_POOL,
&mtr, nullptr, false)) {
if (UNIV_LIKELY_NULL(block->page.zip.data)) {
switch (fil_page_get_type(
@@ -698,7 +1143,7 @@ public:
case FIL_PAGE_RTREE:
if (page_zip_decompress(
&block->page.zip,
- block->frame,
+ block->page.frame,
true)) {
break;
}
@@ -711,16 +1156,19 @@ public:
mtr.start();
continue;
}
- mutex_exit(&recv_sys.mutex);
- block->page.ibuf_exist = ibuf_page_exists(
- block->page.id(), block->zip_size());
+ mysql_mutex_unlock(&recv_sys.mutex);
+ if (ibuf_page_exists(block->page.id(),
+ block->zip_size())) {
+ block->page.set_ibuf_exist();
+ }
mtr.commit();
mtr.start();
- mutex_enter(&recv_sys.mutex);
+ mysql_mutex_lock(&recv_sys.mutex);
}
}
mtr.commit();
+ clear();
}
/** Clear the data structure */
@@ -739,7 +1187,7 @@ inline void recv_sys_t::trim(const page_id_t page_id, lsn_t lsn)
DBUG_LOG("ib_log",
"discarding log beyond end of tablespace "
<< page_id << " before LSN " << lsn);
- ut_ad(mutex_own(&mutex));
+ mysql_mutex_assert_owner(&mutex);
for (recv_sys_t::map::iterator p = pages.lower_bound(page_id);
p != pages.end() && p->first.space() == page_id.space();) {
recv_sys_t::map::iterator r = p++;
@@ -779,15 +1227,17 @@ inline size_t recv_sys_t::files_size()
}
/** Process a file name from a FILE_* record.
-@param[in,out] name file name
+@param[in] name file name
@param[in] len length of the file name
@param[in] space_id the tablespace ID
-@param[in] deleted whether this is a FILE_DELETE record */
-static
-void
-fil_name_process(char* name, ulint len, ulint space_id, bool deleted)
+@param[in] ftype FILE_MODIFY, FILE_DELETE, or FILE_RENAME
+@param[in] lsn lsn of the redo log
+@param[in] store whether the redo log has to be stored */
+static void fil_name_process(const char *name, ulint len, uint32_t space_id,
+ mfile_type_t ftype, lsn_t lsn, store_t store)
{
- if (srv_operation == SRV_OPERATION_BACKUP) {
+ if (srv_operation == SRV_OPERATION_BACKUP
+ || srv_operation == SRV_OPERATION_BACKUP_NO_DEFER) {
return;
}
@@ -799,7 +1249,7 @@ fil_name_process(char* name, ulint len, ulint space_id, bool deleted)
further checks can ensure that a FILE_MODIFY record was
scanned before applying any page records for the space_id. */
- os_normalize_path(name);
+ const bool deleted{ftype == FILE_DELETE};
const file_name_t fname(std::string(name, len), deleted);
std::pair<recv_spaces_t::iterator,bool> p = recv_spaces.emplace(
space_id, fname);
@@ -807,9 +1257,17 @@ fil_name_process(char* name, ulint len, ulint space_id, bool deleted)
file_name_t& f = p.first->second;
+ if (auto d = deferred_spaces.find(space_id)) {
+ if (deleted) {
+ d->deleted = true;
+ goto got_deleted;
+ }
+ goto reload;
+ }
+
if (deleted) {
+got_deleted:
/* Got FILE_DELETE */
-
if (!p.second && f.status != file_name_t::DELETED) {
f.status = file_name_t::DELETED;
if (f.space != NULL) {
@@ -821,16 +1279,18 @@ fil_name_process(char* name, ulint len, ulint space_id, bool deleted)
ut_ad(f.space == NULL);
} else if (p.second // the first FILE_MODIFY or FILE_RENAME
|| f.name != fname.name) {
+reload:
fil_space_t* space;
/* Check if the tablespace file exists and contains
the space_id. If not, ignore the file after displaying
a note. Abort if there are multiple files with the
same space_id. */
- switch (fil_ibd_load(space_id, name, space)) {
+ switch (fil_ibd_load(space_id, fname.name.c_str(), space)) {
case FIL_LOAD_OK:
ut_ad(space != NULL);
+ deferred_spaces.remove(space_id);
if (!f.space) {
if (f.size
|| f.flags != f.initial_flags) {
@@ -847,9 +1307,10 @@ same_space:
} else {
ib::error() << "Tablespace " << space_id
<< " has been found in two places: '"
- << f.name << "' and '" << name << "'."
+ << f.name << "' and '"
+ << fname.name << "'."
" You must delete one of them.";
- recv_sys.found_corrupt_fs = true;
+ recv_sys.set_corrupt_fs();
}
break;
@@ -873,30 +1334,41 @@ same_space:
ib::info()
<< "At LSN: " << recv_sys.recovered_lsn
- << ": unable to open file " << name
+ << ": unable to open file "
+ << fname.name
<< " for tablespace " << space_id;
}
break;
+ case FIL_LOAD_DEFER:
+ /* Skip the deferred spaces
+ when lsn is already processed */
+ if (store != store_t::STORE_IF_EXISTS) {
+ deferred_spaces.add(
+ space_id, fname.name.c_str(), lsn);
+ }
+ break;
case FIL_LOAD_INVALID:
ut_ad(space == NULL);
if (srv_force_recovery == 0) {
sql_print_error("InnoDB: Recovery cannot access"
- " file %s (tablespace "
- ULINTPF ")", name, space_id);
+ " file %.*s (tablespace "
+ UINT32PF ")", int(len), name,
+ space_id);
sql_print_information("InnoDB: You may set "
"innodb_force_recovery=1"
" to ignore this and"
" possibly get a"
" corrupted database.");
- recv_sys.found_corrupt_fs = true;
+ recv_sys.set_corrupt_fs();
break;
}
sql_print_warning("InnoDB: Ignoring changes to"
- " file %s (tablespace " ULINTPF ")"
+ " file %.*s (tablespace "
+ UINT32PF ")"
" due to innodb_force_recovery",
- name, space_id);
+ int(len), name, space_id);
}
}
}
@@ -909,9 +1381,10 @@ void recv_sys_t::close()
if (is_initialised())
{
dblwr.pages.clear();
- ut_d(mutex_enter(&mutex));
+ ut_d(mysql_mutex_lock(&mutex));
clear();
- ut_d(mutex_exit(&mutex));
+ deferred_spaces.clear();
+ ut_d(mysql_mutex_unlock(&mutex));
if (buf)
{
@@ -920,13 +1393,13 @@ void recv_sys_t::close()
}
last_stored_lsn= 0;
- mutex_free(&mutex);
+ mysql_mutex_destroy(&mutex);
+ pthread_cond_destroy(&cond);
}
recv_spaces.clear();
renamed_spaces.clear();
mlog_init.clear();
-
close_files();
}
@@ -935,7 +1408,8 @@ void recv_sys_t::create()
{
ut_ad(this == &recv_sys);
ut_ad(!is_initialised());
- mutex_create(LATCH_ID_RECV_SYS, &mutex);
+ mysql_mutex_init(recv_sys_mutex_key, &mutex, nullptr);
+ pthread_cond_init(&cond, nullptr);
apply_log_recs = false;
apply_batch_on = false;
@@ -963,21 +1437,23 @@ void recv_sys_t::create()
/** Clear a fully processed set of stored redo log records. */
inline void recv_sys_t::clear()
{
- ut_ad(mutex_own(&mutex));
+ mysql_mutex_assert_owner(&mutex);
apply_log_recs= false;
apply_batch_on= false;
- ut_ad(!after_apply || !UT_LIST_GET_LAST(blocks));
+ ut_ad(!after_apply || found_corrupt_fs || !UT_LIST_GET_LAST(blocks));
pages.clear();
for (buf_block_t *block= UT_LIST_GET_LAST(blocks); block; )
{
buf_block_t *prev_block= UT_LIST_GET_PREV(unzip_LRU, block);
- ut_ad(block->page.state() == BUF_BLOCK_MEMORY);
+ ut_ad(block->page.state() == buf_page_t::MEMORY);
UT_LIST_REMOVE(blocks, block);
- MEM_MAKE_ADDRESSABLE(block->frame, srv_page_size);
+ MEM_MAKE_ADDRESSABLE(block->page.frame, srv_page_size);
buf_block_free(block);
block= prev_block;
}
+
+ pthread_cond_broadcast(&cond);
}
/** Free most recovery data structures. */
@@ -985,7 +1461,7 @@ void recv_sys_t::debug_free()
{
ut_ad(this == &recv_sys);
ut_ad(is_initialised());
- mutex_enter(&mutex);
+ mysql_mutex_lock(&mutex);
recovery_on= false;
pages.clear();
@@ -993,12 +1469,12 @@ void recv_sys_t::debug_free()
buf= nullptr;
- mutex_exit(&mutex);
+ mysql_mutex_unlock(&mutex);
}
inline void *recv_sys_t::alloc(size_t len)
{
- ut_ad(mutex_own(&mutex));
+ mysql_mutex_assert_owner(&mutex);
ut_ad(len);
ut_ad(len <= srv_page_size);
@@ -1011,9 +1487,9 @@ create_block:
ut_calc_align<uint16_t>(static_cast<uint16_t>(len), ALIGNMENT);
static_assert(ut_is_2pow(ALIGNMENT), "ALIGNMENT must be a power of 2");
UT_LIST_ADD_FIRST(blocks, block);
- MEM_MAKE_ADDRESSABLE(block->frame, len);
- MEM_NOACCESS(block->frame + len, srv_page_size - len);
- return my_assume_aligned<ALIGNMENT>(block->frame);
+ MEM_MAKE_ADDRESSABLE(block->page.frame, len);
+ MEM_NOACCESS(block->page.frame + len, srv_page_size - len);
+ return my_assume_aligned<ALIGNMENT>(block->page.frame);
}
size_t free_offset= static_cast<uint16_t>(block->page.access_time);
@@ -1031,8 +1507,8 @@ create_block:
block->page.access_time= ((block->page.access_time >> 16) + 1) << 16 |
ut_calc_align<uint16_t>(static_cast<uint16_t>(free_offset), ALIGNMENT);
- MEM_MAKE_ADDRESSABLE(block->frame + free_offset - len, len);
- return my_assume_aligned<ALIGNMENT>(block->frame + free_offset - len);
+ MEM_MAKE_ADDRESSABLE(block->page.frame + free_offset - len, len);
+ return my_assume_aligned<ALIGNMENT>(block->page.frame + free_offset - len);
}
@@ -1042,7 +1518,7 @@ inline void recv_sys_t::free(const void *data)
{
ut_ad(!ut_align_offset(data, ALIGNMENT));
data= page_align(data);
- ut_ad(mutex_own(&mutex));
+ mysql_mutex_assert_owner(&mutex);
/* MDEV-14481 FIXME: To prevent race condition with buf_pool.resize(),
we must acquire and hold the buffer pool mutex here. */
@@ -1051,22 +1527,22 @@ inline void recv_sys_t::free(const void *data)
auto *chunk= buf_pool.chunks;
for (auto i= buf_pool.n_chunks; i--; chunk++)
{
- if (data < chunk->blocks->frame)
+ if (data < chunk->blocks->page.frame)
continue;
const size_t offs= (reinterpret_cast<const byte*>(data) -
- chunk->blocks->frame) >> srv_page_size_shift;
+ chunk->blocks->page.frame) >> srv_page_size_shift;
if (offs >= chunk->size)
continue;
buf_block_t *block= &chunk->blocks[offs];
- ut_ad(block->frame == data);
- ut_ad(block->page.state() == BUF_BLOCK_MEMORY);
+ ut_ad(block->page.frame == data);
+ ut_ad(block->page.state() == buf_page_t::MEMORY);
ut_ad(static_cast<uint16_t>(block->page.access_time - 1) <
srv_page_size);
ut_ad(block->page.access_time >= 1U << 16);
if (!((block->page.access_time -= 1U << 16) >> 16))
{
UT_LIST_REMOVE(blocks, block);
- MEM_MAKE_ADDRESSABLE(block->frame, srv_page_size);
+ MEM_MAKE_ADDRESSABLE(block->page.frame, srv_page_size);
buf_block_free(block);
}
return;
@@ -1160,7 +1636,7 @@ fail:
if (dl < LOG_BLOCK_HDR_SIZE
|| (dl != OS_FILE_LOG_BLOCK_SIZE
&& dl > log_sys.trailer_offset())) {
- recv_sys.found_corrupt_log = true;
+ recv_sys.set_corrupt_log();
goto fail;
}
}
@@ -1629,7 +2105,7 @@ inline bool page_recv_t::trim(lsn_t start_lsn)
inline void page_recv_t::recs_t::clear()
{
- ut_ad(mutex_own(&recv_sys.mutex));
+ mysql_mutex_assert_owner(&recv_sys.mutex);
for (const log_rec_t *l= head; l; )
{
const log_rec_t *next= l->next;
@@ -1658,7 +2134,7 @@ inline void page_recv_t::will_not_read()
inline void recv_sys_t::add(map::iterator it, lsn_t start_lsn, lsn_t lsn,
const byte *l, size_t len)
{
- ut_ad(mutex_own(&mutex));
+ mysql_mutex_assert_owner(&mutex);
page_id_t page_id = it->first;
page_recv_t &recs= it->second;
@@ -1689,9 +2165,11 @@ append:
tail->append(l, len);
return;
}
- if (end <= &block->frame[used - ALIGNMENT] || &block->frame[used] >= end)
+ if (end <= &block->page.frame[used - ALIGNMENT] ||
+ &block->page.frame[used] >= end)
break; /* Not the last allocated record in the page */
- const size_t new_used= static_cast<size_t>(end - block->frame + len + 1);
+ const size_t new_used= static_cast<size_t>
+ (end - block->page.frame + len + 1);
ut_ad(new_used > used);
if (new_used > srv_page_size)
break;
@@ -1710,6 +2188,8 @@ static void store_freed_or_init_rec(page_id_t page_id, bool freed)
{
uint32_t space_id= page_id.space();
uint32_t page_no= page_id.page_no();
+ if (!freed && page_no == 0 && first_page_init)
+ first_page_init(space_id);
if (is_predefined_tablespace(space_id))
{
if (!srv_immediate_scrub_data_uncompressed)
@@ -1743,7 +2223,7 @@ or corruption was noticed */
bool recv_sys_t::parse(lsn_t checkpoint_lsn, store_t *store, bool apply)
{
mysql_mutex_assert_owner(&log_sys.mutex);
- ut_ad(mutex_own(&mutex));
+ mysql_mutex_assert_owner(&mutex);
ut_ad(parse_start_lsn);
ut_ad(log_sys.is_physical());
@@ -1921,7 +2401,8 @@ same_page:
if (got_page_op)
{
const page_id_t id(space_id, page_no);
- ut_d(if ((b & 0x70) == INIT_PAGE) freed.erase(id));
+ ut_d(if ((b & 0x70) == INIT_PAGE || (b & 0x70) == OPTION)
+ freed.erase(id));
ut_ad(freed.find(id) == freed.end());
switch (b & 0x70) {
case FREE_PAGE:
@@ -1962,8 +2443,11 @@ same_page:
}
last_offset= FIL_PAGE_TYPE;
break;
- case RESERVED:
case OPTION:
+ if (rlen == 5 && *l == OPT_PAGE_CHECKSUM)
+ break;
+ /* fall through */
+ case RESERVED:
continue;
case WRITE:
case MEMMOVE:
@@ -2055,9 +2539,9 @@ same_page:
#if 0 && defined UNIV_DEBUG
switch (b & 0x70) {
case RESERVED:
- case OPTION:
ut_ad(0); /* we did "continue" earlier */
break;
+ case OPTION:
case FREE_PAGE:
break;
default:
@@ -2074,7 +2558,7 @@ same_page:
if (!size)
continue;
}
- else
+ else if (!deferred_spaces.find(space_id))
continue;
/* fall through */
case STORE_YES:
@@ -2182,28 +2666,30 @@ same_page:
if (fnend - fn < 4 || memcmp(fnend - 4, DOT_IBD, 4))
goto file_rec_error;
- const char saved_end= fn[rlen];
- const_cast<char&>(fn[rlen])= '\0';
- fil_name_process(const_cast<char*>(fn), fnend - fn, space_id,
- (b & 0xf0) == FILE_DELETE);
- if (fn2)
- fil_name_process(const_cast<char*>(fn2), fn2end - fn2, space_id,
- false);
- if ((b & 0xf0) < FILE_MODIFY && log_file_op)
- log_file_op(space_id, (b & 0xf0) == FILE_CREATE,
+ fil_name_process(fn, fnend - fn, space_id,
+ (b & 0xf0) == FILE_DELETE ? FILE_DELETE : FILE_MODIFY,
+ start_lsn, *store);
+
+ if ((b & 0xf0) < FILE_CHECKPOINT && log_file_op)
+ log_file_op(space_id, b & 0xf0,
l, static_cast<ulint>(fnend - fn),
reinterpret_cast<const byte*>(fn2),
fn2 ? static_cast<ulint>(fn2end - fn2) : 0);
- const_cast<char&>(fn[rlen])= saved_end;
- if (fn2 && apply)
+ if (fn2)
{
- const size_t len= fn2end - fn2;
- auto r= renamed_spaces.emplace(space_id, std::string{fn2, len});
- if (!r.second)
- r.first->second= std::string{fn2, len};
+ fil_name_process(fn2, fn2end - fn2, space_id,
+ FILE_RENAME, start_lsn, *store);
+ if (apply)
+ {
+ const size_t len= fn2end - fn2;
+ auto r= renamed_spaces.emplace(space_id, std::string{fn2, len});
+ if (!r.second)
+ r.first->second= std::string{fn2, len};
+ }
}
- if (UNIV_UNLIKELY(found_corrupt_fs))
+
+ if (is_corrupt_fs())
return true;
}
}
@@ -2225,13 +2711,15 @@ lsn of a log record.
@param[in,out] mtr mini-transaction
@param[in,out] p recovery address
@param[in,out] space tablespace, or NULL if not looked up yet
-@param[in,out] init page initialization operation, or NULL */
-static void recv_recover_page(buf_block_t* block, mtr_t& mtr,
- const recv_sys_t::map::iterator& p,
- fil_space_t* space = NULL,
- mlog_init_t::init* init = NULL)
+@param[in,out] init page initialization operation, or NULL
+@return the recovered page
+@retval nullptr on failure */
+static buf_block_t *recv_recover_page(buf_block_t *block, mtr_t &mtr,
+ const recv_sys_t::map::iterator &p,
+ fil_space_t *space= nullptr,
+ mlog_init_t::init *init= nullptr)
{
- ut_ad(mutex_own(&recv_sys.mutex));
+ mysql_mutex_assert_owner(&recv_sys.mutex);
ut_ad(recv_sys.apply_log_recs);
ut_ad(recv_needed_recovery);
ut_ad(!init || init->created);
@@ -2251,11 +2739,11 @@ static void recv_recover_page(buf_block_t* block, mtr_t& mtr,
p->second.state = page_recv_t::RECV_BEING_PROCESSED;
- mutex_exit(&recv_sys.mutex);
+ mysql_mutex_unlock(&recv_sys.mutex);
byte *frame = UNIV_LIKELY_NULL(block->page.zip.data)
? block->page.zip.data
- : block->frame;
+ : block->page.frame;
const lsn_t page_lsn = init
? 0
: mach_read_from_8(frame + FIL_PAGE_LSN);
@@ -2388,8 +2876,22 @@ static void recv_recover_page(buf_block_t* block, mtr_t& mtr,
set_start_lsn:
if ((a == log_phys_t::APPLIED_CORRUPTED
- || recv_sys.found_corrupt_log) && !srv_force_recovery) {
- break;
+ || recv_sys.is_corrupt_log()) && !srv_force_recovery) {
+ if (init) {
+ init->created = false;
+ if (space || block->page.id().page_no()) {
+ block->page.lock.x_lock_recursive();
+ }
+ }
+
+ mtr.discard_modifications();
+ mtr.commit();
+
+ buf_pool.corrupted_evict(&block->page,
+ block->page.state() &
+ buf_page_t::LRU_MASK);
+ block = nullptr;
+ goto done;
}
if (!start_lsn) {
@@ -2400,7 +2902,7 @@ set_start_lsn:
if (start_lsn) {
ut_ad(end_lsn >= start_lsn);
mach_write_to_8(FIL_PAGE_LSN + frame, end_lsn);
- if (UNIV_LIKELY(frame == block->frame)) {
+ if (UNIV_LIKELY(frame == block->page.frame)) {
mach_write_to_8(srv_page_size
- FIL_PAGE_END_LSN_OLD_CHKSUM
+ frame, end_lsn);
@@ -2419,7 +2921,7 @@ set_start_lsn:
any buffered changes. */
init->created = false;
ut_ad(!mtr.has_modifications());
- block->page.status = buf_page_t::FREED;
+ block->page.set_freed(block->page.state());
}
/* Make sure that committing mtr does not change the modification
@@ -2428,16 +2930,17 @@ set_start_lsn:
mtr.discard_modifications();
mtr.commit();
+done:
time_t now = time(NULL);
- mutex_enter(&recv_sys.mutex);
+ mysql_mutex_lock(&recv_sys.mutex);
if (recv_max_page_lsn < page_lsn) {
recv_max_page_lsn = page_lsn;
}
- ut_ad(p->second.is_being_processed());
- ut_ad(!recv_sys.pages.empty());
+ ut_ad(!block || p->second.is_being_processed());
+ ut_ad(!block || !recv_sys.pages.empty());
if (recv_sys.report(now)) {
const ulint n = recv_sys.pages.size();
@@ -2445,6 +2948,8 @@ set_start_lsn:
service_manager_extend_timeout(
INNODB_EXTEND_TIMEOUT_INTERVAL, "To recover: " ULINTPF " pages from log", n);
}
+
+ return block;
}
/** Remove records for a corrupted page.
@@ -2452,58 +2957,103 @@ This function should only be called when innodb_force_recovery is set.
@param page_id corrupted page identifier */
ATTRIBUTE_COLD void recv_sys_t::free_corrupted_page(page_id_t page_id)
{
- mutex_enter(&mutex);
+ if (!recovery_on)
+ return;
+
+ mysql_mutex_lock(&mutex);
map::iterator p= pages.find(page_id);
if (p != pages.end())
{
p->second.log.clear();
pages.erase(p);
+ if (!srv_force_recovery)
+ {
+ set_corrupt_fs();
+ ib::error() << "Unable to apply log to corrupted page " << page_id
+ << "; set innodb_force_recovery to ignore";
+ }
+ else
+ ib::warn() << "Discarding log for corrupted page " << page_id;
}
- mutex_exit(&mutex);
+
+ if (pages.empty())
+ pthread_cond_broadcast(&cond);
+ mysql_mutex_unlock(&mutex);
+}
+
+/** Possibly finish a recovery batch. */
+inline void recv_sys_t::maybe_finish_batch()
+{
+ mysql_mutex_assert_owner(&mutex);
+ ut_ad(recovery_on);
+ if (!apply_batch_on || pages.empty() || is_corrupt_log() || is_corrupt_fs())
+ pthread_cond_broadcast(&cond);
+}
+
+ATTRIBUTE_COLD void recv_sys_t::set_corrupt_log()
+{
+ mysql_mutex_lock(&mutex);
+ found_corrupt_log= true;
+ pthread_cond_broadcast(&cond);
+ mysql_mutex_unlock(&mutex);
+}
+
+ATTRIBUTE_COLD void recv_sys_t::set_corrupt_fs()
+{
+ mysql_mutex_assert_owner(&mutex);
+ found_corrupt_fs= true;
+ pthread_cond_broadcast(&cond);
}
/** Apply any buffered redo log to a page that was just read from a data file.
@param[in,out] space tablespace
-@param[in,out] bpage buffer pool page */
-void recv_recover_page(fil_space_t* space, buf_page_t* bpage)
+@param[in,out] bpage buffer pool page
+@return whether the page was recovered correctly */
+bool recv_recover_page(fil_space_t* space, buf_page_t* bpage)
{
mtr_t mtr;
mtr.start();
mtr.set_log_mode(MTR_LOG_NO_REDO);
- ut_ad(bpage->state() == BUF_BLOCK_FILE_PAGE);
- buf_block_t* block = reinterpret_cast<buf_block_t*>(bpage);
-
+ ut_ad(bpage->frame);
/* Move the ownership of the x-latch on the page to
this OS thread, so that we can acquire a second
x-latch on it. This is needed for the operations to
the page to pass the debug checks. */
- rw_lock_x_lock_move_ownership(&block->lock);
- buf_block_buf_fix_inc(block, __FILE__, __LINE__);
- rw_lock_x_lock(&block->lock);
- mtr.memo_push(block, MTR_MEMO_PAGE_X_FIX);
+ bpage->lock.claim_ownership();
+ bpage->lock.x_lock_recursive();
+ bpage->fix_on_recovery();
+ mtr.memo_push(reinterpret_cast<buf_block_t*>(bpage),
+ MTR_MEMO_PAGE_X_FIX);
+
+ buf_block_t* success = reinterpret_cast<buf_block_t*>(bpage);
- mutex_enter(&recv_sys.mutex);
+ mysql_mutex_lock(&recv_sys.mutex);
if (recv_sys.apply_log_recs) {
recv_sys_t::map::iterator p = recv_sys.pages.find(bpage->id());
if (p != recv_sys.pages.end()
&& !p->second.is_being_processed()) {
- recv_recover_page(block, mtr, p, space);
- p->second.log.clear();
- recv_sys.pages.erase(p);
+ success = recv_recover_page(success, mtr, p, space);
+ if (UNIV_LIKELY(!!success)) {
+ p->second.log.clear();
+ recv_sys.pages.erase(p);
+ }
+ recv_sys.maybe_finish_batch();
goto func_exit;
}
}
mtr.commit();
func_exit:
- mutex_exit(&recv_sys.mutex);
+ mysql_mutex_unlock(&recv_sys.mutex);
ut_ad(mtr.has_committed());
+ return success;
}
/** Read pages for which log needs to be applied.
@param page_id first page identifier to read
@param i iterator to recv_sys.pages */
+TRANSACTIONAL_TARGET
static void recv_read_in_area(page_id_t page_id, recv_sys_t::map::iterator i)
{
uint32_t page_nos[32];
@@ -2523,9 +3073,9 @@ static void recv_read_in_area(page_id_t page_id, recv_sys_t::map::iterator i)
if (p != page_nos)
{
- mutex_exit(&recv_sys.mutex);
+ mysql_mutex_unlock(&recv_sys.mutex);
buf_read_recv_pages(page_id.space(), page_nos, ulint(p - page_nos));
- mutex_enter(&recv_sys.mutex);
+ mysql_mutex_lock(&recv_sys.mutex);
}
}
@@ -2534,12 +3084,14 @@ static void recv_read_in_area(page_id_t page_id, recv_sys_t::map::iterator i)
@param p iterator pointing to page_id
@param mtr mini-transaction
@param b pre-allocated buffer pool block
-@return whether the page was successfully initialized */
+@return the recovered block
+@retval nullptr if the page cannot be initialized based on log records
+@retval -1 if the page cannot be recovered due to corruption */
inline buf_block_t *recv_sys_t::recover_low(const page_id_t page_id,
map::iterator &p, mtr_t &mtr,
buf_block_t *b)
{
- ut_ad(mutex_own(&mutex));
+ mysql_mutex_assert_owner(&mutex);
ut_ad(p->first == page_id);
page_recv_t &recs= p->second;
ut_ad(recs.state == page_recv_t::RECV_WILL_NOT_READ);
@@ -2549,63 +3101,102 @@ inline buf_block_t *recv_sys_t::recover_low(const page_id_t page_id,
if (end_lsn < i.lsn)
DBUG_LOG("ib_log", "skip log for page " << page_id
<< " LSN " << end_lsn << " < " << i.lsn);
- else if (fil_space_t *space= fil_space_t::get(page_id.space()))
+ fil_space_t *space= fil_space_t::get(page_id.space());
+
+ mtr.start();
+ mtr.set_log_mode(MTR_LOG_NO_REDO);
+
+ ulint zip_size= space ? space->zip_size() : 0;
+
+ if (!space)
{
- mtr.start();
- mtr.set_log_mode(MTR_LOG_NO_REDO);
- block= buf_page_create(space, page_id.page_no(), space->zip_size(), &mtr,
- b);
- if (UNIV_UNLIKELY(block != b))
+ if (page_id.page_no() != 0)
{
- /* The page happened to exist in the buffer pool, or it was just
- being read in. Before buf_page_get_with_no_latch() returned to
- buf_page_create(), all changes must have been applied to the
- page already. */
- ut_ad(recv_sys.pages.find(page_id) == recv_sys.pages.end());
+ nothing_recoverable:
mtr.commit();
- block= nullptr;
+ return nullptr;
}
- else
+ auto it= recv_spaces.find(page_id.space());
+ ut_ad(it != recv_spaces.end());
+ uint32_t flags= it->second.flags;
+ zip_size= fil_space_t::zip_size(flags);
+ block= buf_page_create_deferred(page_id.space(), zip_size, &mtr, b);
+ ut_ad(block == b);
+ block->page.lock.x_lock_recursive();
+ }
+ else
+ {
+ block= buf_page_create(space, page_id.page_no(), zip_size, &mtr, b);
+
+ if (UNIV_UNLIKELY(block != b))
{
- ut_ad(&recs == &recv_sys.pages.find(page_id)->second);
- i.created= true;
- buf_block_dbg_add_level(block, SYNC_NO_ORDER_CHECK);
- recv_recover_page(block, mtr, p, space, &i);
- ut_ad(mtr.has_committed());
- recs.log.clear();
- map::iterator r= p++;
- recv_sys.pages.erase(r);
+ /* The page happened to exist in the buffer pool, or it
+ was just being read in. Before the exclusive page latch was acquired by
+ buf_page_create(), all changes to the page must have been applied. */
+ ut_ad(pages.find(page_id) == pages.end());
+ space->release();
+ goto nothing_recoverable;
}
- space->release();
}
+ ut_ad(&recs == &pages.find(page_id)->second);
+ i.created= true;
+ map::iterator r= p++;
+ block= recv_recover_page(block, mtr, r, space, &i);
+ ut_ad(mtr.has_committed());
+
+ if (block)
+ {
+ recs.log.clear();
+ pages.erase(r);
+ }
+ else
+ block= reinterpret_cast<buf_block_t*>(-1);
+
+ if (pages.empty())
+ pthread_cond_signal(&cond);
+
+ if (space)
+ space->release();
+
return block;
}
/** Attempt to initialize a page based on redo log records.
@param page_id page identifier
-@return whether the page was successfully initialized */
+@return recovered block
+@retval nullptr if the page cannot be initialized based on log records */
buf_block_t *recv_sys_t::recover_low(const page_id_t page_id)
{
buf_block_t *free_block= buf_LRU_get_free_block(false);
buf_block_t *block= nullptr;
- mutex_enter(&mutex);
+ mysql_mutex_lock(&mutex);
map::iterator p= pages.find(page_id);
if (p != pages.end() && p->second.state == page_recv_t::RECV_WILL_NOT_READ)
{
mtr_t mtr;
block= recover_low(page_id, p, mtr, free_block);
- ut_ad(!block || block == free_block);
+ ut_ad(!block || block == reinterpret_cast<buf_block_t*>(-1) ||
+ block == free_block);
}
- mutex_exit(&mutex);
+ mysql_mutex_unlock(&mutex);
if (UNIV_UNLIKELY(!block))
buf_pool.free_block(free_block);
return block;
}
+inline fil_space_t *fil_system_t::find(const char *path) const
+{
+ mysql_mutex_assert_owner(&mutex);
+ for (fil_space_t &space : fil_system.space_list)
+ if (space.chain.start && !strcmp(space.chain.start->name, path))
+ return &space;
+ return nullptr;
+}
+
/** Thread-safe function which sorts flush_list by oldest_modification */
static void log_sort_flush_list()
{
@@ -2640,24 +3231,34 @@ void recv_sys_t::apply(bool last_batch)
srv_operation == SRV_OPERATION_RESTORE ||
srv_operation == SRV_OPERATION_RESTORE_EXPORT);
- mutex_enter(&mutex);
+#ifdef SAFE_MUTEX
+ DBUG_ASSERT(!last_batch == mysql_mutex_is_owner(&log_sys.mutex));
+#endif /* SAFE_MUTEX */
+ mysql_mutex_lock(&mutex);
+
+ timespec abstime;
while (apply_batch_on)
{
- bool abort= found_corrupt_log;
- mutex_exit(&mutex);
-
- if (abort)
+ if (is_corrupt_log())
+ {
+ mysql_mutex_unlock(&mutex);
return;
-
- os_thread_sleep(500000);
- mutex_enter(&mutex);
+ }
+ if (last_batch)
+ {
+ mysql_mutex_assert_not_owner(&log_sys.mutex);
+ my_cond_wait(&cond, &mutex.m_mutex);
+ }
+ else
+ {
+ mysql_mutex_unlock(&mutex);
+ set_timespec_nsec(abstime, 500000000ULL); /* 0.5s */
+ my_cond_timedwait(&cond, &log_sys.mutex.m_mutex, &abstime);
+ mysql_mutex_lock(&mutex);
+ }
}
-#ifdef SAFE_MUTEX
- DBUG_ASSERT(!last_batch == mysql_mutex_is_owner(&log_sys.mutex));
-#endif /* SAFE_MUTEX */
-
recv_no_ibuf_operations = !last_batch ||
srv_operation == SRV_OPERATION_RESTORE ||
srv_operation == SRV_OPERATION_RESTORE_EXPORT;
@@ -2700,13 +3301,12 @@ void recv_sys_t::apply(bool last_batch)
fil_system.extend_to_recv_size();
- /* Release the log_sys mutex in non-last batches of multi-batch
- recovery mode and recv_sys.mutex before preallocating the
- block because while preallocating the block which may initiate
- log flush which requires log_sys mutex to acquire again, which
- should be acquired before recv_sys.mutex in order to avoid
- deadlocks. */
- mutex_exit(&mutex);
+ /* We must release log_sys.mutex and recv_sys.mutex before
+ invoking buf_LRU_get_free_block(). Allocating a block may initiate
+ a redo log write and therefore acquire log_sys.mutex. To avoid
+ deadlocks, log_sys.mutex must not be acquired while holding
+ recv_sys.mutex. */
+ mysql_mutex_unlock(&mutex);
if (!last_batch)
mysql_mutex_unlock(&log_sys.mutex);
@@ -2715,13 +3315,43 @@ void recv_sys_t::apply(bool last_batch)
if (!last_batch)
mysql_mutex_lock(&log_sys.mutex);
- mutex_enter(&mutex);
+ mysql_mutex_lock(&mutex);
for (map::iterator p= pages.begin(); p != pages.end(); )
{
const page_id_t page_id= p->first;
ut_ad(!p->second.log.empty());
+ const uint32_t space_id= page_id.space();
+ auto d= deferred_spaces.defers.find(space_id);
+ if (d != deferred_spaces.defers.end())
+ {
+ if (d->second.deleted)
+ {
+ /* For deleted files we must preserve the entry in deferred_spaces */
+erase_for_space:
+ while (p != pages.end() && p->first.space() == space_id)
+ {
+ map::iterator r= p++;
+ r->second.log.clear();
+ pages.erase(r);
+ }
+ }
+ else if (recover_deferred(p, d->second.file_name, free_block))
+ {
+ if (!srv_force_recovery)
+ set_corrupt_fs();
+ deferred_spaces.defers.erase(d);
+ goto erase_for_space;
+ }
+ else
+ deferred_spaces.defers.erase(d);
+ if (!free_block)
+ goto next_free_block;
+ p= pages.lower_bound(page_id);
+ continue;
+ }
+
switch (p->second.state) {
case page_recv_t::RECV_BEING_READ:
case page_recv_t::RECV_BEING_PROCESSED:
@@ -2730,12 +3360,15 @@ void recv_sys_t::apply(bool last_batch)
case page_recv_t::RECV_WILL_NOT_READ:
if (UNIV_LIKELY(!!recover_low(page_id, p, mtr, free_block)))
{
- mutex_exit(&mutex);
- if (!last_batch) mysql_mutex_unlock(&log_sys.mutex);
+next_free_block:
+ mysql_mutex_unlock(&mutex);
+ if (!last_batch)
+ mysql_mutex_unlock(&log_sys.mutex);
mysql_mutex_assert_not_owner(&log_sys.mutex);
free_block= buf_LRU_get_free_block(false);
- if (!last_batch) mysql_mutex_lock(&log_sys.mutex);
- mutex_enter(&mutex);
+ if (!last_batch)
+ mysql_mutex_lock(&log_sys.mutex);
+ mysql_mutex_lock(&mutex);
break;
}
ut_ad(p == pages.end() || p->first > page_id);
@@ -2752,19 +3385,40 @@ void recv_sys_t::apply(bool last_batch)
buf_pool.free_block(free_block);
/* Wait until all the pages have been processed */
- while (!pages.empty() || buf_pool.n_pend_reads)
+ for (;;)
{
- const bool abort= found_corrupt_log || found_corrupt_fs;
+ const bool empty= pages.empty();
+ if (empty && !os_aio_pending_reads())
+ break;
- if (found_corrupt_fs && !srv_force_recovery)
+ if (!is_corrupt_fs() && !is_corrupt_log())
+ {
+ if (last_batch)
+ {
+ mysql_mutex_assert_not_owner(&log_sys.mutex);
+ if (!empty)
+ my_cond_wait(&cond, &mutex.m_mutex);
+ else
+ {
+ mysql_mutex_unlock(&mutex);
+ os_aio_wait_until_no_pending_reads();
+ mysql_mutex_lock(&mutex);
+ ut_ad(pages.empty());
+ }
+ }
+ else
+ {
+ mysql_mutex_unlock(&mutex);
+ set_timespec_nsec(abstime, 500000000ULL); /* 0.5s */
+ my_cond_timedwait(&cond, &log_sys.mutex.m_mutex, &abstime);
+ mysql_mutex_lock(&mutex);
+ }
+ continue;
+ }
+ if (is_corrupt_fs() && !srv_force_recovery)
ib::info() << "Set innodb_force_recovery=1 to ignore corrupted pages.";
-
- mutex_exit(&mutex);
-
- if (abort)
- return;
- os_thread_sleep(500000);
- mutex_enter(&mutex);
+ mysql_mutex_unlock(&mutex);
+ return;
}
}
@@ -2778,7 +3432,7 @@ void recv_sys_t::apply(bool last_batch)
}
mysql_mutex_assert_not_owner(&log_sys.mutex);
- mutex_exit(&mutex);
+ mysql_mutex_unlock(&mutex);
if (last_batch && srv_operation != SRV_OPERATION_RESTORE &&
srv_operation != SRV_OPERATION_RESTORE_EXPORT)
@@ -2795,68 +3449,12 @@ void recv_sys_t::apply(bool last_batch)
buf_pool_invalidate();
mysql_mutex_lock(&log_sys.mutex);
}
-#if 1 /* Mariabackup FIXME: Remove or adjust rename_table_in_prepare() */
- else if (srv_operation > SRV_OPERATION_EXPORT_RESTORED);
-#endif
- else
- {
- /* In the last batch, we will apply any rename operations. */
- for (auto r : renamed_spaces)
- {
- const uint32_t id= r.first;
- fil_space_t *space= fil_space_t::get(id);
- if (!space)
- continue;
- ut_ad(UT_LIST_GET_LEN(space->chain) == 1);
- const char *old= space->chain.start->name;
- if (r.second != old)
- {
- bool exists;
- os_file_type_t ftype;
- const char *new_name= r.second.c_str();
- if (!os_file_status(new_name, &exists, &ftype) || exists)
- {
- ib::error() << "Cannot replay rename of tablespace " << id
- << " from '" << old << "' to '" << r.second <<
- (exists ? "' because the target file exists" : "'");
- found_corrupt_fs= true;
- }
- else
- {
- size_t base= r.second.rfind(OS_PATH_SEPARATOR);
- ut_ad(base != std::string::npos);
- size_t start= r.second.rfind(OS_PATH_SEPARATOR, base - 1);
- if (start == std::string::npos)
- start= 0;
- else
- ++start;
- /* Keep only databasename/tablename without .ibd suffix */
- std::string space_name(r.second, start, r.second.size() - start - 4);
- ut_ad(space_name[base - start] == OS_PATH_SEPARATOR);
-#if OS_PATH_SEPARATOR != '/'
- space_name[base - start]= '/';
-#endif
- mysql_mutex_lock(&log_sys.mutex);
- if (dberr_t err= space->rename(space_name.c_str(), r.second.c_str(),
- false))
- {
- ib::error() << "Cannot replay rename of tablespace " << id
- << " to '" << r.second << "': " << err;
- found_corrupt_fs= true;
- }
- mysql_mutex_unlock(&log_sys.mutex);
- }
- }
- space->release();
- }
- renamed_spaces.clear();
- }
- mutex_enter(&mutex);
+ mysql_mutex_lock(&mutex);
ut_d(after_apply= true);
clear();
- mutex_exit(&mutex);
+ mysql_mutex_unlock(&mutex);
}
/** Check whether the number of read redo log blocks exceeds the maximum.
@@ -3087,7 +3685,7 @@ static bool recv_scan_log_recs(
ib::error() << "Log parsing buffer overflow."
" Recovery may have failed!";
- recv_sys.found_corrupt_log = true;
+ recv_sys.set_corrupt_log();
if (!srv_force_recovery) {
ib::error()
@@ -3095,7 +3693,7 @@ static bool recv_scan_log_recs(
" to ignore this error.";
return(true);
}
- } else if (!recv_sys.found_corrupt_log) {
+ } else if (!recv_sys.is_corrupt_log()) {
more_data = recv_sys_add_to_parsing_buf(
log_block, scanned_lsn);
}
@@ -3123,13 +3721,13 @@ static bool recv_scan_log_recs(
*group_scanned_lsn = scanned_lsn;
- mutex_enter(&recv_sys.mutex);
+ mysql_mutex_lock(&recv_sys.mutex);
- if (more_data && !recv_sys.found_corrupt_log) {
+ if (more_data && !recv_sys.is_corrupt_log()) {
/* Try to parse more log records */
if (recv_sys.parse(checkpoint_lsn, store, apply)) {
- ut_ad(recv_sys.found_corrupt_log
- || recv_sys.found_corrupt_fs
+ ut_ad(recv_sys.is_corrupt_log()
+ || recv_sys.is_corrupt_fs()
|| recv_sys.mlog_checkpoint_lsn
== recv_sys.recovered_lsn);
finished = true;
@@ -3154,7 +3752,8 @@ static bool recv_scan_log_recs(
}
func_exit:
- mutex_exit(&recv_sys.mutex);
+ recv_sys.maybe_finish_batch();
+ mysql_mutex_unlock(&recv_sys.mutex);
return(finished);
}
@@ -3176,7 +3775,7 @@ recv_group_scan_log_recs(
DBUG_ENTER("recv_group_scan_log_recs");
DBUG_ASSERT(!last_phase || recv_sys.mlog_checkpoint_lsn > 0);
- mutex_enter(&recv_sys.mutex);
+ mysql_mutex_lock(&recv_sys.mutex);
recv_sys.len = 0;
recv_sys.recovered_offset = 0;
recv_sys.clear();
@@ -3185,7 +3784,7 @@ recv_group_scan_log_recs(
recv_sys.recovered_lsn = *contiguous_lsn;
recv_sys.scanned_checkpoint_no = 0;
ut_ad(recv_max_page_lsn == 0);
- mutex_exit(&recv_sys.mutex);
+ mysql_mutex_unlock(&recv_sys.mutex);
lsn_t start_lsn;
lsn_t end_lsn;
@@ -3213,7 +3812,7 @@ recv_group_scan_log_recs(
start_lsn, end_lsn, contiguous_lsn,
&log_sys.log.scanned_lsn));
- if (recv_sys.found_corrupt_log || recv_sys.found_corrupt_fs) {
+ if (recv_sys.is_corrupt_log() || recv_sys.is_corrupt_fs()) {
DBUG_RETURN(false);
}
@@ -3232,10 +3831,12 @@ static
dberr_t
recv_init_missing_space(dberr_t err, const recv_spaces_t::const_iterator& i)
{
- if (srv_operation == SRV_OPERATION_RESTORE
- || srv_operation == SRV_OPERATION_RESTORE_EXPORT) {
- if (i->second.name.find(TEMP_TABLE_PATH_PREFIX)
- != std::string::npos) {
+ switch (srv_operation) {
+ default:
+ break;
+ case SRV_OPERATION_RESTORE:
+ case SRV_OPERATION_RESTORE_EXPORT:
+ if (i->second.name.find("/#sql") != std::string::npos) {
ib::warn() << "Tablespace " << i->first << " was not"
" found at " << i->second.name << " when"
" restoring a (partial?) backup. All redo log"
@@ -3276,7 +3877,7 @@ recv_validate_tablespace(bool rescan, bool& missing_tablespace)
{
dberr_t err = DB_SUCCESS;
- mutex_enter(&recv_sys.mutex);
+ mysql_mutex_lock(&recv_sys.mutex);
for (recv_sys_t::map::iterator p = recv_sys.pages.begin();
p != recv_sys.pages.end();) {
@@ -3291,6 +3892,12 @@ next:
recv_spaces_t::iterator i = recv_spaces.find(space);
ut_ad(i != recv_spaces.end());
+ if (deferred_spaces.find(static_cast<uint32_t>(space))) {
+ /* Skip redo logs belonging to
+ incomplete tablespaces */
+ goto next;
+ }
+
switch (i->second.status) {
case file_name_t::NORMAL:
goto next;
@@ -3309,7 +3916,7 @@ next:
if (err != DB_SUCCESS) {
func_exit:
- mutex_exit(&recv_sys.mutex);
+ mysql_mutex_unlock(&recv_sys.mutex);
return(err);
}
@@ -3321,6 +3928,10 @@ func_exit:
continue;
}
+ if (deferred_spaces.find(static_cast<uint32_t>(rs.first))) {
+ continue;
+ }
+
missing_tablespace = true;
if (srv_force_recovery > 0) {
@@ -3388,7 +3999,7 @@ recv_init_crash_recovery_spaces(bool rescan, bool& missing_tablespace)
ib::error() << "Missing FILE_CREATE, FILE_DELETE"
" or FILE_MODIFY before FILE_CHECKPOINT"
" for tablespace " << rs.first;
- recv_sys.found_corrupt_log = true;
+ recv_sys.set_corrupt_log();
return(DB_CORRUPTION);
} else {
rs.second.status = file_name_t::MISSING;
@@ -3406,6 +4017,81 @@ recv_init_crash_recovery_spaces(bool rescan, bool& missing_tablespace)
return DB_SUCCESS;
}
+/** Apply any FILE_RENAME records */
+static dberr_t recv_rename_files()
+{
+ mysql_mutex_assert_owner(&recv_sys.mutex);
+ mysql_mutex_assert_owner(&log_sys.mutex);
+
+ dberr_t err= DB_SUCCESS;
+
+ for (auto i= renamed_spaces.begin(); i != renamed_spaces.end(); )
+ {
+ const auto &r= *i;
+ const uint32_t id= r.first;
+ fil_space_t *space= fil_space_t::get(id);
+ if (!space)
+ {
+ i++;
+ continue;
+ }
+ ut_ad(UT_LIST_GET_LEN(space->chain) == 1);
+ char *old= space->chain.start->name;
+ if (r.second != old)
+ {
+ bool exists;
+ os_file_type_t ftype;
+ const char *new_name= r.second.c_str();
+ mysql_mutex_lock(&fil_system.mutex);
+ const fil_space_t *other= nullptr;
+ if (!space->chain.start->is_open() && space->chain.start->deferred &&
+ (other= fil_system.find(new_name)) &&
+ (other->chain.start->is_open() || !other->chain.start->deferred))
+ other= nullptr;
+
+ if (other)
+ {
+ /* Multiple tablespaces use the same file name. This should
+ only be possible if the recovery of both files was deferred
+ (no valid page 0 is contained in either file). We shall not
+ rename the file, just rename the metadata. */
+ ib::info() << "Renaming tablespace metadata " << id
+ << " from '" << old << "' to '" << r.second
+ << "' that is also associated with tablespace "
+ << other->id;
+ space->chain.start->name= mem_strdup(new_name);
+ ut_free(old);
+ }
+ else if (!os_file_status(new_name, &exists, &ftype) || exists)
+ {
+ ib::error() << "Cannot replay rename of tablespace " << id
+ << " from '" << old << "' to '" << r.second <<
+ (exists ? "' because the target file exists" : "'");
+ err= DB_TABLESPACE_EXISTS;
+ }
+ else
+ {
+ mysql_mutex_unlock(&fil_system.mutex);
+ err= space->rename(new_name, false);
+ if (err != DB_SUCCESS)
+ ib::error() << "Cannot replay rename of tablespace " << id
+ << " to '" << r.second << "': " << err;
+ goto done;
+ }
+ mysql_mutex_unlock(&fil_system.mutex);
+ }
+done:
+ space->release();
+ if (err != DB_SUCCESS)
+ {
+ recv_sys.set_corrupt_fs();
+ break;
+ }
+ renamed_spaces.erase(i++);
+ }
+ return err;
+}
+
/** Start recovering from a redo log checkpoint.
@param[in] flush_lsn FIL_PAGE_FILE_FLUSH_LSN
of first system tablespace page
@@ -3479,7 +4165,7 @@ recv_recovery_from_checkpoint_start(lsn_t flush_lsn)
contiguous_lsn = end_lsn;
break;
}
- recv_sys.found_corrupt_log = true;
+ recv_sys.set_corrupt_log();
mysql_mutex_unlock(&log_sys.mutex);
return(DB_ERROR);
}
@@ -3495,14 +4181,14 @@ recv_recovery_from_checkpoint_start(lsn_t flush_lsn)
recv_group_scan_log_recs(checkpoint_lsn, &contiguous_lsn, false);
/* The first scan should not have stored or applied any records. */
ut_ad(recv_sys.pages.empty());
- ut_ad(!recv_sys.found_corrupt_fs);
+ ut_ad(!recv_sys.is_corrupt_fs());
if (srv_read_only_mode && recv_needed_recovery) {
mysql_mutex_unlock(&log_sys.mutex);
return(DB_READ_ONLY);
}
- if (recv_sys.found_corrupt_log && !srv_force_recovery) {
+ if (recv_sys.is_corrupt_log() && !srv_force_recovery) {
mysql_mutex_unlock(&log_sys.mutex);
ib::warn() << "Log scan aborted at LSN " << contiguous_lsn;
return(DB_ERROR);
@@ -3528,8 +4214,8 @@ recv_recovery_from_checkpoint_start(lsn_t flush_lsn)
rescan = recv_group_scan_log_recs(
checkpoint_lsn, &contiguous_lsn, false);
- if ((recv_sys.found_corrupt_log && !srv_force_recovery)
- || recv_sys.found_corrupt_fs) {
+ if ((recv_sys.is_corrupt_log() && !srv_force_recovery)
+ || recv_sys.is_corrupt_fs()) {
mysql_mutex_unlock(&log_sys.mutex);
return(DB_ERROR);
}
@@ -3613,11 +4299,11 @@ completed:
rescan = recv_group_scan_log_recs(
checkpoint_lsn, &recent_stored_lsn, false);
- ut_ad(!recv_sys.found_corrupt_fs);
+ ut_ad(!recv_sys.is_corrupt_fs());
missing_tablespace = false;
- err = recv_sys.found_corrupt_log
+ err = recv_sys.is_corrupt_log()
? DB_ERROR
: recv_validate_tablespace(
rescan, missing_tablespace);
@@ -3633,6 +4319,7 @@ completed:
recv_sys.parse_start_lsn = checkpoint_lsn;
if (srv_operation <= SRV_OPERATION_EXPORT_RESTORED) {
+ deferred_spaces.deferred_dblwr();
buf_dblwr.recover();
}
@@ -3644,9 +4331,9 @@ completed:
recv_group_scan_log_recs(
checkpoint_lsn, &contiguous_lsn, true);
- if ((recv_sys.found_corrupt_log
+ if ((recv_sys.is_corrupt_log()
&& !srv_force_recovery)
- || recv_sys.found_corrupt_fs) {
+ || recv_sys.is_corrupt_fs()) {
mysql_mutex_unlock(&log_sys.mutex);
return(DB_ERROR);
}
@@ -3709,24 +4396,28 @@ completed:
log_sys.next_checkpoint_no = ++checkpoint_no;
- mutex_enter(&recv_sys.mutex);
-
+ mysql_mutex_lock(&recv_sys.mutex);
recv_sys.apply_log_recs = true;
recv_no_ibuf_operations = false;
ut_d(recv_no_log_write = srv_operation == SRV_OPERATION_RESTORE
|| srv_operation == SRV_OPERATION_RESTORE_EXPORT);
-
- mutex_exit(&recv_sys.mutex);
-
- mysql_mutex_unlock(&log_sys.mutex);
+ if (srv_operation == SRV_OPERATION_NORMAL) {
+ err = recv_rename_files();
+ }
+ mysql_mutex_unlock(&recv_sys.mutex);
recv_lsn_checks_on = true;
/* The database is now ready to start almost normal processing of user
transactions: transaction rollbacks and the application of the log
records in the hash table can be run in background. */
+ if (err == DB_SUCCESS && deferred_spaces.reinit_all()
+ && !srv_force_recovery) {
+ err = DB_CORRUPTION;
+ }
- return(DB_SUCCESS);
+ mysql_mutex_unlock(&log_sys.mutex);
+ return err;
}
bool recv_dblwr_t::validate_page(const page_id_t page_id,
diff --git a/storage/innobase/log/log0sync.cc b/storage/innobase/log/log0sync.cc
index 2a6e1b8b853..6b14d1d3591 100644
--- a/storage/innobase/log/log0sync.cc
+++ b/storage/innobase/log/log0sync.cc
@@ -77,6 +77,7 @@ Note that if write operation is very fast, a) or b) can be fine as alternative.
#include <log0types.h>
#include "log0sync.h"
#include <mysql/service_thd_wait.h>
+#include <sql_class.h>
/**
Helper class , used in group commit lock.
@@ -158,10 +159,10 @@ void binary_semaphore::wake()
/* A thread helper structure, used in group commit lock below*/
struct group_commit_waiter_t
{
- lsn_t m_value;
- binary_semaphore m_sema;
- group_commit_waiter_t* m_next;
- group_commit_waiter_t() :m_value(), m_sema(), m_next() {}
+ lsn_t m_value=0;
+ binary_semaphore m_sema{};
+ group_commit_waiter_t* m_next= nullptr;
+ bool m_group_commit_leader=false;
};
group_commit_lock::group_commit_lock() :
@@ -188,7 +189,13 @@ void group_commit_lock::set_pending(group_commit_lock::value_type num)
const unsigned int MAX_SPINS = 1; /** max spins in acquire */
thread_local group_commit_waiter_t thread_local_waiter;
-group_commit_lock::lock_return_code group_commit_lock::acquire(value_type num)
+static inline void do_completion_callback(const completion_callback* cb)
+{
+ if (cb)
+ cb->m_callback(cb->m_param);
+}
+
+group_commit_lock::lock_return_code group_commit_lock::acquire(value_type num, const completion_callback *callback)
{
unsigned int spins = MAX_SPINS;
@@ -197,6 +204,7 @@ group_commit_lock::lock_return_code group_commit_lock::acquire(value_type num)
if (num <= value())
{
/* No need to wait.*/
+ do_completion_callback(callback);
return lock_return_code::EXPIRED;
}
@@ -212,14 +220,18 @@ group_commit_lock::lock_return_code group_commit_lock::acquire(value_type num)
}
thread_local_waiter.m_value = num;
+ thread_local_waiter.m_group_commit_leader= false;
std::unique_lock<std::mutex> lk(m_mtx, std::defer_lock);
- while (num > value())
+ while (num > value() || thread_local_waiter.m_group_commit_leader)
{
lk.lock();
/* Re-read current value after acquiring the lock*/
- if (num <= value())
+ if (num <= value() &&
+ (!thread_local_waiter.m_group_commit_leader || m_lock))
{
+ lk.unlock();
+ do_completion_callback(callback);
return lock_return_code::EXPIRED;
}
@@ -230,10 +242,28 @@ group_commit_lock::lock_return_code group_commit_lock::acquire(value_type num)
#ifndef DBUG_OFF
m_owner_id = std::this_thread::get_id();
#endif
+ if (callback)
+ m_pending_callbacks.push_back({num,*callback});
return lock_return_code::ACQUIRED;
}
+ if (callback && (m_waiters_list || num <= pending()))
+ {
+ /*
+ If num > pending(), we have a good candidate for the next group
+ commit lead, that will be taking over the lock after current owner
+ releases it. We put current thread into waiter's list so it sleeps
+ and can be signaled and marked as group commit lead during lock release.
+
+ For this to work well, pending() must deliver a good approximation for N
+ in the next call to group_commit_lock::release(N).
+ */
+ m_pending_callbacks.push_back({num, *callback});
+ return lock_return_code::CALLBACK_QUEUED;
+ }
+
/* Add yourself to waiters list.*/
+ thread_local_waiter.m_group_commit_leader= false;
thread_local_waiter.m_next = m_waiters_list;
m_waiters_list = &thread_local_waiter;
lk.unlock();
@@ -244,11 +274,15 @@ group_commit_lock::lock_return_code group_commit_lock::acquire(value_type num)
thd_wait_end(0);
}
+ do_completion_callback(callback);
return lock_return_code::EXPIRED;
}
-void group_commit_lock::release(value_type num)
+group_commit_lock::value_type group_commit_lock::release(value_type num)
{
+ completion_callback callbacks[1000];
+ size_t callback_count = 0;
+ value_type ret = 0;
std::unique_lock<std::mutex> lk(m_mtx);
m_lock = false;
@@ -262,12 +296,21 @@ void group_commit_lock::release(value_type num)
*/
group_commit_waiter_t* cur, * prev, * next;
group_commit_waiter_t* wakeup_list = nullptr;
- int extra_wake = 0;
+ for (auto& c : m_pending_callbacks)
+ {
+ if (c.first <= num)
+ {
+ if (callback_count < array_elements(callbacks))
+ callbacks[callback_count++] = c.second;
+ else
+ c.second.m_callback(c.second.m_param);
+ }
+ }
for (prev= nullptr, cur= m_waiters_list; cur; cur= next)
{
next= cur->m_next;
- if (cur->m_value <= num || extra_wake++ == 0)
+ if (cur->m_value <= num)
{
/* Move current waiter to wakeup_list*/
@@ -291,13 +334,65 @@ void group_commit_lock::release(value_type num)
prev= cur;
}
}
+
+ auto it= std::remove_if(
+ m_pending_callbacks.begin(), m_pending_callbacks.end(),
+ [num](const pending_cb &c) { return c.first <= num; });
+
+ m_pending_callbacks.erase(it, m_pending_callbacks.end());
+
+ if (m_pending_callbacks.size() || m_waiters_list)
+ {
+ /*
+ Ensure that after this thread released the lock,
+ there is a new group commit leader
+ We take this from waiters list or wakeup list. It
+ might look like a spurious wake, but in fact we just
+ ensure the waiter do not wait for eternity.
+ */
+ if (m_waiters_list)
+ {
+ /* Move one waiter to wakeup list */
+ auto e= m_waiters_list;
+ m_waiters_list= m_waiters_list->m_next;
+ e->m_next= wakeup_list;
+ e->m_group_commit_leader= true;
+ wakeup_list = e;
+ }
+ else if (wakeup_list)
+ {
+ wakeup_list->m_group_commit_leader=true;
+ }
+ else
+ {
+ /* Tell the caller that some pending callbacks left, and he should
+ do something to prevent stalls. This should be a rare situation.*/
+ ret= m_pending_callbacks[0].first;
+ }
+ }
+
lk.unlock();
+ /*
+ Release designated next group commit lead first,
+ to minimize spurious wakeups.
+ */
+ if (wakeup_list && wakeup_list->m_group_commit_leader)
+ {
+ next = wakeup_list->m_next;
+ wakeup_list->m_sema.wake();
+ wakeup_list= next;
+ }
+
+ for (size_t i = 0; i < callback_count; i++)
+ callbacks[i].m_callback(callbacks[i].m_param);
+
for (cur= wakeup_list; cur; cur= next)
{
next= cur->m_next;
cur->m_sema.wake();
}
+ return ret;
}
#ifndef DBUG_OFF
diff --git a/storage/innobase/log/log0sync.h b/storage/innobase/log/log0sync.h
index 40afbf74ecd..00686d39dac 100644
--- a/storage/innobase/log/log0sync.h
+++ b/storage/innobase/log/log0sync.h
@@ -18,8 +18,14 @@ this program; if not, write to the Free Software Foundation, Inc.,
#include <atomic>
#include <thread>
#include <log0types.h>
+#include <vector>
struct group_commit_waiter_t;
+struct completion_callback
+{
+ void (*m_callback)(void*);
+ void* m_param;
+};
/**
Special synchronization primitive, which is helpful for
@@ -33,16 +39,23 @@ It has a state consisting of
Operations supported on this semaphore
-1.acquire(num):
+1.acquire(num, callback):
- waits until current value exceeds num, or until lock is granted.
+ if running synchronously (callback is nullptr)
- returns EXPIRED if current_value >= num,
- or ACQUIRED, if current_value < num and lock is granted.
+ or ACQUIRED, if current_value < num and lock is granted,
+ or CALLBACK_QUEUED, if callback was not nullptr, and function
+ would otherwise have to wait
2.release(num)
- releases lock
- sets new current value to max(num,current_value)
- releases some threads waiting in acquire()
+- executes some callbacks
+- might return some lsn, meaning there are some pending
+ callbacks left, and there is no new group commit lead
+ (i.e caller must do something to flush those pending callbacks)
3. value()
- read current value
@@ -63,15 +76,20 @@ class group_commit_lock
std::atomic<value_type> m_pending_value;
bool m_lock;
group_commit_waiter_t* m_waiters_list;
+
+ typedef std::pair<value_type, completion_callback> pending_cb;
+ std::vector<pending_cb> m_pending_callbacks;
+
public:
group_commit_lock();
enum lock_return_code
{
ACQUIRED,
- EXPIRED
+ EXPIRED,
+ CALLBACK_QUEUED
};
- lock_return_code acquire(value_type num);
- void release(value_type num);
+ lock_return_code acquire(value_type num, const completion_callback *cb);
+ value_type release(value_type num);
value_type value() const;
value_type pending() const;
void set_pending(value_type num);
diff --git a/storage/innobase/mem/mem0mem.cc b/storage/innobase/mem/mem0mem.cc
index 6d4593e0ab4..5e8587bfea6 100644
--- a/storage/innobase/mem/mem0mem.cc
+++ b/storage/innobase/mem/mem0mem.cc
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 1994, 2014, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2017, 2020, MariaDB Corporation.
+Copyright (c) 2017, 2021, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -294,7 +294,7 @@ mem_heap_create_block_func(
buf_block = buf_block_alloc();
}
- block = (mem_block_t*) buf_block->frame;
+ block = (mem_block_t*) buf_block->page.frame;
}
if (block == NULL) {
diff --git a/storage/innobase/mtr/mtr0mtr.cc b/storage/innobase/mtr/mtr0mtr.cc
index 7f4812e158a..8817c77a6f4 100644
--- a/storage/innobase/mtr/mtr0mtr.cc
+++ b/storage/innobase/mtr/mtr0mtr.cc
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 1995, 2017, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2017, 2022, MariaDB Corporation.
+Copyright (c) 2017, 2023, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -31,374 +31,71 @@ Created 11/26/1995 Heikki Tuuri
#include "fsp0sysspace.h"
#include "page0types.h"
#include "log0recv.h"
+#include "my_cpu.h"
+#ifdef BTR_CUR_HASH_ADAPT
+# include "btr0sea.h"
+#endif
#include "srv0start.h"
#include "log.h"
-/** Iterate over a memo block in reverse. */
-template <typename Functor>
-struct CIterate {
- CIterate() : functor() {}
-
- CIterate(const Functor& functor) : functor(functor) {}
-
- /** @return false if the functor returns false. */
- bool operator()(mtr_buf_t::block_t* block) const
- {
- const mtr_memo_slot_t* start =
- reinterpret_cast<const mtr_memo_slot_t*>(
- block->begin());
-
- mtr_memo_slot_t* slot =
- reinterpret_cast<mtr_memo_slot_t*>(
- block->end());
-
- ut_ad(!(block->used() % sizeof(*slot)));
-
- while (slot-- != start) {
-
- if (!functor(slot)) {
- return(false);
- }
- }
-
- return(true);
- }
-
- Functor functor;
-};
-
-template <typename Functor>
-struct Iterate {
- Iterate() : functor() {}
-
- Iterate(const Functor& functor) : functor(functor) {}
-
- /** @return false if the functor returns false. */
- bool operator()(mtr_buf_t::block_t* block)
- {
- const mtr_memo_slot_t* start =
- reinterpret_cast<const mtr_memo_slot_t*>(
- block->begin());
-
- mtr_memo_slot_t* slot =
- reinterpret_cast<mtr_memo_slot_t*>(
- block->end());
-
- ut_ad(!(block->used() % sizeof(*slot)));
-
- while (slot-- != start) {
-
- if (!functor(slot)) {
- return(false);
- }
- }
-
- return(true);
- }
-
- Functor functor;
-};
-
-/** Find specific object */
-struct Find {
-
- /** Constructor */
- Find(const void* object, ulint type)
- :
- m_slot(),
- m_type(type),
- m_object(object)
- {
- ut_a(object != NULL);
- }
-
- /** @return false if the object was found. */
- bool operator()(mtr_memo_slot_t* slot)
- {
- if (m_object == slot->object && m_type == slot->type) {
- m_slot = slot;
- return(false);
- }
-
- return(true);
- }
-
- /** Slot if found */
- mtr_memo_slot_t*m_slot;
-
- /** Type of the object to look for */
- const ulint m_type;
-
- /** The object instance to look for */
- const void* m_object;
-};
-
-/** Find a page frame */
-struct FindPage
+void mtr_memo_slot_t::release() const
{
- /** Constructor
- @param[in] ptr pointer to within a page frame
- @param[in] flags MTR_MEMO flags to look for */
- FindPage(const void* ptr, ulint flags)
- : m_ptr(ptr), m_flags(flags), m_slot(NULL)
- {
- /* There must be some flags to look for. */
- ut_ad(flags);
- /* We can only look for page-related flags. */
- ut_ad(!(flags & ulint(~(MTR_MEMO_PAGE_S_FIX
- | MTR_MEMO_PAGE_X_FIX
- | MTR_MEMO_PAGE_SX_FIX
- | MTR_MEMO_BUF_FIX
- | MTR_MEMO_MODIFY))));
- }
+ ut_ad(object);
- /** Visit a memo entry.
- @param[in] slot memo entry to visit
- @retval false if a page was found
- @retval true if the iteration should continue */
- bool operator()(mtr_memo_slot_t* slot)
- {
- ut_ad(m_slot == NULL);
-
- if (!(m_flags & slot->type) || slot->object == NULL) {
- return(true);
- }
-
- buf_block_t* block = reinterpret_cast<buf_block_t*>(
- slot->object);
-
- if (m_ptr < block->frame
- || m_ptr >= block->frame + srv_page_size) {
- return(true);
- }
-
- ut_ad(!(m_flags & (MTR_MEMO_PAGE_S_FIX
- | MTR_MEMO_PAGE_SX_FIX
- | MTR_MEMO_PAGE_X_FIX))
- || rw_lock_own_flagged(&block->lock, m_flags));
-
- m_slot = slot;
- return(false);
- }
-
- /** @return the slot that was found */
- mtr_memo_slot_t* get_slot() const
- {
- ut_ad(m_slot != NULL);
- return(m_slot);
- }
- /** @return the block that was found */
- buf_block_t* get_block() const
- {
- return(reinterpret_cast<buf_block_t*>(get_slot()->object));
- }
-private:
- /** Pointer inside a page frame to look for */
- const void*const m_ptr;
- /** MTR_MEMO flags to look for */
- const ulint m_flags;
- /** The slot corresponding to m_ptr */
- mtr_memo_slot_t* m_slot;
-};
-
-/** Release latches and decrement the buffer fix count.
-@param slot memo slot */
-static void memo_slot_release(mtr_memo_slot_t *slot)
-{
- switch (slot->type) {
+ switch (type) {
case MTR_MEMO_S_LOCK:
- rw_lock_s_unlock(reinterpret_cast<rw_lock_t*>(slot->object));
+ static_cast<index_lock*>(object)->s_unlock();
break;
+ case MTR_MEMO_X_LOCK:
case MTR_MEMO_SX_LOCK:
- rw_lock_sx_unlock(reinterpret_cast<rw_lock_t*>(slot->object));
+ static_cast<index_lock*>(object)->
+ u_or_x_unlock(type == MTR_MEMO_SX_LOCK);
break;
case MTR_MEMO_SPACE_X_LOCK:
- {
- fil_space_t *space= static_cast<fil_space_t*>(slot->object);
- space->set_committed_size();
- rw_lock_x_unlock(&space->latch);
- }
- break;
- case MTR_MEMO_X_LOCK:
- rw_lock_x_unlock(reinterpret_cast<rw_lock_t*>(slot->object));
+ static_cast<fil_space_t*>(object)->set_committed_size();
+ static_cast<fil_space_t*>(object)->x_unlock();
break;
default:
-#ifdef UNIV_DEBUG
- switch (slot->type & ~MTR_MEMO_MODIFY) {
- case MTR_MEMO_BUF_FIX:
+ buf_page_t *bpage= static_cast<buf_page_t*>(object);
+ ut_d(const auto s=)
+ bpage->unfix();
+ ut_ad(s < buf_page_t::READ_FIX || s >= buf_page_t::WRITE_FIX);
+ switch (auto latch= type & ~MTR_MEMO_MODIFY) {
case MTR_MEMO_PAGE_S_FIX:
+ bpage->lock.s_unlock();
+ return;
case MTR_MEMO_PAGE_SX_FIX:
case MTR_MEMO_PAGE_X_FIX:
- break;
- default:
- ut_ad("invalid type" == 0);
- break;
+ bpage->lock.u_or_x_unlock(latch == MTR_MEMO_PAGE_SX_FIX);
+ /* fall through */
+ case MTR_MEMO_BUF_FIX:
+ return;
}
-#endif /* UNIV_DEBUG */
- buf_block_t *block= reinterpret_cast<buf_block_t*>(slot->object);
- buf_page_release_latch(block, slot->type & ~MTR_MEMO_MODIFY);
- block->unfix();
- break;
+ ut_ad("invalid type" == 0);
}
- slot->object= nullptr;
}
-/** Release the latches acquired by the mini-transaction. */
-struct ReleaseLatches {
- /** @return true always. */
- bool operator()(mtr_memo_slot_t *slot) const
- {
- if (!slot->object)
- return true;
- switch (slot->type) {
- case MTR_MEMO_S_LOCK:
- rw_lock_s_unlock(reinterpret_cast<rw_lock_t*>(slot->object));
- break;
- case MTR_MEMO_SPACE_X_LOCK:
- {
- fil_space_t *space= static_cast<fil_space_t*>(slot->object);
- space->set_committed_size();
- rw_lock_x_unlock(&space->latch);
- }
- break;
- case MTR_MEMO_X_LOCK:
- rw_lock_x_unlock(reinterpret_cast<rw_lock_t*>(slot->object));
- break;
- case MTR_MEMO_SX_LOCK:
- rw_lock_sx_unlock(reinterpret_cast<rw_lock_t*>(slot->object));
- break;
- default:
-#ifdef UNIV_DEBUG
- switch (slot->type & ~MTR_MEMO_MODIFY) {
- case MTR_MEMO_BUF_FIX:
- case MTR_MEMO_PAGE_S_FIX:
- case MTR_MEMO_PAGE_SX_FIX:
- case MTR_MEMO_PAGE_X_FIX:
- break;
- default:
- ut_ad("invalid type" == 0);
- break;
- }
-#endif /* UNIV_DEBUG */
- buf_block_t *block= reinterpret_cast<buf_block_t*>(slot->object);
- buf_page_release_latch(block, slot->type & ~MTR_MEMO_MODIFY);
- block->unfix();
- break;
- }
- slot->object= NULL;
- return true;
- }
-};
-
-/** Release the latches and blocks acquired by the mini-transaction. */
-struct ReleaseAll {
- /** @return true always. */
- bool operator()(mtr_memo_slot_t *slot) const
- {
- if (slot->object)
- memo_slot_release(slot);
- return true;
- }
-};
-
-/** Stops iteration is savepoint is reached */
-template <typename Functor> struct TillSavepoint
-{
-
- /** Constructor
- @param[in] functor functor which is called if savepoint is not reached
- @param[in] savepoint savepoint value to rollback
- @param[in] used current position in slots container */
- TillSavepoint(const Functor &functor, ulint savepoint, ulint used)
- : functor(functor),
- m_slots_count((used - savepoint) / sizeof(mtr_memo_slot_t))
- {
- ut_ad(savepoint);
- ut_ad(used >= savepoint);
- }
-
- /** @return true if savepoint is not reached, false otherwise */
- bool operator()(mtr_memo_slot_t *slot)
- {
-#ifdef UNIV_DEBUG
- /** This check is added because the code is invoked only from
- row_search_mvcc() to release latches acquired during clustered index search
- for secondary index record. To make it more universal we could add one more
- member in this functor for debug build to pass only certain slot types,
- but this is currently not necessary. */
- switch (slot->type)
- {
- case MTR_MEMO_S_LOCK:
- case MTR_MEMO_PAGE_S_FIX:
- break;
- default:
- ut_a(false);
- }
-#endif
- return m_slots_count-- && functor(slot);
- }
-
-private:
- /** functor to invoke */
- const Functor &functor;
- /** slots count left till savepoint */
- ulint m_slots_count;
-};
-
-#ifdef UNIV_DEBUG
-/** Check that all slots have been handled. */
-struct DebugCheck {
- /** @return true always. */
- bool operator()(const mtr_memo_slot_t* slot) const
- {
- ut_ad(!slot->object);
- return(true);
- }
-};
-#endif
-
-/** Release page latches held by the mini-transaction. */
-struct ReleaseBlocks
-{
- const lsn_t start, end;
- ReleaseBlocks(lsn_t start, lsn_t end) : start(start), end(end) {}
-
- /** @return true always */
- bool operator()(mtr_memo_slot_t *slot) const
- {
- if (!slot->object)
- return true;
- switch (slot->type) {
- case MTR_MEMO_PAGE_X_MODIFY:
- case MTR_MEMO_PAGE_SX_MODIFY:
- break;
- default:
- ut_ad(!(slot->type & MTR_MEMO_MODIFY));
- return true;
- }
-
- buf_flush_note_modification(static_cast<buf_block_t*>(slot->object),
- start, end);
- return true;
- }
-};
+mtr_t::mtr_t()= default;
+mtr_t::~mtr_t()= default;
/** Start a mini-transaction. */
void mtr_t::start()
{
+ ut_ad(m_memo.empty());
ut_ad(!m_freed_pages);
ut_ad(!m_freed_space);
MEM_UNDEFINED(this, sizeof *this);
+ MEM_MAKE_DEFINED(&m_memo, sizeof m_memo);
MEM_MAKE_DEFINED(&m_freed_space, sizeof m_freed_space);
MEM_MAKE_DEFINED(&m_freed_pages, sizeof m_freed_pages);
ut_d(m_start= true);
ut_d(m_commit= false);
+ ut_d(m_freeing_tree= false);
m_last= nullptr;
m_last_offset= 0;
- new(&m_memo) mtr_buf_t();
new(&m_log) mtr_buf_t();
m_made_dirty= false;
@@ -415,12 +112,18 @@ void mtr_t::start()
inline void mtr_t::release_resources()
{
ut_ad(is_active());
- ut_d(m_memo.for_each_block_in_reverse(CIterate<DebugCheck>()));
+ ut_ad(m_memo.empty());
m_log.erase();
- m_memo.erase();
ut_d(m_commit= true);
}
+void mtr_t::release()
+{
+ for (auto it= m_memo.rbegin(); it != m_memo.rend(); it++)
+ it->release();
+ m_memo.clear();
+}
+
/** Commit a mini-transaction. */
void mtr_t::commit()
{
@@ -437,7 +140,7 @@ void mtr_t::commit()
std::pair<lsn_t,page_flush_ahead> lsns;
- if (UNIV_LIKELY(m_log_mode == MTR_LOG_ALL))
+ if (UNIV_LIKELY(is_logged()))
{
lsns= do_write();
@@ -463,7 +166,7 @@ void mtr_t::commit()
{
ut_ad(!m_freed_pages->empty());
ut_ad(m_freed_space);
- ut_ad(memo_contains(*m_freed_space));
+ ut_ad(m_freed_space->is_owner());
ut_ad(is_named_space(m_freed_space));
/* Update the last freed lsn */
m_freed_space->update_last_freed_lsn(m_commit_lsn);
@@ -481,12 +184,21 @@ void mtr_t::commit()
else
ut_ad(!m_freed_space);
- m_memo.for_each_block_in_reverse
- (CIterate<const ReleaseBlocks>(ReleaseBlocks(lsns.first, m_commit_lsn)));
+ for (const mtr_memo_slot_t &slot : m_memo)
+ {
+ if (slot.type & MTR_MEMO_MODIFY)
+ {
+ ut_ad(slot.type == MTR_MEMO_PAGE_X_MODIFY ||
+ slot.type == MTR_MEMO_PAGE_SX_MODIFY);
+ buf_flush_note_modification(static_cast<buf_block_t*>(slot.object),
+ lsns.first, m_commit_lsn);
+ }
+ }
+
if (m_made_dirty)
mysql_mutex_unlock(&log_sys.flush_order_mutex);
- m_memo.for_each_block_in_reverse(CIterate<ReleaseLatches>());
+ release();
if (UNIV_UNLIKELY(lsns.second != PAGE_FLUSH_NO))
buf_flush_ahead(m_commit_lsn, lsns.second == PAGE_FLUSH_SYNC);
@@ -495,117 +207,220 @@ void mtr_t::commit()
srv_stats.log_write_requests.inc();
}
else
- m_memo.for_each_block_in_reverse(CIterate<ReleaseAll>());
+ {
+ if (m_freed_pages)
+ {
+ ut_ad(!m_freed_pages->empty());
+ ut_ad(m_freed_space == fil_system.temp_space);
+ ut_ad(!is_trim_pages());
+ for (const auto &range : *m_freed_pages)
+ m_freed_space->add_free_range(range);
+ delete m_freed_pages;
+ m_freed_pages= nullptr;
+ m_freed_space= nullptr;
+ }
+ release();
+ }
release_resources();
}
-/** Release latches till savepoint. To simplify the code only
-MTR_MEMO_S_LOCK and MTR_MEMO_PAGE_S_FIX slot types are allowed to be
-released, otherwise it would be neccesary to add one more argument in the
-function to point out what slot types are allowed for rollback, and this
-would be overengineering as corrently the function is used only in one place
-in the code.
-@param savepoint savepoint, can be obtained with get_savepoint */
-void mtr_t::rollback_to_savepoint(ulint savepoint)
+void mtr_t::rollback_to_savepoint(ulint begin, ulint end)
{
- Iterate<TillSavepoint<ReleaseLatches>> iteration(
- TillSavepoint<ReleaseLatches>(ReleaseLatches(), savepoint,
- get_savepoint()));
- m_memo.for_each_block_in_reverse(iteration);
+ ut_ad(end <= m_memo.size());
+ ut_ad(begin <= end);
+ ulint s= end;
+
+ while (s-- > begin)
+ {
+ const mtr_memo_slot_t &slot= m_memo[s];
+ ut_ad(slot.object);
+ /* This is intended for releasing latches on indexes or unmodified
+ buffer pool pages. */
+ ut_ad(slot.type <= MTR_MEMO_SX_LOCK);
+ ut_ad(!(slot.type & MTR_MEMO_MODIFY));
+ slot.release();
+ }
+
+ m_memo.erase(m_memo.begin() + begin, m_memo.begin() + end);
}
-/** Shrink a tablespace. */
-struct Shrink
+/** Commit a mini-transaction that is shrinking a tablespace.
+@param space tablespace that is being shrunk */
+void mtr_t::commit_shrink(fil_space_t &space)
{
- /** the first non-existing page in the tablespace */
- const page_id_t high;
+ ut_ad(is_active());
+ ut_ad(!is_inside_ibuf());
+ ut_ad(!high_level_read_only);
+ ut_ad(m_modifications);
+ ut_ad(m_made_dirty);
+ ut_ad(!m_memo.empty());
+ ut_ad(!recv_recovery_is_on());
+ ut_ad(m_log_mode == MTR_LOG_ALL);
+ ut_ad(!m_freed_pages);
+ ut_ad(UT_LIST_GET_LEN(space.chain) == 1);
+
+ log_write_and_flush_prepare();
+
+ const lsn_t start_lsn= do_write().first;
+ ut_d(m_log.erase());
+
+ mysql_mutex_lock(&log_sys.flush_order_mutex);
+ /* Durably write the reduced FSP_SIZE before truncating the data file. */
+ log_write_and_flush();
+
+ os_file_truncate(space.chain.start->name, space.chain.start->handle,
+ os_offset_t{space.size} << srv_page_size_shift, true);
+
+ space.clear_freed_ranges();
- Shrink(const fil_space_t &space) : high({space.id, space.size}) {}
+ const page_id_t high{space.id, space.size};
- bool operator()(mtr_memo_slot_t *slot) const
+ for (mtr_memo_slot_t &slot : m_memo)
{
- if (!slot->object)
- return true;
- switch (slot->type) {
+ ut_ad(slot.object);
+ switch (slot.type) {
default:
ut_ad("invalid type" == 0);
- return false;
+ break;
case MTR_MEMO_SPACE_X_LOCK:
- ut_ad(high.space() == static_cast<fil_space_t*>(slot->object)->id);
- return true;
+ ut_ad(high.space() == static_cast<fil_space_t*>(slot.object)->id);
+ break;
case MTR_MEMO_PAGE_X_MODIFY:
case MTR_MEMO_PAGE_SX_MODIFY:
case MTR_MEMO_PAGE_X_FIX:
case MTR_MEMO_PAGE_SX_FIX:
- auto &bpage= static_cast<buf_block_t*>(slot->object)->page;
- ut_ad(bpage.io_fix() == BUF_IO_NONE);
- const auto id= bpage.id();
+ auto &block= *static_cast<buf_block_t*>(slot.object);
+ const auto s= block.page.state();
+ ut_ad(s >= buf_page_t::FREED);
+ ut_ad(s < buf_page_t::READ_FIX);
+ ut_ad(block.page.frame);
+ const page_id_t id{block.page.id()};
if (id < high)
{
ut_ad(id.space() == high.space() ||
(id == page_id_t{0, TRX_SYS_PAGE_NO} &&
srv_is_undo_tablespace(high.space())));
- break;
+ if (slot.type & MTR_MEMO_MODIFY)
+ buf_flush_note_modification(&block, start_lsn, m_commit_lsn);
+ }
+ else
+ {
+ ut_ad(id.space() == high.space());
+ if (s >= buf_page_t::UNFIXED)
+ block.page.set_freed(s);
+ if (block.page.oldest_modification() > 1)
+ block.page.reset_oldest_modification();
+ slot.type= mtr_memo_type_t(slot.type & ~MTR_MEMO_MODIFY);
}
- ut_ad(id.space() == high.space());
- ut_ad(bpage.state() == BUF_BLOCK_FILE_PAGE);
- if (bpage.oldest_modification() > 1)
- bpage.clear_oldest_modification(false);
- slot->type= static_cast<mtr_memo_type_t>(slot->type & ~MTR_MEMO_MODIFY);
}
- return true;
}
-};
-/** Commit a mini-transaction that is shrinking a tablespace.
-@param space tablespace that is being shrunk */
-void mtr_t::commit_shrink(fil_space_t &space)
+ mysql_mutex_unlock(&log_sys.flush_order_mutex);
+
+ mysql_mutex_lock(&fil_system.mutex);
+ ut_ad(space.is_being_truncated);
+ ut_ad(space.is_stopping());
+ space.clear_stopping();
+ space.is_being_truncated= false;
+ mysql_mutex_unlock(&fil_system.mutex);
+
+ release();
+ release_resources();
+ srv_stats.log_write_requests.inc();
+}
+
+/** Commit a mini-transaction that is deleting or renaming a file.
+@param space tablespace that is being renamed or deleted
+@param name new file name (nullptr=the file will be deleted)
+@param detached_handle if detached_handle != nullptr and if space is detached
+ during the function execution the file handle if its
+ node will be set to OS_FILE_CLOSED, and the previous
+ value of the file handle will be assigned to the
+ address, pointed by detached_handle.
+@return whether the operation succeeded */
+bool mtr_t::commit_file(fil_space_t &space, const char *name,
+ pfs_os_file_t *detached_handle)
{
ut_ad(is_active());
ut_ad(!is_inside_ibuf());
ut_ad(!high_level_read_only);
ut_ad(m_modifications);
- ut_ad(m_made_dirty);
+ ut_ad(!m_made_dirty);
ut_ad(!recv_recovery_is_on());
ut_ad(m_log_mode == MTR_LOG_ALL);
ut_ad(UT_LIST_GET_LEN(space.chain) == 1);
log_write_and_flush_prepare();
- const lsn_t start_lsn= do_write().first;
+ do_write();
- mysql_mutex_lock(&log_sys.flush_order_mutex);
- /* Durably write the reduced FSP_SIZE before truncating the data file. */
+ mysql_mutex_assert_owner(&log_sys.mutex);
+
+ if (!name && space.max_lsn)
+ {
+ ut_d(space.max_lsn= 0);
+ fil_system.named_spaces.remove(space);
+ }
+
+ /* Block log_checkpoint(). */
+ mysql_mutex_lock(&buf_pool.flush_list_mutex);
+
+ /* Durably write the log for the file system operation. */
log_write_and_flush();
- ut_ad(!m_freed_pages);
+ char *old_name= space.chain.start->name;
+ bool success;
- space.clear_freed_ranges();
+ if (name)
+ {
+ char *new_name= mem_strdup(name);
+ mysql_mutex_lock(&fil_system.mutex);
+ success= os_file_rename(innodb_data_file_key, old_name, name);
+ if (success)
+ space.chain.start->name= new_name;
+ else
+ old_name= new_name;
+ mysql_mutex_unlock(&fil_system.mutex);
+ ut_free(old_name);
+ }
+ else
+ {
+ /* Remove any additional files. */
+ if (char *cfg_name= fil_make_filepath(old_name,
+ fil_space_t::name_type{}, CFG,
+ false))
+ {
+ os_file_delete_if_exists(innodb_data_file_key, cfg_name, nullptr);
+ ut_free(cfg_name);
+ }
- m_memo.for_each_block_in_reverse(CIterate<Shrink>{space});
+ if (FSP_FLAGS_HAS_DATA_DIR(space.flags))
+ RemoteDatafile::delete_link_file(space.name());
- m_memo.for_each_block_in_reverse(CIterate<const ReleaseBlocks>
- (ReleaseBlocks(start_lsn, m_commit_lsn)));
- mysql_mutex_unlock(&log_sys.flush_order_mutex);
+ /* Remove the directory entry. The file will actually be deleted
+ when our caller closes the handle. */
+ os_file_delete(innodb_data_file_key, old_name);
- mutex_enter(&fil_system.mutex);
- ut_ad(space.is_being_truncated);
- ut_ad(space.is_stopping());
- space.set_stopping(false);
- space.is_being_truncated= false;
- mutex_exit(&fil_system.mutex);
+ mysql_mutex_lock(&fil_system.mutex);
+ /* Sanity checks after reacquiring fil_system.mutex */
+ ut_ad(&space == fil_space_get_by_id(space.id));
+ ut_ad(!space.referenced());
+ ut_ad(space.is_stopping());
- /* Truncate the file before releasing the space.latch. File extension
- (and any allocation of pages beyond the current intended end of the file)
- is covered by exclusive space.latch, which we are still holding here. */
- os_file_truncate(space.chain.start->name, space.chain.start->handle,
- os_offset_t{space.size} << srv_page_size_shift, true);
+ pfs_os_file_t handle = fil_system.detach(&space, true);
+ if (detached_handle)
+ *detached_handle = handle;
+ mysql_mutex_unlock(&fil_system.mutex);
- m_memo.for_each_block_in_reverse(CIterate<ReleaseLatches>());
- srv_stats.log_write_requests.inc();
+ success= true;
+ }
+ mysql_mutex_unlock(&buf_pool.flush_list_mutex);
release_resources();
+
+ srv_stats.log_write_requests.inc();
+ return success;
}
/** Commit a mini-transaction that did not modify any pages,
@@ -621,7 +436,6 @@ void mtr_t::commit_files(lsn_t checkpoint_lsn)
ut_ad(!is_inside_ibuf());
ut_ad(m_log_mode == MTR_LOG_ALL);
ut_ad(!m_made_dirty);
- ut_ad(m_memo.size() == 0);
ut_ad(!srv_read_only_mode);
ut_ad(!m_freed_space);
ut_ad(!m_freed_pages);
@@ -656,19 +470,9 @@ void mtr_t::commit_files(lsn_t checkpoint_lsn)
bool
mtr_t::is_named_space(ulint space) const
{
- ut_ad(!m_user_space || m_user_space->id != TRX_SYS_SPACE);
-
- switch (m_log_mode) {
- case MTR_LOG_NONE:
- case MTR_LOG_NO_REDO:
- return(true);
- case MTR_LOG_ALL:
- return(m_user_space_id == space
- || is_predefined_tablespace(space));
- }
-
- ut_error;
- return(false);
+ ut_ad(!m_user_space || m_user_space->id != TRX_SYS_SPACE);
+ return !is_logged() || m_user_space_id == space ||
+ is_predefined_tablespace(space);
}
/** Check if a tablespace is associated with the mini-transaction
(needed for generating a FILE_MODIFY record)
@@ -678,27 +482,16 @@ bool mtr_t::is_named_space(const fil_space_t* space) const
{
ut_ad(!m_user_space || m_user_space->id != TRX_SYS_SPACE);
- switch (m_log_mode) {
- case MTR_LOG_NONE:
- case MTR_LOG_NO_REDO:
- return true;
- case MTR_LOG_ALL:
- return m_user_space == space || is_predefined_tablespace(space->id);
- }
-
- ut_error;
- return false;
+ return !is_logged() || m_user_space == space ||
+ is_predefined_tablespace(space->id);
}
#endif /* UNIV_DEBUG */
/** Acquire a tablespace X-latch.
-NOTE: use mtr_x_lock_space().
@param[in] space_id tablespace ID
-@param[in] file file name from where called
-@param[in] line line number in file
@return the tablespace object (never NULL) */
fil_space_t*
-mtr_t::x_lock_space(ulint space_id, const char* file, unsigned line)
+mtr_t::x_lock_space(ulint space_id)
{
fil_space_t* space;
@@ -716,29 +509,39 @@ mtr_t::x_lock_space(ulint space_id, const char* file, unsigned line)
ut_ad(space);
ut_ad(space->id == space_id);
- x_lock_space(space, file, line);
+ x_lock_space(space);
return(space);
}
-/** Release an object in the memo stack.
-@return true if released */
-bool
-mtr_t::memo_release(const void* object, ulint type)
+/** Acquire an exclusive tablespace latch.
+@param space tablespace */
+void mtr_t::x_lock_space(fil_space_t *space)
{
- ut_ad(is_active());
-
- /* We cannot release a page that has been written to in the
- middle of a mini-transaction. */
- ut_ad(!m_modifications || type != MTR_MEMO_PAGE_X_FIX);
-
- Iterate<Find> iteration(Find(object, type));
+ ut_ad(space->purpose == FIL_TYPE_TEMPORARY ||
+ space->purpose == FIL_TYPE_IMPORT ||
+ space->purpose == FIL_TYPE_TABLESPACE);
+ if (!memo_contains(*space))
+ {
+ memo_push(space, MTR_MEMO_SPACE_X_LOCK);
+ space->x_lock();
+ }
+}
- if (!m_memo.for_each_block_in_reverse(iteration)) {
- memo_slot_release(iteration.functor.m_slot);
- return(true);
- }
+void mtr_t::release(const void *object)
+{
+ ut_ad(is_active());
- return(false);
+ auto it=
+ std::find_if(m_memo.begin(), m_memo.end(),
+ [object](const mtr_memo_slot_t& slot)
+ { return slot.object == object; });
+ ut_ad(it != m_memo.end());
+ ut_ad(!(it->type & MTR_MEMO_MODIFY));
+ it->release();
+ m_memo.erase(it, it + 1);
+ ut_ad(std::find_if(m_memo.begin(), m_memo.end(),
+ [object](const mtr_memo_slot_t& slot)
+ { return slot.object == &object; }) == m_memo.end());
}
static bool log_margin_warned;
@@ -931,6 +734,49 @@ static mtr_t::page_flush_ahead log_close(lsn_t lsn)
return mtr_t::PAGE_FLUSH_SYNC;
}
+inline void mtr_t::page_checksum(const buf_page_t &bpage)
+{
+ const byte *page= bpage.frame;
+ size_t size= srv_page_size;
+
+ if (UNIV_LIKELY_NULL(bpage.zip.data))
+ {
+ size= (UNIV_ZIP_SIZE_MIN >> 1) << bpage.zip.ssize;
+ switch (fil_page_get_type(bpage.zip.data)) {
+ case FIL_PAGE_TYPE_ALLOCATED:
+ case FIL_PAGE_INODE:
+ case FIL_PAGE_IBUF_BITMAP:
+ case FIL_PAGE_TYPE_FSP_HDR:
+ case FIL_PAGE_TYPE_XDES:
+ /* These are essentially uncompressed pages. */
+ break;
+ default:
+ page= bpage.zip.data;
+ }
+ }
+
+ /* We have to exclude from the checksum the normal
+ page checksum that is written by buf_flush_init_for_writing()
+ and FIL_PAGE_LSN which would be updated once we have actually
+ allocated the LSN.
+
+ Unfortunately, we cannot access fil_space_t easily here. In order to
+ be compatible with encrypted tablespaces in the pre-full_crc32
+ format we will unconditionally exclude the 8 bytes at
+ FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION
+ a.k.a. FIL_RTREE_SPLIT_SEQ_NUM. */
+ const uint32_t checksum=
+ my_crc32c(my_crc32c(my_crc32c(0, page + FIL_PAGE_OFFSET,
+ FIL_PAGE_LSN - FIL_PAGE_OFFSET),
+ page + FIL_PAGE_TYPE, 2),
+ page + FIL_PAGE_SPACE_ID, size - (FIL_PAGE_SPACE_ID + 8));
+
+ byte *l= log_write<OPTION>(bpage.id(), nullptr, 5, true, 0);
+ *l++= OPT_PAGE_CHECKSUM;
+ mach_write_to_4(l, checksum);
+ m_log.close(l + 4);
+}
+
/** Write the block contents to the REDO log */
struct mtr_write_log
{
@@ -945,40 +791,56 @@ struct mtr_write_log
std::pair<lsn_t,mtr_t::page_flush_ahead> mtr_t::do_write()
{
- ut_ad(!recv_no_log_write);
- ut_ad(m_log_mode == MTR_LOG_ALL);
+ ut_ad(!recv_no_log_write);
+ ut_ad(is_logged());
- ulint len = m_log.size();
- ut_ad(len > 0);
+ ulint len= m_log.size();
+ ut_ad(len);
- if (len > srv_log_buffer_size / 2) {
- log_buffer_extend(ulong((len + 1) * 2));
- }
+#ifndef DBUG_OFF
+ do
+ {
+ if (m_log_mode != MTR_LOG_ALL)
+ continue;
+ DBUG_EXECUTE_IF("skip_page_checksum", continue;);
- fil_space_t* space = m_user_space;
+ for (const mtr_memo_slot_t& slot : m_memo)
+ if (slot.type & MTR_MEMO_MODIFY)
+ {
+ const buf_page_t &b= *static_cast<const buf_page_t*>(slot.object);
+ if (!b.is_freed())
+ page_checksum(b);
+ }
+ len= m_log.size();
+ }
+ while (0);
+#endif
- if (space != NULL && is_predefined_tablespace(space->id)) {
- /* Omit FILE_MODIFY for predefined tablespaces. */
- space = NULL;
- }
+ if (len > srv_log_buffer_size / 2)
+ log_buffer_extend(ulong((len + 1) * 2));
- mysql_mutex_lock(&log_sys.mutex);
+ fil_space_t *space= m_user_space;
- if (fil_names_write_if_was_clean(space)) {
- len = m_log.size();
- } else {
- /* This was not the first time of dirtying a
- tablespace since the latest checkpoint. */
- ut_ad(len == m_log.size());
- }
+ if (space && is_predefined_tablespace(space->id))
+ /* Omit FILE_MODIFY for predefined tablespaces. */
+ space= nullptr;
+
+ mysql_mutex_lock(&log_sys.mutex);
- *m_log.push<byte*>(1) = 0;
- len++;
+ if (fil_names_write_if_was_clean(space))
+ len= m_log.size();
+ else
+ /* This was not the first time of dirtying a
+ tablespace since the latest checkpoint. */
+ ut_ad(len == m_log.size());
- /* check and attempt a checkpoint if exceeding capacity */
- log_margin_checkpoint_age(len);
+ *m_log.push<byte*>(1)= 0;
+ len++;
- return finish_write(len);
+ /* check and attempt a checkpoint if exceeding capacity */
+ log_margin_checkpoint_age(len);
+
+ return finish_write(len);
}
/** Append the redo log records to the redo log buffer.
@@ -986,7 +848,7 @@ std::pair<lsn_t,mtr_t::page_flush_ahead> mtr_t::do_write()
@return {start_lsn,flush_ahead} */
inline std::pair<lsn_t,mtr_t::page_flush_ahead> mtr_t::finish_write(ulint len)
{
- ut_ad(m_log_mode == MTR_LOG_ALL);
+ ut_ad(is_logged());
mysql_mutex_assert_owner(&log_sys.mutex);
ut_ad(m_log.size() == len);
ut_ad(len > 0);
@@ -1017,217 +879,427 @@ piecewise:
return std::make_pair(start_lsn, flush);
}
-/** Find out whether a block was not X-latched by the mini-transaction */
-struct FindBlockX
+bool mtr_t::have_x_latch(const buf_block_t &block) const
{
- const buf_block_t &block;
-
- FindBlockX(const buf_block_t &block): block(block) {}
+ ut_d(const mtr_memo_slot_t *found= nullptr);
- /** @return whether the block was not found x-latched */
- bool operator()(const mtr_memo_slot_t *slot) const
+ for (const mtr_memo_slot_t &slot : m_memo)
{
- return slot->object != &block || slot->type != MTR_MEMO_PAGE_X_FIX;
- }
-};
+ if (slot.object != &block)
+ continue;
-#ifdef UNIV_DEBUG
-/** Assert that the block is not present in the mini-transaction */
-struct FindNoBlock
-{
- const buf_block_t &block;
+ ut_d(found= &slot);
+
+ if (!(slot.type & MTR_MEMO_PAGE_X_FIX))
+ continue;
- FindNoBlock(const buf_block_t &block): block(block) {}
+ ut_ad(block.page.lock.have_x());
+ return true;
+ }
+
+ ut_ad(!found);
+ return false;
+}
- /** @return whether the block was not found */
- bool operator()(const mtr_memo_slot_t *slot) const
+bool mtr_t::have_u_or_x_latch(const buf_block_t &block) const
+{
+ for (const mtr_memo_slot_t &slot : m_memo)
{
- return slot->object != &block;
+ if (slot.object == &block &&
+ slot.type & (MTR_MEMO_PAGE_X_FIX | MTR_MEMO_PAGE_SX_FIX))
+ {
+ ut_ad(block.page.lock.have_u_or_x());
+ return true;
+ }
}
-};
-#endif /* UNIV_DEBUG */
+ return false;
+}
-bool mtr_t::have_x_latch(const buf_block_t &block) const
+/** Check if we are holding exclusive tablespace latch
+@param space tablespace to search for
+@return whether space.latch is being held */
+bool mtr_t::memo_contains(const fil_space_t& space) const
{
- if (m_memo.for_each_block(CIterate<FindBlockX>(FindBlockX(block))))
+ for (const mtr_memo_slot_t &slot : m_memo)
{
- ut_ad(m_memo.for_each_block(CIterate<FindNoBlock>(FindNoBlock(block))));
- ut_ad(!memo_contains_flagged(&block,
- MTR_MEMO_PAGE_S_FIX | MTR_MEMO_PAGE_SX_FIX |
- MTR_MEMO_BUF_FIX | MTR_MEMO_MODIFY));
- return false;
+ if (slot.object == &space && slot.type == MTR_MEMO_SPACE_X_LOCK)
+ {
+ ut_ad(space.is_owner());
+ return true;
+ }
}
- ut_ad(rw_lock_own(&block.lock, RW_LOCK_X));
- return true;
+
+ return false;
}
-#ifdef UNIV_DEBUG
-/** Check if we are holding an rw-latch in this mini-transaction
-@param lock latch to search for
-@param type held latch type
-@return whether (lock,type) is contained */
-bool mtr_t::memo_contains(const rw_lock_t &lock, mtr_memo_type_t type)
+void mtr_t::page_lock_upgrade(const buf_block_t &block)
{
- Iterate<Find> iteration(Find(&lock, type));
- if (m_memo.for_each_block_in_reverse(iteration))
- return false;
+ ut_ad(block.page.lock.have_x());
- switch (type) {
- case MTR_MEMO_X_LOCK:
- ut_ad(rw_lock_own(&lock, RW_LOCK_X));
- break;
- case MTR_MEMO_SX_LOCK:
- ut_ad(rw_lock_own(&lock, RW_LOCK_SX));
+ for (mtr_memo_slot_t &slot : m_memo)
+ if (slot.object == &block && slot.type & MTR_MEMO_PAGE_SX_FIX)
+ slot.type= mtr_memo_type_t(slot.type ^
+ (MTR_MEMO_PAGE_SX_FIX | MTR_MEMO_PAGE_X_FIX));
+
+#ifdef BTR_CUR_HASH_ADAPT
+ ut_ad(!block.index || !block.index->freed());
+#endif /* BTR_CUR_HASH_ADAPT */
+}
+
+/** Latch a buffer pool block.
+@param block block to be latched
+@param rw_latch RW_S_LATCH, RW_SX_LATCH, RW_X_LATCH, RW_NO_LATCH */
+void mtr_t::page_lock(buf_block_t *block, ulint rw_latch)
+{
+ mtr_memo_type_t fix_type;
+ ut_d(const auto state= block->page.state());
+ ut_ad(state > buf_page_t::FREED);
+ ut_ad(state > buf_page_t::WRITE_FIX || state < buf_page_t::READ_FIX);
+ switch (rw_latch) {
+ case RW_NO_LATCH:
+ fix_type= MTR_MEMO_BUF_FIX;
+ goto done;
+ case RW_S_LATCH:
+ fix_type= MTR_MEMO_PAGE_S_FIX;
+ block->page.lock.s_lock();
break;
- case MTR_MEMO_S_LOCK:
- ut_ad(rw_lock_own(&lock, RW_LOCK_S));
+ case RW_SX_LATCH:
+ fix_type= MTR_MEMO_PAGE_SX_FIX;
+ block->page.lock.u_lock();
+ ut_ad(!block->page.is_io_fixed());
break;
default:
- break;
+ ut_ad(rw_latch == RW_X_LATCH);
+ fix_type= MTR_MEMO_PAGE_X_FIX;
+ if (block->page.lock.x_lock_upgraded())
+ {
+ block->unfix();
+ page_lock_upgrade(*block);
+ return;
+ }
+ ut_ad(!block->page.is_io_fixed());
}
- return true;
-}
+#ifdef BTR_CUR_HASH_ADAPT
+ btr_search_drop_page_hash_index(block, true);
+#endif
-/** Check if we are holding exclusive tablespace latch
-@param space tablespace to search for
-@return whether space.latch is being held */
-bool mtr_t::memo_contains(const fil_space_t& space)
-{
- Iterate<Find> iteration(Find(&space, MTR_MEMO_SPACE_X_LOCK));
- if (m_memo.for_each_block_in_reverse(iteration))
- return false;
- ut_ad(rw_lock_own(const_cast<rw_lock_t*>(&space.latch), RW_LOCK_X));
- return true;
+done:
+ ut_ad(state < buf_page_t::UNFIXED ||
+ page_id_t(page_get_space_id(block->page.frame),
+ page_get_page_no(block->page.frame)) == block->page.id());
+ memo_push(block, fix_type);
}
-/** Debug check for flags */
-struct FlaggedCheck {
- FlaggedCheck(const void* ptr, ulint flags)
- :
- m_ptr(ptr),
- m_flags(flags)
- {
- /* There must be some flags to look for. */
- ut_ad(flags);
- /* Look for rw-lock-related and page-related flags. */
- ut_ad(!(flags & ulint(~(MTR_MEMO_PAGE_S_FIX
- | MTR_MEMO_PAGE_X_FIX
- | MTR_MEMO_PAGE_SX_FIX
- | MTR_MEMO_BUF_FIX
- | MTR_MEMO_MODIFY
- | MTR_MEMO_X_LOCK
- | MTR_MEMO_SX_LOCK
- | MTR_MEMO_S_LOCK))));
- /* Either some rw-lock-related or page-related flags
- must be specified, but not both at the same time. */
- ut_ad(!(flags & (MTR_MEMO_PAGE_S_FIX
- | MTR_MEMO_PAGE_X_FIX
- | MTR_MEMO_PAGE_SX_FIX
- | MTR_MEMO_BUF_FIX
- | MTR_MEMO_MODIFY))
- == !!(flags & (MTR_MEMO_X_LOCK
- | MTR_MEMO_SX_LOCK
- | MTR_MEMO_S_LOCK)));
- }
+void mtr_t::upgrade_buffer_fix(ulint savepoint, rw_lock_type_t rw_latch)
+{
+ ut_ad(is_active());
+ mtr_memo_slot_t &slot= m_memo[savepoint];
+ ut_ad(slot.type == MTR_MEMO_BUF_FIX);
+ buf_block_t *block= static_cast<buf_block_t*>(slot.object);
+ ut_d(const auto state= block->page.state());
+ ut_ad(state > buf_page_t::UNFIXED);
+ ut_ad(state > buf_page_t::WRITE_FIX || state < buf_page_t::READ_FIX);
+ static_assert(int{MTR_MEMO_PAGE_S_FIX} == int{RW_S_LATCH}, "");
+ static_assert(int{MTR_MEMO_PAGE_X_FIX} == int{RW_X_LATCH}, "");
+ static_assert(int{MTR_MEMO_PAGE_SX_FIX} == int{RW_SX_LATCH}, "");
+ slot.type= mtr_memo_type_t(rw_latch);
+
+ switch (rw_latch) {
+ default:
+ ut_ad("invalid state" == 0);
+ break;
+ case RW_S_LATCH:
+ block->page.lock.s_lock();
+ break;
+ case RW_SX_LATCH:
+ block->page.lock.u_lock();
+ ut_ad(!block->page.is_io_fixed());
+ break;
+ case RW_X_LATCH:
+ block->page.lock.x_lock();
+ ut_ad(!block->page.is_io_fixed());
+ }
- /** Visit a memo entry.
- @param[in] slot memo entry to visit
- @retval false if m_ptr was found
- @retval true if the iteration should continue */
- bool operator()(const mtr_memo_slot_t* slot) const
- {
- if (m_ptr != slot->object || !(m_flags & slot->type)) {
- return(true);
- }
+#ifdef BTR_CUR_HASH_ADAPT
+ btr_search_drop_page_hash_index(block, true);
+#endif
+ ut_ad(page_id_t(page_get_space_id(block->page.frame),
+ page_get_page_no(block->page.frame)) == block->page.id());
+}
- if (ulint flags = m_flags & (MTR_MEMO_PAGE_S_FIX
- | MTR_MEMO_PAGE_SX_FIX
- | MTR_MEMO_PAGE_X_FIX)) {
- rw_lock_t* lock = &static_cast<buf_block_t*>(
- const_cast<void*>(m_ptr))->lock;
- ut_ad(rw_lock_own_flagged(lock, flags));
- } else {
- rw_lock_t* lock = static_cast<rw_lock_t*>(
- const_cast<void*>(m_ptr));
- ut_ad(rw_lock_own_flagged(lock, m_flags >> 5));
- }
+#ifdef UNIV_DEBUG
+/** Check if we are holding an rw-latch in this mini-transaction
+@param lock latch to search for
+@param type held latch type
+@return whether (lock,type) is contained */
+bool mtr_t::memo_contains(const index_lock &lock, mtr_memo_type_t type) const
+{
+ ut_ad(type == MTR_MEMO_X_LOCK || type == MTR_MEMO_S_LOCK ||
+ type == MTR_MEMO_SX_LOCK);
- return(false);
- }
+ for (const mtr_memo_slot_t &slot : m_memo)
+ {
+ if (slot.object == &lock && slot.type == type)
+ {
+ switch (type) {
+ case MTR_MEMO_X_LOCK:
+ ut_ad(lock.have_x());
+ break;
+ case MTR_MEMO_SX_LOCK:
+ ut_ad(lock.have_u_or_x());
+ break;
+ case MTR_MEMO_S_LOCK:
+ ut_ad(lock.have_s());
+ break;
+ default:
+ break;
+ }
+ return true;
+ }
+ }
- const void*const m_ptr;
- const ulint m_flags;
-};
+ return false;
+}
/** Check if memo contains the given item.
@param object object to search
@param flags specify types of object (can be ORred) of
MTR_MEMO_PAGE_S_FIX ... values
@return true if contains */
-bool
-mtr_t::memo_contains_flagged(const void* ptr, ulint flags) const
+bool mtr_t::memo_contains_flagged(const void *object, ulint flags) const
{
- ut_ad(is_active());
+ ut_ad(is_active());
+ ut_ad(flags);
+ /* Look for rw-lock-related and page-related flags. */
+ ut_ad(!(flags & ulint(~(MTR_MEMO_PAGE_S_FIX | MTR_MEMO_PAGE_X_FIX |
+ MTR_MEMO_PAGE_SX_FIX | MTR_MEMO_BUF_FIX |
+ MTR_MEMO_MODIFY | MTR_MEMO_X_LOCK |
+ MTR_MEMO_SX_LOCK | MTR_MEMO_S_LOCK))));
+ /* Either some rw-lock-related or page-related flags
+ must be specified, but not both at the same time. */
+ ut_ad(!(flags & (MTR_MEMO_PAGE_S_FIX | MTR_MEMO_PAGE_X_FIX |
+ MTR_MEMO_PAGE_SX_FIX | MTR_MEMO_BUF_FIX |
+ MTR_MEMO_MODIFY)) ==
+ !!(flags & (MTR_MEMO_X_LOCK | MTR_MEMO_SX_LOCK | MTR_MEMO_S_LOCK)));
+
+ for (const mtr_memo_slot_t &slot : m_memo)
+ {
+ if (object != slot.object)
+ continue;
+
+ auto f = flags & slot.type;
+ if (!f)
+ continue;
- return !m_memo.for_each_block_in_reverse(
- CIterate<FlaggedCheck>(FlaggedCheck(ptr, flags)));
+ if (f & (MTR_MEMO_PAGE_S_FIX | MTR_MEMO_PAGE_SX_FIX | MTR_MEMO_PAGE_X_FIX))
+ {
+ const block_lock &lock= static_cast<const buf_page_t*>(object)->lock;
+ ut_ad(!(f & MTR_MEMO_PAGE_S_FIX) || lock.have_s());
+ ut_ad(!(f & MTR_MEMO_PAGE_SX_FIX) || lock.have_u_or_x());
+ ut_ad(!(f & MTR_MEMO_PAGE_X_FIX) || lock.have_x());
+ }
+ else
+ {
+ const index_lock &lock= *static_cast<const index_lock*>(object);
+ ut_ad(!(f & MTR_MEMO_S_LOCK) || lock.have_s());
+ ut_ad(!(f & MTR_MEMO_SX_LOCK) || lock.have_u_or_x());
+ ut_ad(!(f & MTR_MEMO_X_LOCK) || lock.have_x());
+ }
+
+ return true;
+ }
+
+ return false;
}
-/** Check if memo contains the given page.
-@param[in] ptr pointer to within buffer frame
-@param[in] flags specify types of object with OR of
- MTR_MEMO_PAGE_S_FIX... values
-@return the block
-@retval NULL if not found */
-buf_block_t*
-mtr_t::memo_contains_page_flagged(
- const byte* ptr,
- ulint flags) const
+buf_block_t* mtr_t::memo_contains_page_flagged(const byte *ptr, ulint flags)
+ const
{
- Iterate<FindPage> iteration(FindPage(ptr, flags));
- return m_memo.for_each_block_in_reverse(iteration)
- ? NULL : iteration.functor.get_block();
+ ptr= page_align(ptr);
+
+ for (const mtr_memo_slot_t &slot : m_memo)
+ {
+ ut_ad(slot.object);
+ if (!(flags & slot.type))
+ continue;
+
+ buf_page_t *bpage= static_cast<buf_page_t*>(slot.object);
+
+ if (ptr != bpage->frame)
+ continue;
+
+ ut_ad(!(slot.type & MTR_MEMO_PAGE_S_FIX) || bpage->lock.have_s());
+ ut_ad(!(slot.type & MTR_MEMO_PAGE_SX_FIX) || bpage->lock.have_u_or_x());
+ ut_ad(!(slot.type & MTR_MEMO_PAGE_X_FIX) || bpage->lock.have_x());
+ return static_cast<buf_block_t*>(slot.object);
+ }
+
+ return nullptr;
}
#endif /* UNIV_DEBUG */
-/** Find a block, preferrably in MTR_MEMO_MODIFY state */
-struct FindModified
+/** Mark the given latched page as modified.
+@param block page that will be modified */
+void mtr_t::set_modified(const buf_block_t &block)
{
- mtr_memo_slot_t *found= nullptr;
- const buf_block_t& block;
+ if (block.page.id().space() >= SRV_TMP_SPACE_ID)
+ {
+ const_cast<buf_block_t&>(block).page.set_temp_modified();
+ return;
+ }
+
+ m_modifications= true;
+
+ if (UNIV_UNLIKELY(m_log_mode == MTR_LOG_NONE))
+ return;
- FindModified(const buf_block_t &block) : block(block) {}
- bool operator()(mtr_memo_slot_t *slot)
+ for (mtr_memo_slot_t &slot : m_memo)
{
- if (slot->object != &block)
- return true;
- found= slot;
- return !(slot->type & (MTR_MEMO_MODIFY |
- MTR_MEMO_PAGE_X_FIX | MTR_MEMO_PAGE_SX_FIX));
+ if (slot.object == &block &&
+ slot.type & (MTR_MEMO_PAGE_X_FIX | MTR_MEMO_PAGE_SX_FIX))
+ {
+ if (slot.type & MTR_MEMO_MODIFY)
+ ut_ad(m_made_dirty || block.page.oldest_modification() > 1);
+ else
+ {
+ slot.type= static_cast<mtr_memo_type_t>(slot.type | MTR_MEMO_MODIFY);
+ if (!m_made_dirty)
+ m_made_dirty= block.page.oldest_modification() <= 1;
+ }
+ return;
+ }
}
-};
-/** Mark the given latched page as modified.
-@param block page that will be modified */
-void mtr_t::modify(const buf_block_t &block)
+ /* This must be PageConverter::update_page() in IMPORT TABLESPACE. */
+ ut_ad(m_memo.empty());
+ ut_ad(!block.page.in_LRU_list);
+}
+
+void mtr_t::init(buf_block_t *b)
{
- if (UNIV_UNLIKELY(m_memo.empty()))
+ const page_id_t id{b->page.id()};
+ ut_ad(is_named_space(id.space()));
+ ut_ad(!m_freed_pages == !m_freed_space);
+ ut_ad(memo_contains_flagged(b, MTR_MEMO_PAGE_X_FIX));
+
+ if (id.space() >= SRV_TMP_SPACE_ID)
+ b->page.set_temp_modified();
+ else
{
- /* This must be PageConverter::update_page() in IMPORT TABLESPACE. */
- ut_ad(!block.page.in_LRU_list);
- return;
+ for (mtr_memo_slot_t &slot : m_memo)
+ {
+ if (slot.object == b && slot.type & MTR_MEMO_PAGE_X_FIX)
+ {
+ slot.type= MTR_MEMO_PAGE_X_MODIFY;
+ m_modifications= true;
+ if (!m_made_dirty)
+ m_made_dirty= b->page.oldest_modification() <= 1;
+ goto found;
+ }
+ }
+ ut_ad("block not X-latched" == 0);
}
- Iterate<FindModified> iteration((FindModified(block)));
- if (UNIV_UNLIKELY(m_memo.for_each_block(iteration)))
+ found:
+ if (UNIV_LIKELY_NULL(m_freed_space) &&
+ m_freed_space->id == id.space() &&
+ m_freed_pages->remove_if_exists(id.page_no()) &&
+ m_freed_pages->empty())
{
- ut_ad("modifying an unlatched page" == 0);
+ delete m_freed_pages;
+ m_freed_pages= nullptr;
+ m_freed_space= nullptr;
+ }
+
+ b->page.set_reinit(b->page.state() & buf_page_t::LRU_MASK);
+
+ if (!is_logged())
return;
+
+ m_log.close(log_write<INIT_PAGE>(id, &b->page));
+ m_last_offset= FIL_PAGE_TYPE;
+}
+
+/** Free a page.
+@param space tablespace
+@param offset offset of the page to be freed */
+void mtr_t::free(const fil_space_t &space, uint32_t offset)
+{
+ ut_ad(is_named_space(&space));
+ ut_ad(!m_freed_space || m_freed_space == &space);
+
+ if (is_logged())
+ {
+ buf_block_t *freed= nullptr;
+ const page_id_t id{space.id, offset};
+
+ for (auto it= m_memo.end(); it != m_memo.begin(); )
+ {
+ it--;
+ next:
+ mtr_memo_slot_t &slot= *it;
+ buf_block_t *block= static_cast<buf_block_t*>(slot.object);
+ ut_ad(block);
+ if (block == freed)
+ {
+ if (slot.type & (MTR_MEMO_PAGE_SX_FIX | MTR_MEMO_PAGE_X_FIX))
+ slot.type= MTR_MEMO_PAGE_X_FIX;
+ else
+ {
+ ut_ad(slot.type == MTR_MEMO_BUF_FIX);
+ block->page.unfix();
+ m_memo.erase(it, it + 1);
+ goto next;
+ }
+ }
+ else if (slot.type & (MTR_MEMO_PAGE_X_FIX | MTR_MEMO_PAGE_SX_FIX) &&
+ block->page.id() == id)
+ {
+ ut_ad(!block->page.is_freed());
+ ut_ad(!freed);
+ freed= block;
+ if (!(slot.type & MTR_MEMO_PAGE_X_FIX))
+ {
+ ut_d(bool upgraded=) block->page.lock.x_lock_upgraded();
+ ut_ad(upgraded);
+ }
+ if (id.space() >= SRV_TMP_SPACE_ID)
+ {
+ block->page.set_temp_modified();
+ slot.type= MTR_MEMO_PAGE_X_FIX;
+ }
+ else
+ {
+ slot.type= MTR_MEMO_PAGE_X_MODIFY;
+ if (!m_made_dirty)
+ m_made_dirty= block->page.oldest_modification() <= 1;
+ }
+#ifdef BTR_CUR_HASH_ADAPT
+ if (block->index)
+ btr_search_drop_page_hash_index(block, false);
+#endif /* BTR_CUR_HASH_ADAPT */
+ block->page.set_freed(block->page.state());
+ }
+ }
+
+ m_log.close(log_write<FREE_PAGE>(id, nullptr));
}
- iteration.functor.found->type= static_cast<mtr_memo_type_t>
- (iteration.functor.found->type | MTR_MEMO_MODIFY);
- if (is_block_dirtied(&block))
- m_made_dirty= true;
+}
+
+void small_vector_base::grow_by_1(void *small, size_t element_size)
+{
+ const size_t cap= Capacity*= 2, s= cap * element_size;
+ void *new_begin;
+ if (BeginX == small)
+ {
+ new_begin= my_malloc(PSI_NOT_INSTRUMENTED, s, MYF(0));
+ memcpy(new_begin, BeginX, size() * element_size);
+ TRASH_FREE(small, size() * element_size);
+ }
+ else
+ new_begin= my_realloc(PSI_NOT_INSTRUMENTED, BeginX, s, MYF(0));
+
+ BeginX= new_begin;
}
diff --git a/storage/innobase/os/os0file.cc b/storage/innobase/os/os0file.cc
index 30b2b31abb2..d366c784b96 100644
--- a/storage/innobase/os/os0file.cc
+++ b/storage/innobase/os/os0file.cc
@@ -36,12 +36,14 @@ Created 10/21/1995 Heikki Tuuri
#ifndef UNIV_INNOCHECKSUM
#include "os0file.h"
#include "sql_const.h"
+#include "log.h"
#ifdef __linux__
# include <sys/types.h>
# include <sys/stat.h>
#endif
+#include "srv0mon.h"
#include "srv0srv.h"
#include "srv0start.h"
#include "fil0fil.h"
@@ -49,10 +51,8 @@ Created 10/21/1995 Heikki Tuuri
#ifdef HAVE_LINUX_UNISTD_H
#include "unistd.h"
#endif
-#include "os0event.h"
-#include "os0thread.h"
+#include "buf0dblwr.h"
-#include <vector>
#include <tpool_structs.h>
#ifdef LINUX_NATIVE_AIO
@@ -73,12 +73,10 @@ Created 10/21/1995 Heikki Tuuri
#ifdef _WIN32
#include <winioctl.h>
-#else
-// my_test_if_atomic_write()
-#include <my_sys.h>
#endif
-#include "buf0dblwr.h"
+// my_test_if_atomic_write() , my_win_secattr()
+#include <my_sys.h>
#include <thread>
#include <chrono>
@@ -133,6 +131,11 @@ public:
{
wait();
}
+
+ mysql_mutex_t& mutex()
+ {
+ return m_cache.mutex();
+ }
};
static io_slots *read_slots;
@@ -155,8 +158,8 @@ static ulint os_innodb_umask = 0;
Atomic_counter<ulint> os_n_file_reads;
static ulint os_bytes_read_since_printout;
-ulint os_n_file_writes;
-ulint os_n_fsyncs;
+Atomic_counter<size_t> os_n_file_writes;
+Atomic_counter<size_t> os_n_fsyncs;
static ulint os_n_file_reads_old;
static ulint os_n_file_writes_old;
static ulint os_n_fsyncs_old;
@@ -180,7 +183,7 @@ mysql_pfs_key_t innodb_temp_file_key;
@param[in] should_abort whether to abort on an unknown error
@param[in] on_error_silent whether to suppress reports of non-fatal errors
@return true if we should retry the operation */
-static MY_ATTRIBUTE((warn_unused_result))
+static
bool
os_file_handle_error_cond_exit(
const char* name,
@@ -341,7 +344,7 @@ int os_file_lock(int fd, const char *name)
ib::info()
<< "Check that you do not already have"
- " another mysqld process using the"
+ " another mariadbd process using the"
" same InnoDB data or log files.";
}
@@ -402,50 +405,6 @@ os_file_read_string(
}
}
-/** This function returns a new path name after replacing the basename
-in an old path with a new basename. The old_path is a full path
-name including the extension. The tablename is in the normal
-form "databasename/tablename". The new base name is found after
-the forward slash. Both input strings are null terminated.
-
-This function allocates memory to be returned. It is the callers
-responsibility to free the return value after it is no longer needed.
-
-@param[in] old_path Pathname
-@param[in] tablename Contains new base name
-@return own: new full pathname */
-char*
-os_file_make_new_pathname(
- const char* old_path,
- const char* tablename)
-{
- ulint dir_len;
- char* last_slash;
- char* base_name;
- char* new_path;
- ulint new_path_len;
-
- /* Split the tablename into its database and table name components.
- They are separated by a '/'. */
- last_slash = strrchr((char*) tablename, '/');
- base_name = last_slash ? last_slash + 1 : (char*) tablename;
-
- /* Find the offset of the last slash. We will strip off the
- old basename.ibd which starts after that slash. */
- last_slash = strrchr((char*) old_path, OS_PATH_SEPARATOR);
- dir_len = last_slash ? ulint(last_slash - old_path) : strlen(old_path);
-
- /* allocate a new path and move the old directory path to it. */
- new_path_len = dir_len + strlen(base_name) + sizeof "/.ibd";
- new_path = static_cast<char*>(ut_malloc_nokey(new_path_len));
- memcpy(new_path, old_path, dir_len);
-
- snprintf(new_path + dir_len, new_path_len - dir_len,
- "%c%s.ibd", OS_PATH_SEPARATOR, base_name);
-
- return(new_path);
-}
-
/** This function reduces a null-terminated full remote path name into
the path that is sent by MySQL for DATA DIRECTORY clause. It replaces
the 'databasename/tablename.ibd' found at the end of the path with just
@@ -463,7 +422,7 @@ os_file_make_data_dir_path(
char* data_dir_path)
{
/* Replace the period before the extension with a null byte. */
- char* ptr = strrchr((char*) data_dir_path, '.');
+ char* ptr = strrchr(data_dir_path, '.');
if (ptr == NULL) {
return;
@@ -472,7 +431,8 @@ os_file_make_data_dir_path(
ptr[0] = '\0';
/* The tablename starts after the last slash. */
- ptr = strrchr((char*) data_dir_path, OS_PATH_SEPARATOR);
+ ptr = strrchr(data_dir_path, '/');
+
if (ptr == NULL) {
return;
@@ -483,7 +443,14 @@ os_file_make_data_dir_path(
char* tablename = ptr + 1;
/* The databasename starts after the next to last slash. */
- ptr = strrchr((char*) data_dir_path, OS_PATH_SEPARATOR);
+ ptr = strrchr(data_dir_path, '/');
+#ifdef _WIN32
+ if (char *aptr = strrchr(data_dir_path, '\\')) {
+ if (aptr > ptr) {
+ ptr = aptr;
+ }
+ }
+#endif
if (ptr == NULL) {
return;
@@ -530,10 +497,16 @@ char*
os_file_get_parent_dir(
const char* path)
{
- bool has_trailing_slash = false;
-
/* Find the offset of the last slash */
- const char* last_slash = strrchr(path, OS_PATH_SEPARATOR);
+ const char* last_slash = strrchr(path, '/');
+
+#ifdef _WIN32
+ if (const char *last = strrchr(path, '\\')) {
+ if (last > last_slash) {
+ last_slash = last;
+ }
+ }
+#endif
if (!last_slash) {
/* No slash in the path, return NULL */
@@ -541,13 +514,11 @@ os_file_get_parent_dir(
}
/* Ok, there is a slash. Is there anything after it? */
- if (static_cast<size_t>(last_slash - path + 1) == strlen(path)) {
- has_trailing_slash = true;
- }
+ const bool has_trailing_slash = last_slash[1] == '\0';
- /* Reduce repetative slashes. */
+ /* Reduce repetitive slashes. */
while (last_slash > path
- && last_slash[-1] == OS_PATH_SEPARATOR) {
+ && (IF_WIN(last_slash[-1] == '\\' ||,) last_slash[-1] == '/')) {
last_slash--;
}
@@ -562,13 +533,15 @@ os_file_get_parent_dir(
/* Back up to the previous slash. */
last_slash--;
while (last_slash > path
- && last_slash[0] != OS_PATH_SEPARATOR) {
+ && (IF_WIN(last_slash[0] != '\\' &&,)
+ last_slash[0] != '/')) {
last_slash--;
}
- /* Reduce repetative slashes. */
+ /* Reduce repetitive slashes. */
while (last_slash > path
- && last_slash[-1] == OS_PATH_SEPARATOR) {
+ && (IF_WIN(last_slash[-1] == '\\' ||,)
+ last_slash[-1] == '/')) {
last_slash--;
}
}
@@ -600,11 +573,6 @@ test_os_file_get_parent_dir(
char* expected = expected_dir == NULL ? NULL
: mem_strdup(expected_dir);
- /* os_file_get_parent_dir() assumes that separators are
- converted to OS_PATH_SEPARATOR. */
- os_normalize_path(child);
- os_normalize_path(expected);
-
char* parent = os_file_get_parent_dir(child);
bool unexpected = (expected == NULL
@@ -798,23 +766,13 @@ ulint os_file_get_last_error(bool report_all_errors, bool on_error_silent)
<< " in a file operation.";
if (err == ENOENT) {
-
ib::error()
<< "The error means the system"
" cannot find the path specified.";
-
- if (srv_is_being_started) {
-
- ib::error()
- << "If you are installing InnoDB,"
- " remember that you must create"
- " directories yourself, InnoDB"
- " does not create them.";
- }
} else if (err == EACCES) {
ib::error()
- << "The error means mysqld does not have"
+ << "The error means mariadbd does not have"
" the access rights to the directory.";
} else {
@@ -1091,12 +1049,8 @@ os_file_create_simple_func(
/* This function is always called for data files, we should disable
OS caching (O_DIRECT) here as we do in os_file_create_func(), so
we open the same file in the same mode, see man page of open(2). */
- if (!srv_read_only_mode
- && *success
- && (srv_file_flush_method == SRV_O_DIRECT
- || srv_file_flush_method == SRV_O_DIRECT_NO_FSYNC)) {
-
- os_file_set_nocache(file, name, mode_str);
+ if (!srv_read_only_mode && *success) {
+ os_file_set_nocache(file, name, mode_str);
}
#ifndef _WIN32
@@ -1274,11 +1228,8 @@ os_file_create_func(
if (!read_only
&& *success
&& type != OS_LOG_FILE
- && type != OS_DATA_FILE_NO_O_DIRECT
- && (srv_file_flush_method == SRV_O_DIRECT
- || srv_file_flush_method == SRV_O_DIRECT_NO_FSYNC)) {
-
- os_file_set_nocache(file, name, mode_str);
+ && type != OS_DATA_FILE_NO_O_DIRECT) {
+ os_file_set_nocache(file, name, mode_str);
}
#ifndef _WIN32
@@ -1294,7 +1245,8 @@ os_file_create_func(
<< "Retrying to lock the first data file";
for (int i = 0; i < 100; i++) {
- os_thread_sleep(1000000);
+ std::this_thread::sleep_for(
+ std::chrono::seconds(1));
if (!os_file_lock(file, name)) {
*success = true;
@@ -1815,18 +1767,10 @@ ulint os_file_get_last_error(bool report_all_errors, bool on_error_silent)
<< "The error means the system"
" cannot find the path specified.";
- if (srv_is_being_started) {
- ib::error()
- << "If you are installing InnoDB,"
- " remember that you must create"
- " directories yourself, InnoDB"
- " does not create them.";
- }
-
} else if (err == ERROR_ACCESS_DENIED) {
ib::error()
- << "The error means mysqld does not have"
+ << "The error means mariadbd does not have"
" the access rights to"
" the directory. It may also be"
" you have created a subdirectory"
@@ -1840,7 +1784,7 @@ ulint os_file_get_last_error(bool report_all_errors, bool on_error_silent)
" is using InnoDB's files."
" This might be a backup or antivirus"
" software or another instance"
- " of MySQL."
+ " of MariaDB."
" Please close it to get rid of this error.";
} else if (err == ERROR_WORKING_SET_QUOTA
@@ -1859,6 +1803,10 @@ ulint os_file_get_last_error(bool report_all_errors, bool on_error_silent)
" because of either a thread exit"
" or an application request."
" Retry attempt is made.";
+ } else if (err == ERROR_PATH_NOT_FOUND) {
+ ib::error()
+ << "This error means that directory did not exist"
+ " during file creation.";
} else {
ib::info() << OPERATING_SYSTEM_ERROR_MSG;
@@ -1991,7 +1939,7 @@ os_file_create_simple_func(
file = CreateFile(
(LPCTSTR) name, access,
FILE_SHARE_READ | FILE_SHARE_DELETE,
- NULL, create_flag, attributes, NULL);
+ my_win_file_secattr(), create_flag, attributes, NULL);
if (file == INVALID_HANDLE_VALUE) {
@@ -2247,7 +2195,7 @@ os_file_create_func(
/* Use default security attributes and no template file. */
file = CreateFile(
- name, access, share_mode, NULL,
+ name, access, share_mode, my_win_file_secattr(),
create_flag, attributes, NULL);
/* If FILE_FLAG_NO_BUFFERING was set, check if this can work at all,
@@ -2301,6 +2249,7 @@ A simple function to open or create a file.
@param[out] success true if succeeded
@return own: handle to the file, not defined if error, error number
can be retrieved with os_file_get_last_error */
+
pfs_os_file_t
os_file_create_simple_no_error_handling_func(
const char* name,
@@ -2382,7 +2331,7 @@ os_file_create_simple_no_error_handling_func(
file = CreateFile((LPCTSTR) name,
access,
share_mode,
- NULL, // Security attributes
+ my_win_file_secattr(),
create_flag,
attributes,
NULL); // No template file
@@ -2441,8 +2390,7 @@ os_file_delete_if_exists_func(
ib::warn() << "Delete of file '" << name << "' failed.";
}
- /* Sleep for a second */
- os_thread_sleep(1000000);
+ std::this_thread::sleep_for(std::chrono::seconds(1));
if (count > 2000) {
@@ -2490,8 +2438,7 @@ os_file_delete_func(
<< "another program accessing it?";
}
- /* sleep for a second */
- os_thread_sleep(1000000);
+ std::this_thread::sleep_for(std::chrono::seconds(1));
if (count > 2000) {
@@ -2528,7 +2475,7 @@ os_file_rename_func(
ut_ad(exists);
#endif /* UNIV_DEBUG */
- if (MoveFile((LPCTSTR) oldpath, (LPCTSTR) newpath)) {
+ if (MoveFileEx(oldpath, newpath, MOVEFILE_REPLACE_EXISTING)) {
return(true);
}
@@ -2672,7 +2619,7 @@ os_file_get_status_win32(
access,
FILE_SHARE_READ | FILE_SHARE_WRITE
| FILE_SHARE_DELETE, // Full sharing
- NULL, // Default security
+ my_win_file_secattr(),
OPEN_EXISTING, // Existing file only
FILE_ATTRIBUTE_NORMAL, // Normal file
NULL); // No attr. template
@@ -2818,15 +2765,15 @@ os_file_io(
bytes_returned += n_bytes;
if (type.type != IORequest::READ_MAYBE_PARTIAL) {
- const char* op = type.is_read()
- ? "read" : "written";
-
- ib::warn()
- << n
- << " bytes should have been " << op << ". Only "
- << bytes_returned
- << " bytes " << op << ". Retrying"
- << " for the remaining bytes.";
+ sql_print_warning("InnoDB: %zu bytes should have been"
+ " %s at %llu from %s,"
+ " but got only %zd."
+ " Retrying.",
+ n, type.is_read()
+ ? "read" : "written", offset,
+ type.node
+ ? type.node->name
+ : "(unknown file)", bytes_returned);
}
/* Advance the offset and buffer by n_bytes */
@@ -2967,52 +2914,38 @@ os_file_pread(
@param[in] offset file offset from the start where to read
@param[in] n number of bytes to read, starting from offset
@param[out] o number of bytes actually read
-@param[in] exit_on_err if true then exit on error
@return DB_SUCCESS or error code */
-static MY_ATTRIBUTE((warn_unused_result))
dberr_t
-os_file_read_page(
+os_file_read_func(
const IORequest& type,
os_file_t file,
void* buf,
os_offset_t offset,
ulint n,
- ulint* o,
- bool exit_on_err)
+ ulint* o)
{
- dberr_t err;
+ ut_ad(!type.node || type.node->handle == file);
+ ut_ad(n);
- os_bytes_read_since_printout += n;
+ os_bytes_read_since_printout+= n;
- ut_ad(n > 0);
+ dberr_t err;
+ ssize_t n_bytes= os_file_pread(type, file, buf, n, offset, &err);
- ssize_t n_bytes = os_file_pread(type, file, buf, n, offset, &err);
+ if (o)
+ *o= ulint(n_bytes);
- if (o) {
- *o = n_bytes;
- }
+ if (ulint(n_bytes) == n || err != DB_SUCCESS)
+ return err;
- if (ulint(n_bytes) == n || (err != DB_SUCCESS && !exit_on_err)) {
- return err;
- }
- int os_err = IF_WIN((int)GetLastError(), errno);
-
- if (!os_file_handle_error_cond_exit(
- NULL, "read", exit_on_err, false)) {
- ib::fatal()
- << "Tried to read " << n << " bytes at offset "
- << offset << ", but was only able to read " << n_bytes
- << ".Cannot read from file. OS error number "
- << os_err << ".";
- } else {
- ib::error() << "Tried to read " << n << " bytes at offset "
- << offset << ", but was only able to read " << n_bytes;
- }
- if (err == DB_SUCCESS) {
- err = DB_IO_ERROR;
- }
+ os_file_handle_error_cond_exit(type.node ? type.node->name : nullptr, "read",
+ false, false);
+ sql_print_error("InnoDB: Tried to read %zu bytes at offset %llu"
+ " of file %s, but was only able to read %zd",
+ n, offset, type.node ? type.node->name : "(unknown)",
+ n_bytes);
- return err;
+ return err ? err : DB_IO_ERROR;
}
/** Handle errors for file operations.
@@ -3072,13 +3005,13 @@ os_file_handle_error_cond_exit(
case OS_FILE_SHARING_VIOLATION:
- os_thread_sleep(10000000); /* 10 sec */
+ std::this_thread::sleep_for(std::chrono::seconds(10));
return(true);
case OS_FILE_OPERATION_ABORTED:
case OS_FILE_INSUFFICIENT_RESOURCE:
- os_thread_sleep(100000); /* 100 ms */
+ std::this_thread::sleep_for(std::chrono::milliseconds(100));
return(true);
default:
@@ -3116,6 +3049,15 @@ os_file_set_nocache(
const char* file_name MY_ATTRIBUTE((unused)),
const char* operation_name MY_ATTRIBUTE((unused)))
{
+ const auto innodb_flush_method = srv_file_flush_method;
+ switch (innodb_flush_method) {
+ case SRV_O_DIRECT:
+ case SRV_O_DIRECT_NO_FSYNC:
+ break;
+ default:
+ return;
+ }
+
/* some versions of Solaris may not have DIRECTIO_ON */
#if defined(__sun__) && defined(DIRECTIO_ON)
if (directio(fd, DIRECTIO_ON) == -1) {
@@ -3134,23 +3076,11 @@ os_file_set_nocache(
if (errno_save == EINVAL) {
if (!warning_message_printed) {
warning_message_printed = true;
-# ifdef __linux__
- ib::warn()
- << "Failed to set O_DIRECT on file"
- << file_name << "; " << operation_name
- << ": " << strerror(errno_save) << ", "
- "continuing anyway. O_DIRECT is "
- "known to result in 'Invalid argument' "
- "on Linux on tmpfs, "
- "see MySQL Bug#26662.";
-# else /* __linux__ */
- goto short_warning;
-# endif /* __linux__ */
+ ib::info()
+ << "Setting O_DIRECT on file "
+ << file_name << " failed";
}
} else {
-# ifndef __linux__
-short_warning:
-# endif
ib::warn()
<< "Failed to set O_DIRECT on file "
<< file_name << "; " << operation_name
@@ -3166,7 +3096,7 @@ short_warning:
/** Check if the file system supports sparse files.
@param fh file handle
@return true if the file system supports sparse files */
-IF_WIN(static,) bool os_is_sparse_file_supported(os_file_t fh)
+static bool os_is_sparse_file_supported(os_file_t fh)
{
#ifdef _WIN32
FILE_ATTRIBUTE_TAG_INFO info;
@@ -3361,51 +3291,6 @@ os_file_truncate(
#endif /* _WIN32 */
}
-/** NOTE! Use the corresponding macro os_file_read(), not directly this
-function!
-Requests a synchronous positioned read operation.
-@return DB_SUCCESS if request was successful, DB_IO_ERROR on failure
-@param[in] type IO flags
-@param[in] file handle to an open file
-@param[out] buf buffer where to read
-@param[in] offset file offset from the start where to read
-@param[in] n number of bytes to read, starting from offset
-@return error code
-@retval DB_SUCCESS if the operation succeeded */
-dberr_t
-os_file_read_func(
- const IORequest& type,
- os_file_t file,
- void* buf,
- os_offset_t offset,
- ulint n)
-{
- return(os_file_read_page(type, file, buf, offset, n, NULL, true));
-}
-
-/** NOTE! Use the corresponding macro os_file_read_no_error_handling(),
-not directly this function!
-Requests a synchronous positioned read operation.
-@return DB_SUCCESS if request was successful, DB_IO_ERROR on failure
-@param[in] type IO flags
-@param[in] file handle to an open file
-@param[out] buf buffer where to read
-@param[in] offset file offset from the start where to read
-@param[in] n number of bytes to read, starting from offset
-@param[out] o number of bytes actually read
-@return DB_SUCCESS or error code */
-dberr_t
-os_file_read_no_error_handling_func(
- const IORequest& type,
- os_file_t file,
- void* buf,
- os_offset_t offset,
- ulint n,
- ulint* o)
-{
- return(os_file_read_page(type, file, buf, offset, n, o, false));
-}
-
/** Check the existence and type of the given file.
@param[in] path path name of file
@param[out] exists true if the file exists
@@ -3458,24 +3343,23 @@ dberr_t IORequest::punch_hole(os_offset_t off, ulint len) const
/* Check does file system support punching holes for this
tablespace. */
- if (!node->space->punch_hole) {
+ if (!node->punch_hole) {
return DB_IO_NO_PUNCH_HOLE;
}
dberr_t err = os_file_punch_hole(node->handle, off, trim_len);
- if (err == DB_SUCCESS) {
+ switch (err) {
+ case DB_SUCCESS:
srv_stats.page_compressed_trim_op.inc();
- } else {
- /* If punch hole is not supported,
- set space so that it is not used. */
- if (err == DB_IO_NO_PUNCH_HOLE) {
- node->space->punch_hole = false;
- err = DB_SUCCESS;
- }
+ return err;
+ case DB_IO_NO_PUNCH_HOLE:
+ node->punch_hole = false;
+ err = DB_SUCCESS;
+ /* fall through */
+ default:
+ return err;
}
-
- return (err);
}
/*
@@ -3550,10 +3434,10 @@ os_file_get_status(
extern void fil_aio_callback(const IORequest &request);
-static void io_callback(tpool::aiocb* cb)
+static void io_callback(tpool::aiocb *cb)
{
- const IORequest request(*static_cast<const IORequest*>
- (static_cast<const void*>(cb->m_userdata)));
+ const IORequest &request= *static_cast<const IORequest*>
+ (static_cast<const void*>(cb->m_userdata));
if (cb->m_err != DB_SUCCESS)
{
ib::fatal() << "IO Error: " << cb->m_err << " during " <<
@@ -3567,15 +3451,16 @@ static void io_callback(tpool::aiocb* cb)
if (cb->m_opcode == tpool::aio_opcode::AIO_PREAD)
{
ut_ad(read_slots->contains(cb));
+ fil_aio_callback(request);
read_slots->release(cb);
}
else
{
ut_ad(write_slots->contains(cb));
+ const IORequest req{request};
write_slots->release(cb);
+ fil_aio_callback(req);
}
-
- fil_aio_callback(request);
}
#ifdef LINUX_NATIVE_AIO
@@ -3730,6 +3615,17 @@ disable:
}
#endif
+#ifdef HAVE_URING
+ if (ret)
+ {
+ ut_ad(srv_use_native_aio);
+ ib::warn()
+ << "liburing disabled: falling back to innodb_use_native_aio=OFF";
+ srv_use_native_aio= false;
+ ret= srv_thread_pool->configure_aio(false, max_events);
+ }
+#endif
+
if (!ret)
{
read_slots= new io_slots(max_read_events, srv_n_read_io_threads);
@@ -3769,7 +3665,31 @@ void os_aio_wait_until_no_pending_writes()
buf_dblwr.wait_flush_buffered_writes();
}
-/** Wait until there are no pending asynchronous reads. */
+/** @return number of pending reads */
+size_t os_aio_pending_reads()
+{
+ mysql_mutex_lock(&read_slots->mutex());
+ size_t pending= read_slots->pending_io_count();
+ mysql_mutex_unlock(&read_slots->mutex());
+ return pending;
+}
+
+/** @return approximate number of pending reads */
+size_t os_aio_pending_reads_approx()
+{
+ return read_slots->pending_io_count();
+}
+
+/** @return number of pending writes */
+size_t os_aio_pending_writes()
+{
+ mysql_mutex_lock(&write_slots->mutex());
+ size_t pending= write_slots->pending_io_count();
+ mysql_mutex_unlock(&write_slots->mutex());
+ return pending;
+}
+
+/** Wait until all pending asynchronous reads have completed. */
void os_aio_wait_until_no_pending_reads()
{
const auto notify_wait= read_slots->pending_io_count();
@@ -3816,7 +3736,7 @@ dberr_t os_aio(const IORequest &type, void *buf, os_offset_t offset, size_t n)
if (!type.is_async()) {
err = type.is_read()
? os_file_read_func(type, type.node->handle,
- buf, offset, n)
+ buf, offset, n, nullptr)
: os_file_write_func(type, type.node->name,
type.node->handle,
buf, offset, n);
@@ -3875,14 +3795,12 @@ os_aio_print(FILE* file)
fprintf(file,
"Pending flushes (fsync) log: " ULINTPF
"; buffer pool: " ULINTPF "\n"
- ULINTPF " OS file reads, "
- ULINTPF " OS file writes, "
- ULINTPF " OS fsyncs\n",
+ ULINTPF " OS file reads, %zu OS file writes, %zu OS fsyncs\n",
log_sys.get_pending_flushes(),
ulint{fil_n_pending_tablespace_flushes},
ulint{os_n_file_reads},
- os_n_file_writes,
- os_n_fsyncs);
+ static_cast<size_t>(os_n_file_writes),
+ static_cast<size_t>(os_n_fsyncs));
const ulint n_reads = ulint(MONITOR_VALUE(MONITOR_OS_PENDING_READS));
const ulint n_writes = ulint(MONITOR_VALUE(MONITOR_OS_PENDING_WRITES));
@@ -4095,212 +4013,167 @@ static bool is_file_on_ssd(char *file_path)
#endif
-/** Determine some file metadata when creating or reading the file.
-@param file the file that is being created, or OS_FILE_CLOSED */
void fil_node_t::find_metadata(os_file_t file
#ifndef _WIN32
- , struct stat* statbuf
+ , bool create, struct stat *statbuf
#endif
- )
+ )
{
- if (file == OS_FILE_CLOSED) {
- file = handle;
- ut_ad(is_open());
- }
+ if (!is_open())
+ {
+ handle= file;
+ ut_ad(is_open());
+ }
-#ifdef _WIN32 /* FIXME: make this unconditional */
- if (space->punch_hole) {
- space->punch_hole = os_is_sparse_file_supported(file);
- }
-#endif
+ if (!space->is_compressed())
+ punch_hole= 0;
+ else if (my_test_if_thinly_provisioned(file))
+ punch_hole= 2;
+ else
+ punch_hole= IF_WIN(, !create ||) os_is_sparse_file_supported(file);
- /*
- For the temporary tablespace and during the
- non-redo-logged adjustments in
- IMPORT TABLESPACE, we do not care about
- the atomicity of writes.
-
- Atomic writes is supported if the file can be used
- with atomic_writes (not log file), O_DIRECT is
- used (tested in ha_innodb.cc) and the file is
- device and file system that supports atomic writes
- for the given block size.
- */
- space->atomic_write_supported = space->purpose == FIL_TYPE_TEMPORARY
- || space->purpose == FIL_TYPE_IMPORT;
#ifdef _WIN32
- on_ssd = is_file_on_ssd(name);
- FILE_STORAGE_INFO info;
- if (GetFileInformationByHandleEx(
- file, FileStorageInfo, &info, sizeof(info))) {
- block_size = info.PhysicalBytesPerSectorForAtomicity;
- } else {
- block_size = 512;
- }
+ on_ssd= is_file_on_ssd(name);
+ FILE_STORAGE_INFO info;
+ if (GetFileInformationByHandleEx(file, FileStorageInfo, &info, sizeof info))
+ block_size= info.PhysicalBytesPerSectorForAtomicity;
+ else
+ block_size= 512;
#else
- struct stat sbuf;
- if (!statbuf && !fstat(file, &sbuf)) {
- MSAN_STAT_WORKAROUND(&sbuf);
- statbuf = &sbuf;
- }
- if (statbuf) {
- block_size = statbuf->st_blksize;
- }
- on_ssd = space->atomic_write_supported
+ struct stat sbuf;
+ if (!statbuf && !fstat(file, &sbuf))
+ {
+ MSAN_STAT_WORKAROUND(&sbuf);
+ statbuf= &sbuf;
+ }
+ if (statbuf)
+ block_size= statbuf->st_blksize;
# ifdef __linux__
- || (statbuf && fil_system.is_ssd(statbuf->st_dev))
+ on_ssd= statbuf && fil_system.is_ssd(statbuf->st_dev);
# endif
- ;
-#endif
- if (!space->atomic_write_supported) {
- space->atomic_write_supported = atomic_write
- && srv_use_atomic_writes
-#ifndef _WIN32
- && my_test_if_atomic_write(file,
- space->physical_size())
-#else
- /* On Windows, all single sector writes are atomic,
- as per WriteFile() documentation on MSDN.
- We also require SSD for atomic writes, eventhough
- technically it is not necessary- the reason is that
- on hard disks, we still want the benefit from
- (non-atomic) neighbor page flushing in the buffer
- pool code. */
- && srv_page_size == block_size
- && on_ssd
#endif
- ;
- }
+
+ if (space->purpose != FIL_TYPE_TABLESPACE)
+ {
+ /* For temporary tablespace or during IMPORT TABLESPACE, we
+ disable neighbour flushing and do not care about atomicity. */
+ on_ssd= true;
+ atomic_write= true;
+ }
+ else
+ /* On Windows, all single sector writes are atomic, as per
+ WriteFile() documentation on MSDN. */
+ atomic_write= srv_use_atomic_writes &&
+ IF_WIN(srv_page_size == block_size,
+ my_test_if_atomic_write(file, space->physical_size()));
}
/** Read the first page of a data file.
@return whether the page was found valid */
bool fil_node_t::read_page0()
{
- ut_ad(mutex_own(&fil_system.mutex));
- const unsigned psize = space->physical_size();
+ mysql_mutex_assert_owner(&fil_system.mutex);
+ const unsigned psize= space->physical_size();
#ifndef _WIN32
- struct stat statbuf;
- if (fstat(handle, &statbuf)) {
- return false;
- }
- MSAN_STAT_WORKAROUND(&statbuf);
- os_offset_t size_bytes = statbuf.st_size;
+ struct stat statbuf;
+ if (fstat(handle, &statbuf))
+ return false;
+ MSAN_STAT_WORKAROUND(&statbuf);
+ os_offset_t size_bytes= statbuf.st_size;
#else
- os_offset_t size_bytes = os_file_get_size(handle);
- ut_a(size_bytes != (os_offset_t) -1);
+ os_offset_t size_bytes= os_file_get_size(handle);
+ ut_a(size_bytes != (os_offset_t) -1);
#endif
- const uint32_t min_size = FIL_IBD_FILE_INITIAL_SIZE * psize;
+ const uint32_t min_size= FIL_IBD_FILE_INITIAL_SIZE * psize;
- if (size_bytes < min_size) {
- ib::error() << "The size of the file " << name
- << " is only " << size_bytes
- << " bytes, should be at least " << min_size;
- return false;
- }
+ if (size_bytes < min_size)
+ {
+ ib::error() << "The size of the file " << name
+ << " is only " << size_bytes
+ << " bytes, should be at least " << min_size;
+ return false;
+ }
- page_t *page= static_cast<byte*>(aligned_malloc(psize, psize));
- if (os_file_read(IORequestRead, handle, page, 0, psize)
- != DB_SUCCESS) {
- ib::error() << "Unable to read first page of file " << name;
+ if (!deferred)
+ {
+ page_t *page= static_cast<byte*>(aligned_malloc(psize, psize));
+ if (os_file_read(IORequestRead, handle, page, 0, psize, nullptr)
+ != DB_SUCCESS)
+ {
+ sql_print_error("InnoDB: Unable to read first page of file %s", name);
corrupted:
- aligned_free(page);
- return false;
- }
+ aligned_free(page);
+ return false;
+ }
- const ulint space_id = memcmp_aligned<2>(
- FIL_PAGE_SPACE_ID + page,
- FSP_HEADER_OFFSET + FSP_SPACE_ID + page, 4)
- ? ULINT_UNDEFINED
- : mach_read_from_4(FIL_PAGE_SPACE_ID + page);
- ulint flags = fsp_header_get_flags(page);
- const uint32_t size = fsp_header_get_field(page, FSP_SIZE);
- const uint32_t free_limit = fsp_header_get_field(page, FSP_FREE_LIMIT);
- const uint32_t free_len = flst_get_len(FSP_HEADER_OFFSET + FSP_FREE
- + page);
- if (!fil_space_t::is_valid_flags(flags, space->id)) {
- ulint cflags = fsp_flags_convert_from_101(flags);
- if (cflags == ULINT_UNDEFINED) {
+ const ulint space_id= memcmp_aligned<2>
+ (FIL_PAGE_SPACE_ID + page,
+ FSP_HEADER_OFFSET + FSP_SPACE_ID + page, 4)
+ ? ULINT_UNDEFINED
+ : mach_read_from_4(FIL_PAGE_SPACE_ID + page);
+ ulint flags= fsp_header_get_flags(page);
+ const uint32_t size= fsp_header_get_field(page, FSP_SIZE);
+ const uint32_t free_limit= fsp_header_get_field(page, FSP_FREE_LIMIT);
+ const uint32_t free_len= flst_get_len(FSP_HEADER_OFFSET + FSP_FREE + page);
+ if (!fil_space_t::is_valid_flags(flags, space->id))
+ {
+ ulint cflags= fsp_flags_convert_from_101(flags);
+ if (cflags == ULINT_UNDEFINED)
+ {
invalid:
- ib::error()
- << "Expected tablespace flags "
- << ib::hex(space->flags)
- << " but found " << ib::hex(flags)
- << " in the file " << name;
- goto corrupted;
- }
-
- ulint cf = cflags & ~FSP_FLAGS_MEM_MASK;
- ulint sf = space->flags & ~FSP_FLAGS_MEM_MASK;
+ ib::error() << "Expected tablespace flags "
+ << ib::hex(space->flags)
+ << " but found " << ib::hex(flags)
+ << " in the file " << name;
+ goto corrupted;
+ }
- if (!fil_space_t::is_flags_equal(cf, sf)
- && !fil_space_t::is_flags_equal(sf, cf)) {
- goto invalid;
- }
+ ulint cf= cflags & ~FSP_FLAGS_MEM_MASK;
+ ulint sf= space->flags & ~FSP_FLAGS_MEM_MASK;
- flags = cflags;
- }
+ if (!fil_space_t::is_flags_equal(cf, sf) &&
+ !fil_space_t::is_flags_equal(sf, cf))
+ goto invalid;
+ flags= cflags;
+ }
- ut_ad(!(flags & FSP_FLAGS_MEM_MASK));
+ ut_ad(!(flags & FSP_FLAGS_MEM_MASK));
- /* Try to read crypt_data from page 0 if it is not yet read. */
- if (!space->crypt_data) {
- space->crypt_data = fil_space_read_crypt_data(
- fil_space_t::zip_size(flags), page);
- }
- aligned_free(page);
+ /* Try to read crypt_data from page 0 if it is not yet read. */
+ if (!space->crypt_data)
+ space->crypt_data= fil_space_read_crypt_data(
+ fil_space_t::zip_size(flags), page);
+ aligned_free(page);
- if (UNIV_UNLIKELY(space_id != space->id)) {
- ib::error() << "Expected tablespace id " << space->id
- << " but found " << space_id
- << " in the file " << name;
- return false;
- }
+ if (UNIV_UNLIKELY(space_id != space->id))
+ {
+ ib::error() << "Expected tablespace id " << space->id
+ << " but found " << space_id
+ << " in the file " << name;
+ return false;
+ }
-#ifdef __linux__
- find_metadata(handle, &statbuf);
-#else
- find_metadata();
-#endif
- /* Truncate the size to a multiple of extent size. */
- ulint mask = psize * FSP_EXTENT_SIZE - 1;
+ space->flags= (space->flags & FSP_FLAGS_MEM_MASK) | flags;
+ ut_ad(space->free_limit == 0 || space->free_limit == free_limit);
+ ut_ad(space->free_len == 0 || space->free_len == free_len);
+ space->size_in_header= size;
+ space->free_limit= free_limit;
+ space->free_len= free_len;
+ }
- if (size_bytes <= mask) {
- /* .ibd files start smaller than an
- extent size. Do not truncate valid data. */
- } else {
- size_bytes &= ~os_offset_t(mask);
- }
+ IF_WIN(find_metadata(), find_metadata(handle, false, &statbuf));
+ /* Truncate the size to a multiple of extent size. */
+ ulint mask= psize * FSP_EXTENT_SIZE - 1;
- space->flags = (space->flags & FSP_FLAGS_MEM_MASK) | flags;
+ if (size_bytes <= mask);
+ /* .ibd files start smaller than an
+ extent size. Do not truncate valid data. */
+ else
+ size_bytes&= ~os_offset_t(mask);
- space->punch_hole = space->is_compressed();
- this->size = uint32_t(size_bytes / psize);
- space->set_sizes(this->size);
- ut_ad(space->free_limit == 0 || space->free_limit == free_limit);
- ut_ad(space->free_len == 0 || space->free_len == free_len);
- space->size_in_header = size;
- space->free_limit = free_limit;
- space->free_len = free_len;
- return true;
+ this->size= uint32_t(size_bytes / psize);
+ space->set_sizes(this->size);
+ return true;
}
-#else
-#include "univ.i"
#endif /* !UNIV_INNOCHECKSUM */
-
-/** Normalizes a directory path for the current OS:
-On Windows, we convert '/' to '\', else we convert '\' to '/'.
-@param[in,out] str A null-terminated directory and file path */
-void
-os_normalize_path(
- char* str)
-{
- if (str != NULL) {
- for (; *str; str++) {
- if (*str == OS_PATH_SEPARATOR_ALT) {
- *str = OS_PATH_SEPARATOR;
- }
- }
- }
-}
diff --git a/storage/innobase/page/page0cur.cc b/storage/innobase/page/page0cur.cc
index 2d2ade11442..f83b4d774b1 100644
--- a/storage/innobase/page/page0cur.cc
+++ b/storage/innobase/page/page0cur.cc
@@ -88,7 +88,10 @@ page_cur_try_search_shortcut(
goto exit_func;
}
- next_rec = page_rec_get_next_const(rec);
+ if (!(next_rec = page_rec_get_next_const(rec))) {
+ goto exit_func;
+ }
+
if (!page_rec_is_supremum(next_rec)) {
offsets = rec_get_offsets(next_rec, index, offsets,
index->n_core_fields,
@@ -179,7 +182,10 @@ page_cur_try_search_shortcut_bytes(
goto exit_func;
}
- next_rec = page_rec_get_next_const(rec);
+ if (!(next_rec = page_rec_get_next_const(rec))) {
+ goto exit_func;
+ }
+
if (!page_rec_is_supremum(next_rec)) {
offsets = rec_get_offsets(next_rec, index, offsets,
index->n_core_fields,
@@ -267,11 +273,9 @@ page_cur_rec_field_extends(
/****************************************************************//**
Searches the right position for a page cursor. */
-void
+bool
page_cur_search_with_match(
/*=======================*/
- const buf_block_t* block, /*!< in: buffer block */
- const dict_index_t* index, /*!< in/out: record descriptor */
const dtuple_t* tuple, /*!< in: data tuple */
page_cur_mode_t mode, /*!< in: PAGE_CUR_L,
PAGE_CUR_LE, PAGE_CUR_G, or
@@ -289,7 +293,6 @@ page_cur_search_with_match(
ulint low;
ulint mid;
const page_t* page;
- const page_dir_slot_t* slot;
const rec_t* up_rec;
const rec_t* low_rec;
const rec_t* mid_rec;
@@ -297,6 +300,8 @@ page_cur_search_with_match(
ulint low_matched_fields;
ulint cur_matched_fields;
int cmp;
+ const dict_index_t* const index = cursor->index;
+ const buf_block_t* const block = cursor->block;
#ifdef UNIV_ZIP_DEBUG
const page_zip_des_t* page_zip = buf_block_get_page_zip(block);
#endif /* UNIV_ZIP_DEBUG */
@@ -335,7 +340,7 @@ page_cur_search_with_match(
&& page_cur_try_search_shortcut(
block, index, tuple,
iup_matched_fields, ilow_matched_fields, cursor)) {
- return;
+ return false;
}
# ifdef PAGE_CUR_DBG
if (mode == PAGE_CUR_DBG) {
@@ -352,10 +357,9 @@ page_cur_search_with_match(
if (mode == PAGE_CUR_RTREE_INSERT && n_core) {
mode = PAGE_CUR_LE;
} else {
- rtr_cur_search_with_match(
+ return rtr_cur_search_with_match(
block, (dict_index_t*)index, tuple, mode,
cursor, rtr_info);
- return;
}
}
@@ -386,9 +390,11 @@ page_cur_search_with_match(
while (up - low > 1) {
mid = (low + up) / 2;
- slot = page_dir_get_nth_slot(page, mid);
- mid_rec = page_dir_slot_get_rec(slot);
-
+ const page_dir_slot_t* slot = page_dir_get_nth_slot(page, mid);
+ if (UNIV_UNLIKELY(!(mid_rec
+ = page_dir_slot_get_rec_validate(slot)))) {
+ goto corrupted;
+ }
cur_matched_fields = std::min(low_matched_fields,
up_matched_fields);
@@ -431,18 +437,30 @@ up_slot_match:
}
}
- slot = page_dir_get_nth_slot(page, low);
- low_rec = page_dir_slot_get_rec(slot);
- slot = page_dir_get_nth_slot(page, up);
- up_rec = page_dir_slot_get_rec(slot);
+ low_rec = page_dir_slot_get_rec_validate(
+ page_dir_get_nth_slot(page, low));
+ up_rec = page_dir_slot_get_rec_validate(
+ page_dir_get_nth_slot(page, up));
+ if (UNIV_UNLIKELY(!low_rec || !up_rec)) {
+corrupted:
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+ return true;
+ }
/* Perform linear search until the upper and lower records come to
distance 1 of each other. */
- while (page_rec_get_next_const(low_rec) != up_rec) {
-
- mid_rec = page_rec_get_next_const(low_rec);
-
+ for (;;) {
+ if (const rec_t* next = page_rec_get_next_const(low_rec)) {
+ if (next == up_rec) {
+ break;
+ }
+ mid_rec = next;
+ } else {
+ goto corrupted;
+ }
cur_matched_fields = std::min(low_matched_fields,
up_matched_fields);
@@ -512,6 +530,8 @@ up_rec_match:
if (UNIV_LIKELY_NULL(heap)) {
mem_heap_free(heap);
}
+
+ return false;
}
#ifdef BTR_CUR_HASH_ADAPT
@@ -529,10 +549,8 @@ lower limit record
@param[in,out] ilow_matched_bytes already matched bytes in the
first partially matched field in the lower limit record
@param[out] cursor page cursor */
-void
+bool
page_cur_search_with_match_bytes(
- const buf_block_t* block,
- const dict_index_t* index,
const dtuple_t* tuple,
page_cur_mode_t mode,
ulint* iup_matched_fields,
@@ -543,9 +561,7 @@ page_cur_search_with_match_bytes(
{
ulint up;
ulint low;
- ulint mid;
const page_t* page;
- const page_dir_slot_t* slot;
const rec_t* up_rec;
const rec_t* low_rec;
const rec_t* mid_rec;
@@ -556,6 +572,8 @@ page_cur_search_with_match_bytes(
ulint cur_matched_fields;
ulint cur_matched_bytes;
int cmp;
+ const dict_index_t* const index = cursor->index;
+ const buf_block_t* const block = cursor->block;
#ifdef UNIV_ZIP_DEBUG
const page_zip_des_t* page_zip = buf_block_get_page_zip(block);
#endif /* UNIV_ZIP_DEBUG */
@@ -594,7 +612,7 @@ page_cur_search_with_match_bytes(
iup_matched_fields, iup_matched_bytes,
ilow_matched_fields, ilow_matched_bytes,
cursor)) {
- return;
+ return false;
}
# ifdef PAGE_CUR_DBG
if (mode == PAGE_CUR_DBG) {
@@ -632,9 +650,12 @@ page_cur_search_with_match_bytes(
const ulint n_core = page_is_leaf(page) ? index->n_core_fields : 0;
while (up - low > 1) {
- mid = (low + up) / 2;
- slot = page_dir_get_nth_slot(page, mid);
- mid_rec = page_dir_slot_get_rec(slot);
+ const ulint mid = (low + up) / 2;
+ mid_rec = page_dir_slot_get_rec_validate(
+ page_dir_get_nth_slot(page, mid));
+ if (UNIV_UNLIKELY(!mid_rec)) {
+ goto corrupted;
+ }
ut_pair_min(&cur_matched_fields, &cur_matched_bytes,
low_matched_fields, low_matched_bytes,
@@ -681,18 +702,30 @@ up_slot_match:
}
}
- slot = page_dir_get_nth_slot(page, low);
- low_rec = page_dir_slot_get_rec(slot);
- slot = page_dir_get_nth_slot(page, up);
- up_rec = page_dir_slot_get_rec(slot);
+ low_rec = page_dir_slot_get_rec_validate(
+ page_dir_get_nth_slot(page, low));
+ up_rec = page_dir_slot_get_rec_validate(
+ page_dir_get_nth_slot(page, up));
+ if (UNIV_UNLIKELY(!low_rec || !up_rec)) {
+corrupted:
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+ return true;
+ }
/* Perform linear search until the upper and lower records come to
distance 1 of each other. */
- while (page_rec_get_next_const(low_rec) != up_rec) {
-
- mid_rec = page_rec_get_next_const(low_rec);
-
+ for (;;) {
+ if (const rec_t* next = page_rec_get_next_const(low_rec)) {
+ if (next == up_rec) {
+ break;
+ }
+ mid_rec = next;
+ } else {
+ goto corrupted;
+ }
ut_pair_min(&cur_matched_fields, &cur_matched_bytes,
low_matched_fields, low_matched_bytes,
up_matched_fields, up_matched_bytes);
@@ -761,29 +794,20 @@ up_rec_match:
if (UNIV_LIKELY_NULL(heap)) {
mem_heap_free(heap);
}
+ return false;
}
#endif /* BTR_CUR_HASH_ADAPT */
/***********************************************************//**
Positions a page cursor on a randomly chosen user record on a page. If there
are no user records, sets the cursor on the infimum record. */
-void
-page_cur_open_on_rnd_user_rec(
-/*==========================*/
- buf_block_t* block, /*!< in: page */
- page_cur_t* cursor) /*!< out: page cursor */
+void page_cur_open_on_rnd_user_rec(page_cur_t *cursor)
{
- const ulint n_recs = page_get_n_recs(block->frame);
-
- page_cur_set_before_first(block, cursor);
-
- if (UNIV_UNLIKELY(n_recs == 0)) {
-
- return;
- }
-
- cursor->rec = page_rec_get_nth(block->frame,
- ut_rnd_interval(n_recs) + 1);
+ if (const ulint n_recs= page_get_n_recs(cursor->block->page.frame))
+ if ((cursor->rec= page_rec_get_nth(cursor->block->page.frame,
+ ut_rnd_interval(n_recs) + 1)))
+ return;
+ cursor->rec= page_get_infimum_rec(cursor->block->page.frame);
}
/**
@@ -802,10 +826,10 @@ static void page_rec_set_n_owned(rec_t *rec, ulint n_owned, bool comp)
Split a directory slot which owns too many records.
@param[in,out] block index page
@param[in,out] slot the slot that needs to be split */
-static void page_dir_split_slot(const buf_block_t &block,
+static bool page_dir_split_slot(const buf_block_t &block,
page_dir_slot_t *slot)
{
- ut_ad(slot <= &block.frame[srv_page_size - PAGE_EMPTY_DIR_START]);
+ ut_ad(slot <= &block.page.frame[srv_page_size - PAGE_EMPTY_DIR_START]);
slot= my_assume_aligned<2>(slot);
const ulint n_owned= PAGE_DIR_SLOT_MAX_N_OWNED + 1;
@@ -815,20 +839,30 @@ static void page_dir_split_slot(const buf_block_t &block,
PAGE_DIR_SLOT_MIN_N_OWNED, "compatibility");
/* Find a record approximately in the middle. */
- const rec_t *rec= page_dir_slot_get_rec(slot + PAGE_DIR_SLOT_SIZE);
+ const rec_t *rec= page_dir_slot_get_rec_validate(slot + PAGE_DIR_SLOT_SIZE);
for (ulint i= n_owned / 2; i--; )
+ {
+ if (UNIV_UNLIKELY(!rec))
+ return true;
rec= page_rec_get_next_const(rec);
+ }
+
+ if (UNIV_UNLIKELY(!rec))
+ return true;
/* Add a directory slot immediately below this one. */
constexpr uint16_t n_slots_f= PAGE_N_DIR_SLOTS + PAGE_HEADER;
- byte *n_slots_p= my_assume_aligned<2>(n_slots_f + block.frame);
+ byte *n_slots_p= my_assume_aligned<2>(n_slots_f + block.page.frame);
const uint16_t n_slots= mach_read_from_2(n_slots_p);
page_dir_slot_t *last_slot= static_cast<page_dir_slot_t*>
- (block.frame + srv_page_size - (PAGE_DIR + PAGE_DIR_SLOT_SIZE) -
- n_slots * PAGE_DIR_SLOT_SIZE);
- ut_ad(slot >= last_slot);
+ (block.page.frame + srv_page_size - (PAGE_DIR + PAGE_DIR_SLOT_SIZE) -
+ n_slots * PAGE_DIR_SLOT_SIZE);
+
+ if (UNIV_UNLIKELY(slot < last_slot))
+ return true;
+
memmove_aligned<2>(last_slot, last_slot + PAGE_DIR_SLOT_SIZE,
slot - last_slot);
@@ -836,11 +870,12 @@ static void page_dir_split_slot(const buf_block_t &block,
mach_write_to_2(n_slots_p, n_slots + 1);
- mach_write_to_2(slot, rec - block.frame);
- const bool comp= page_is_comp(block.frame) != 0;
+ mach_write_to_2(slot, rec - block.page.frame);
+ const bool comp= page_is_comp(block.page.frame) != 0;
page_rec_set_n_owned(page_dir_slot_get_rec(slot), half_owned, comp);
page_rec_set_n_owned(page_dir_slot_get_rec(slot - PAGE_DIR_SLOT_SIZE),
n_owned - half_owned, comp);
+ return false;
}
/**
@@ -851,10 +886,10 @@ Split a directory slot which owns too many records.
static void page_zip_dir_split_slot(buf_block_t *block, ulint s, mtr_t* mtr)
{
ut_ad(block->page.zip.data);
- ut_ad(page_is_comp(block->frame));
+ ut_ad(page_is_comp(block->page.frame));
ut_ad(s);
- page_dir_slot_t *slot= page_dir_get_nth_slot(block->frame, s);
+ page_dir_slot_t *slot= page_dir_get_nth_slot(block->page.frame, s);
const ulint n_owned= PAGE_DIR_SLOT_MAX_N_OWNED + 1;
ut_ad(page_dir_slot_get_n_owned(slot) == n_owned);
@@ -866,17 +901,21 @@ static void page_zip_dir_split_slot(buf_block_t *block, ulint s, mtr_t* mtr)
const rec_t *rec= page_dir_slot_get_rec(slot + PAGE_DIR_SLOT_SIZE);
+ /* We do not try to prevent crash on corruption here.
+ For ROW_FORMAT=COMPRESSED pages, the next-record links should
+ be validated in page_zip_decompress(). Corruption should only
+ be possible here if the buffer pool was corrupted later. */
for (ulint i= n_owned / 2; i--; )
rec= page_rec_get_next_const(rec);
/* Add a directory slot immediately below this one. */
constexpr uint16_t n_slots_f= PAGE_N_DIR_SLOTS + PAGE_HEADER;
- byte *n_slots_p= my_assume_aligned<2>(n_slots_f + block->frame);
+ byte *n_slots_p= my_assume_aligned<2>(n_slots_f + block->page.frame);
const uint16_t n_slots= mach_read_from_2(n_slots_p);
page_dir_slot_t *last_slot= static_cast<page_dir_slot_t*>
- (block->frame + srv_page_size - (PAGE_DIR + PAGE_DIR_SLOT_SIZE) -
- n_slots * PAGE_DIR_SLOT_SIZE);
+ (block->page.frame + srv_page_size - (PAGE_DIR + PAGE_DIR_SLOT_SIZE) -
+ n_slots * PAGE_DIR_SLOT_SIZE);
memmove_aligned<2>(last_slot, last_slot + PAGE_DIR_SLOT_SIZE,
slot - last_slot);
@@ -904,10 +943,10 @@ this may result in merging the two slots.
static void page_zip_dir_balance_slot(buf_block_t *block, ulint s, mtr_t *mtr)
{
ut_ad(block->page.zip.data);
- ut_ad(page_is_comp(block->frame));
+ ut_ad(page_is_comp(block->page.frame));
ut_ad(s > 0);
- const ulint n_slots = page_dir_get_n_slots(block->frame);
+ const ulint n_slots = page_dir_get_n_slots(block->page.frame);
if (UNIV_UNLIKELY(s + 1 == n_slots)) {
/* The last directory slot cannot be balanced. */
@@ -916,7 +955,7 @@ static void page_zip_dir_balance_slot(buf_block_t *block, ulint s, mtr_t *mtr)
ut_ad(s < n_slots);
- page_dir_slot_t* slot = page_dir_get_nth_slot(block->frame, s);
+ page_dir_slot_t* slot = page_dir_get_nth_slot(block->page.frame, s);
rec_t* const up_rec = const_cast<rec_t*>
(page_dir_slot_get_rec(slot - PAGE_DIR_SLOT_SIZE));
rec_t* const slot_rec = const_cast<rec_t*>
@@ -936,12 +975,12 @@ static void page_zip_dir_balance_slot(buf_block_t *block, ulint s, mtr_t *mtr)
true, mtr);
/* Shift the slots */
page_dir_slot_t* last_slot = page_dir_get_nth_slot(
- block->frame, n_slots - 1);
+ block->page.frame, n_slots - 1);
memmove_aligned<2>(last_slot + PAGE_DIR_SLOT_SIZE, last_slot,
slot - last_slot);
constexpr uint16_t n_slots_f = PAGE_N_DIR_SLOTS + PAGE_HEADER;
byte *n_slots_p= my_assume_aligned<2>
- (n_slots_f + block->frame);
+ (n_slots_f + block->page.frame);
mtr->write<2>(*block, n_slots_p, n_slots - 1);
memcpy_aligned<2>(n_slots_f + block->page.zip.data,
n_slots_p, 2);
@@ -951,8 +990,12 @@ static void page_zip_dir_balance_slot(buf_block_t *block, ulint s, mtr_t *mtr)
/* Transfer one record to the underfilled slot */
page_rec_set_n_owned<true>(block, slot_rec, 0, true, mtr);
- rec_t* new_rec = rec_get_next_ptr(slot_rec, TRUE);
- page_rec_set_n_owned<true>(block, new_rec,
+ const rec_t* new_rec = page_rec_get_next_low(slot_rec, TRUE);
+ /* We do not try to prevent crash on corruption here.
+ For ROW_FORMAT=COMPRESSED pages, the next-record links should
+ be validated in page_zip_decompress(). Corruption should only
+ be possible here if the buffer pool was corrupted later. */
+ page_rec_set_n_owned<true>(block, const_cast<rec_t*>(new_rec),
PAGE_DIR_SLOT_MIN_N_OWNED,
true, mtr);
mach_write_to_2(slot, page_offset(new_rec));
@@ -967,11 +1010,11 @@ this may result in merging the two slots.
@param[in] s the slot to be balanced */
static void page_dir_balance_slot(const buf_block_t &block, ulint s)
{
- const bool comp= page_is_comp(block.frame);
+ const bool comp= page_is_comp(block.page.frame);
ut_ad(!block.page.zip.data);
ut_ad(s > 0);
- const ulint n_slots = page_dir_get_n_slots(block.frame);
+ const ulint n_slots = page_dir_get_n_slots(block.page.frame);
if (UNIV_UNLIKELY(s + 1 == n_slots)) {
/* The last directory slot cannot be balanced. */
@@ -980,7 +1023,7 @@ static void page_dir_balance_slot(const buf_block_t &block, ulint s)
ut_ad(s < n_slots);
- page_dir_slot_t* slot = page_dir_get_nth_slot(block.frame, s);
+ page_dir_slot_t* slot = page_dir_get_nth_slot(block.page.frame, s);
rec_t* const up_rec = const_cast<rec_t*>
(page_dir_slot_get_rec(slot - PAGE_DIR_SLOT_SIZE));
rec_t* const slot_rec = const_cast<rec_t*>
@@ -1001,30 +1044,39 @@ static void page_dir_balance_slot(const buf_block_t &block, ulint s)
+ (PAGE_DIR_SLOT_MIN_N_OWNED - 1), comp);
/* Shift the slots */
page_dir_slot_t* last_slot = page_dir_get_nth_slot(
- block.frame, n_slots - 1);
+ block.page.frame, n_slots - 1);
memmove_aligned<2>(last_slot + PAGE_DIR_SLOT_SIZE, last_slot,
slot - last_slot);
memset_aligned<2>(last_slot, 0, 2);
constexpr uint16_t n_slots_f = PAGE_N_DIR_SLOTS + PAGE_HEADER;
byte *n_slots_p= my_assume_aligned<2>
- (n_slots_f + block.frame);
+ (n_slots_f + block.page.frame);
mach_write_to_2(n_slots_p, n_slots - 1);
return;
}
/* Transfer one record to the underfilled slot */
- rec_t* new_rec;
+ const rec_t* new_rec;
if (comp) {
+ if (UNIV_UNLIKELY(!(new_rec =
+ page_rec_get_next_low(slot_rec, true)))) {
+ ut_ad("corrupted page" == 0);
+ return;
+ }
page_rec_set_n_owned(slot_rec, 0, true);
- new_rec = rec_get_next_ptr(slot_rec, TRUE);
- page_rec_set_n_owned(new_rec, PAGE_DIR_SLOT_MIN_N_OWNED, true);
+ page_rec_set_n_owned(const_cast<rec_t*>(new_rec),
+ PAGE_DIR_SLOT_MIN_N_OWNED, true);
page_rec_set_n_owned(up_rec, up_n_owned - 1, true);
} else {
+ if (UNIV_UNLIKELY(!(new_rec =
+ page_rec_get_next_low(slot_rec, false)))) {
+ ut_ad("corrupted page" == 0);
+ return;
+ }
page_rec_set_n_owned(slot_rec, 0, false);
- new_rec = rec_get_next_ptr(slot_rec, FALSE);
- page_rec_set_n_owned(new_rec, PAGE_DIR_SLOT_MIN_N_OWNED,
- false);
+ page_rec_set_n_owned(const_cast<rec_t*>(new_rec),
+ PAGE_DIR_SLOT_MIN_N_OWNED, false);
page_rec_set_n_owned(up_rec, up_n_owned - 1, false);
}
@@ -1045,14 +1097,15 @@ static byte* page_mem_alloc_heap(buf_block_t *block, ulint need,
ut_ad(!compressed || block->page.zip.data);
byte *heap_top= my_assume_aligned<2>(PAGE_HEAP_TOP + PAGE_HEADER +
- block->frame);
+ block->page.frame);
const uint16_t top= mach_read_from_2(heap_top);
- if (need > page_get_max_insert_size(block->frame, 1))
+ if (need > page_get_max_insert_size(block->page.frame, 1))
return NULL;
- byte *n_heap= my_assume_aligned<2>(PAGE_N_HEAP + PAGE_HEADER + block->frame);
+ byte *n_heap= my_assume_aligned<2>
+ (PAGE_N_HEAP + PAGE_HEADER + block->page.frame);
const uint16_t h= mach_read_from_2(n_heap);
if (UNIV_UNLIKELY((h + 1) & 0x6000))
@@ -1078,7 +1131,7 @@ static byte* page_mem_alloc_heap(buf_block_t *block, ulint need,
heap_top, 4);
}
- return &block->frame[top];
+ return &block->page.frame[top];
}
/** Write log for inserting a B-tree or R-tree record in
@@ -1103,13 +1156,14 @@ inline void mtr_t::page_insert(const buf_block_t &block, bool reuse,
{
ut_ad(!block.page.zip.data);
ut_ad(m_log_mode == MTR_LOG_ALL);
- ut_d(ulint n_slots= page_dir_get_n_slots(block.frame));
+ ut_d(ulint n_slots= page_dir_get_n_slots(block.page.frame));
ut_ad(n_slots >= 2);
- ut_d(const byte *page_end= page_dir_get_nth_slot(block.frame, n_slots - 1));
- ut_ad(&block.frame[prev_rec + PAGE_OLD_INFIMUM] <= page_end);
- ut_ad(block.frame + page_header_get_offs(block.frame, PAGE_HEAP_TOP) <=
- page_end);
- ut_ad(fil_page_index_page_check(block.frame));
+ ut_d(const byte *page_end=
+ page_dir_get_nth_slot(block.page.frame, n_slots - 1));
+ ut_ad(&block.page.frame[prev_rec + PAGE_OLD_INFIMUM] <= page_end);
+ ut_ad(block.page.frame +
+ page_header_get_offs(block.page.frame, PAGE_HEAP_TOP) <= page_end);
+ ut_ad(fil_page_index_page_check(block.page.frame));
ut_ad(!(~(REC_INFO_MIN_REC_FLAG | REC_INFO_DELETED_FLAG) & info_bits));
ut_ad(n_fields_s >= 2);
ut_ad((n_fields_s >> 1) <= REC_MAX_N_FIELDS);
@@ -1188,15 +1242,16 @@ inline void mtr_t::page_insert(const buf_block_t &block, bool reuse,
{
ut_ad(!block.page.zip.data);
ut_ad(m_log_mode == MTR_LOG_ALL);
- ut_d(ulint n_slots= page_dir_get_n_slots(block.frame));
+ ut_d(ulint n_slots= page_dir_get_n_slots(block.page.frame));
ut_ad(n_slots >= 2);
- ut_d(const byte *page_end= page_dir_get_nth_slot(block.frame, n_slots - 1));
- ut_ad(&block.frame[prev_rec + PAGE_NEW_INFIMUM] <= page_end);
- ut_ad(block.frame + page_header_get_offs(block.frame, PAGE_HEAP_TOP) <=
- page_end);
- ut_ad(fil_page_index_page_check(block.frame));
- ut_ad(hdr_l + hdr_c + data_l + data_c <=
- static_cast<size_t>(page_end - &block.frame[PAGE_NEW_SUPREMUM_END]));
+ ut_d(const byte *page_end= page_dir_get_nth_slot(block.page.frame,
+ n_slots - 1));
+ ut_ad(&block.page.frame[prev_rec + PAGE_NEW_INFIMUM] <= page_end);
+ ut_ad(block.page.frame +
+ page_header_get_offs(block.page.frame, PAGE_HEAP_TOP) <= page_end);
+ ut_ad(fil_page_index_page_check(block.page.frame));
+ ut_ad(hdr_l + hdr_c + data_l + data_c <= static_cast<size_t>
+ (page_end - &block.page.frame[PAGE_NEW_SUPREMUM_END]));
ut_ad(reuse || shift == 0);
#ifdef UNIV_DEBUG
switch (~(REC_INFO_MIN_REC_FLAG | REC_INFO_DELETED_FLAG) & info_status) {
@@ -1204,11 +1259,11 @@ inline void mtr_t::page_insert(const buf_block_t &block, bool reuse,
ut_ad(0);
break;
case REC_STATUS_NODE_PTR:
- ut_ad(!page_is_leaf(block.frame));
+ ut_ad(!page_is_leaf(block.page.frame));
break;
case REC_STATUS_INSTANT:
case REC_STATUS_ORDINARY:
- ut_ad(page_is_leaf(block.frame));
+ ut_ad(page_is_leaf(block.page.frame));
}
#endif
@@ -1278,6 +1333,20 @@ inline void mtr_t::page_insert(const buf_block_t &block, bool reuse,
m_last_offset= FIL_PAGE_TYPE;
}
+/** Report page directory corruption.
+@param block index page
+@param index index tree
+*/
+ATTRIBUTE_COLD
+static void page_cur_directory_corrupted(const buf_block_t &block,
+ const dict_index_t &index)
+{
+ ib::error() << "Directory of " << block.page.id()
+ << " of index " << index.name
+ << " in table " << index.table->name
+ << " is corrupted";
+}
+
/***********************************************************//**
Inserts a record next to page cursor on an uncompressed page.
@return pointer to record
@@ -1286,27 +1355,27 @@ rec_t*
page_cur_insert_rec_low(
/*====================*/
const page_cur_t*cur, /*!< in: page cursor */
- dict_index_t* index, /*!< in: record descriptor */
const rec_t* rec, /*!< in: record to insert after cur */
rec_offs* offsets,/*!< in/out: rec_get_offsets(rec, index) */
mtr_t* mtr) /*!< in/out: mini-transaction */
{
- buf_block_t* block= cur->block;
+ buf_block_t *block= cur->block;
+ dict_index_t * const index= cur->index;
ut_ad(rec_offs_validate(rec, index, offsets));
ut_ad(rec_offs_n_fields(offsets) > 0);
- ut_ad(index->table->not_redundant() == !!page_is_comp(block->frame));
- ut_ad(!!page_is_comp(block->frame) == !!rec_offs_comp(offsets));
- ut_ad(fil_page_index_page_check(block->frame));
- ut_ad(mach_read_from_8(PAGE_HEADER + PAGE_INDEX_ID + block->frame) ==
+ ut_ad(index->table->not_redundant() == !!page_is_comp(block->page.frame));
+ ut_ad(!!page_is_comp(block->page.frame) == !!rec_offs_comp(offsets));
+ ut_ad(fil_page_index_page_check(block->page.frame));
+ ut_ad(mach_read_from_8(PAGE_HEADER + PAGE_INDEX_ID + block->page.frame) ==
index->id ||
mtr->is_inside_ibuf());
- ut_ad(page_dir_get_n_slots(block->frame) >= 2);
+ ut_ad(page_dir_get_n_slots(block->page.frame) >= 2);
ut_ad(!page_rec_is_supremum(cur->rec));
/* We should not write log for ROW_FORMAT=COMPRESSED pages here. */
- ut_ad(mtr->get_log_mode() != MTR_LOG_ALL ||
+ ut_ad(!mtr->is_logged() ||
!(index->table->flags & DICT_TF_MASK_ZIP_SSIZE));
/* 1. Get the size of the physical record in the page */
@@ -1318,7 +1387,7 @@ page_cur_insert_rec_low(
rec - rec_offs_extra_size(offsets);
ulint extra_size __attribute__((unused))=
rec_offs_extra_size(offsets) -
- (page_is_comp(block->frame)
+ (page_is_comp(block->page.frame)
? REC_N_NEW_EXTRA_BYTES
: REC_N_OLD_EXTRA_BYTES);
/* All data bytes of the record must be valid. */
@@ -1334,10 +1403,10 @@ page_cur_insert_rec_low(
ulint heap_no;
byte *insert_buf;
- const bool comp= page_is_comp(block->frame);
+ const bool comp= page_is_comp(block->page.frame);
const ulint extra_size= rec_offs_extra_size(offsets);
- if (rec_t* free_rec= page_header_get_ptr(block->frame, PAGE_FREE))
+ if (rec_t* free_rec= page_header_get_ptr(block->page.frame, PAGE_FREE))
{
/* Try to reuse the head of PAGE_FREE. */
rec_offs foffsets_[REC_OFFS_NORMAL_SIZE];
@@ -1346,7 +1415,7 @@ page_cur_insert_rec_low(
rec_offs_init(foffsets_);
rec_offs *foffsets= rec_get_offsets(free_rec, index, foffsets_,
- page_is_leaf(block->frame)
+ page_is_leaf(block->page.frame)
? index->n_core_fields : 0,
ULINT_UNDEFINED, &heap);
const ulint fextra_size= rec_offs_extra_size(foffsets);
@@ -1360,13 +1429,14 @@ page_cur_insert_rec_low(
goto use_heap;
byte *page_free= my_assume_aligned<2>(PAGE_FREE + PAGE_HEADER +
- block->frame);
+ block->page.frame);
if (comp)
{
heap_no= rec_get_heap_no_new(free_rec);
uint16_t next= mach_read_from_2(free_rec - REC_NEXT);
mach_write_to_2(page_free, next
- ? static_cast<uint16_t>(free_rec + next - block->frame)
+ ? static_cast<uint16_t>(free_rec + next -
+ block->page.frame)
: 0);
}
else
@@ -1394,40 +1464,40 @@ use_heap:
ut_ad(cur->rec != insert_buf + extra_size);
- rec_t *next_rec= block->frame + rec_get_next_offs(cur->rec, comp);
- ut_ad(next_rec != block->frame);
+ rec_t *next_rec= block->page.frame + rec_get_next_offs(cur->rec, comp);
+ ut_ad(next_rec != block->page.frame);
/* Update page header fields */
byte *page_last_insert= my_assume_aligned<2>(PAGE_LAST_INSERT + PAGE_HEADER +
- block->frame);
+ block->page.frame);
const uint16_t last_insert= mach_read_from_2(page_last_insert);
ut_ad(!last_insert || !comp ||
- rec_get_node_ptr_flag(block->frame + last_insert) ==
+ rec_get_node_ptr_flag(block->page.frame + last_insert) ==
rec_get_node_ptr_flag(rec));
/* Write PAGE_LAST_INSERT */
mach_write_to_2(page_last_insert, page_offset(insert_buf + extra_size));
/* Update PAGE_DIRECTION_B, PAGE_N_DIRECTION if needed */
- if (block->frame[FIL_PAGE_TYPE + 1] != byte(FIL_PAGE_RTREE))
+ if (block->page.frame[FIL_PAGE_TYPE + 1] != byte(FIL_PAGE_RTREE))
{
- byte *dir= &block->frame[PAGE_DIRECTION_B + PAGE_HEADER];
+ byte *dir= &block->page.frame[PAGE_DIRECTION_B + PAGE_HEADER];
byte *n= my_assume_aligned<2>
- (&block->frame[PAGE_N_DIRECTION + PAGE_HEADER]);
+ (&block->page.frame[PAGE_N_DIRECTION + PAGE_HEADER]);
if (UNIV_UNLIKELY(!last_insert))
{
no_direction:
*dir= static_cast<byte>((*dir & ~((1U << 3) - 1)) | PAGE_NO_DIRECTION);
memset(n, 0, 2);
}
- else if (block->frame + last_insert == cur->rec &&
+ else if (block->page.frame + last_insert == cur->rec &&
(*dir & ((1U << 3) - 1)) != PAGE_LEFT)
{
*dir= static_cast<byte>((*dir & ~((1U << 3) - 1)) | PAGE_RIGHT);
inc_dir:
mach_write_to_2(n, mach_read_from_2(n) + 1);
}
- else if (next_rec == block->frame + last_insert &&
+ else if (next_rec == block->page.frame + last_insert &&
(*dir & ((1U << 3) - 1)) != PAGE_RIGHT)
{
*dir= static_cast<byte>((*dir & ~((1U << 3) - 1)) | PAGE_LEFT);
@@ -1439,7 +1509,7 @@ inc_dir:
/* Update PAGE_N_RECS. */
byte *page_n_recs= my_assume_aligned<2>(PAGE_N_RECS + PAGE_HEADER +
- block->frame);
+ block->page.frame);
mach_write_to_2(page_n_recs, mach_read_from_2(page_n_recs) + 1);
@@ -1469,17 +1539,17 @@ inc_dir:
}
switch (rec_get_status(rec)) {
case REC_STATUS_NODE_PTR:
- ut_ad(!page_is_leaf(block->frame));
+ ut_ad(!page_is_leaf(block->page.frame));
break;
case REC_STATUS_INSTANT:
ut_ad(index->is_instant());
- ut_ad(page_is_leaf(block->frame));
+ ut_ad(page_is_leaf(block->page.frame));
if (!rec_is_metadata(rec, true))
break;
- ut_ad(cur->rec == &block->frame[PAGE_NEW_INFIMUM]);
+ ut_ad(cur->rec == &block->page.frame[PAGE_NEW_INFIMUM]);
break;
case REC_STATUS_ORDINARY:
- ut_ad(page_is_leaf(block->frame));
+ ut_ad(page_is_leaf(block->page.frame));
ut_ad(!(rec_get_info_bits(rec, true) & ~REC_INFO_DELETED_FLAG));
break;
case REC_STATUS_INFIMUM:
@@ -1500,12 +1570,12 @@ inc_dir:
static_cast<uint16_t>(insert_rec - cur->rec));
while (!(n_owned= rec_get_n_owned_new(next_rec)))
{
- next_rec= block->frame + rec_get_next_offs(next_rec, true);
- ut_ad(next_rec != block->frame);
+ next_rec= block->page.frame + rec_get_next_offs(next_rec, true);
+ ut_ad(next_rec != block->page.frame);
}
rec_set_bit_field_1(next_rec, n_owned + 1, REC_NEW_N_OWNED,
REC_N_OWNED_MASK, REC_N_OWNED_SHIFT);
- if (mtr->get_log_mode() != MTR_LOG_ALL)
+ if (!mtr->is_logged())
{
mtr->set_modified(*block);
goto copied;
@@ -1514,7 +1584,7 @@ inc_dir:
const byte * const c_start= cur->rec - extra_size;
if (extra_size > REC_N_NEW_EXTRA_BYTES &&
c_start >=
- &block->frame[PAGE_NEW_SUPREMUM_END + REC_N_NEW_EXTRA_BYTES])
+ &block->page.frame[PAGE_NEW_SUPREMUM_END + REC_N_NEW_EXTRA_BYTES])
{
/* Find common header bytes with the preceding record. */
const byte *r= rec - (REC_N_NEW_EXTRA_BYTES + 1);
@@ -1527,11 +1597,11 @@ inc_dir:
else
{
#ifdef UNIV_DEBUG
- if (!page_is_leaf(block->frame));
+ if (!page_is_leaf(block->page.frame));
else if (rec_is_metadata(rec, false))
{
ut_ad(index->is_instant());
- ut_ad(cur->rec == &block->frame[PAGE_OLD_INFIMUM]);
+ ut_ad(cur->rec == &block->page.frame[PAGE_OLD_INFIMUM]);
}
#endif
rec_set_bit_field_1(insert_rec, 0, REC_OLD_N_OWNED,
@@ -1542,12 +1612,12 @@ inc_dir:
mach_write_to_2(cur->rec - REC_NEXT, page_offset(insert_rec));
while (!(n_owned= rec_get_n_owned_old(next_rec)))
{
- next_rec= block->frame + rec_get_next_offs(next_rec, false);
- ut_ad(next_rec != block->frame);
+ next_rec= block->page.frame + rec_get_next_offs(next_rec, false);
+ ut_ad(next_rec != block->page.frame);
}
rec_set_bit_field_1(next_rec, n_owned + 1, REC_OLD_N_OWNED,
REC_N_OWNED_MASK, REC_N_OWNED_SHIFT);
- if (mtr->get_log_mode() != MTR_LOG_ALL)
+ if (!mtr->is_logged())
{
mtr->set_modified(*block);
goto copied;
@@ -1556,7 +1626,7 @@ inc_dir:
ut_ad(extra_size > REC_N_OLD_EXTRA_BYTES);
const byte * const c_start= cur->rec - extra_size;
if (c_start >=
- &block->frame[PAGE_OLD_SUPREMUM_END + REC_N_OLD_EXTRA_BYTES])
+ &block->page.frame[PAGE_OLD_SUPREMUM_END + REC_N_OLD_EXTRA_BYTES])
{
/* Find common header bytes with the preceding record. */
const byte *r= rec - (REC_N_OLD_EXTRA_BYTES + 1);
@@ -1568,7 +1638,7 @@ inc_dir:
}
/* Insert the record, possibly copying from the preceding record. */
- ut_ad(mtr->get_log_mode() == MTR_LOG_ALL);
+ ut_ad(mtr->is_logged());
{
const byte *r= rec;
@@ -1583,9 +1653,9 @@ inc_dir:
c_end >= next_rec - REC_N_OLD_EXTRA_BYTES + comp)
c_end= next_rec - REC_N_OLD_EXTRA_BYTES + comp;
else
- c_end= std::min<const byte*>(c_end, block->frame + srv_page_size -
+ c_end= std::min<const byte*>(c_end, block->page.frame + srv_page_size -
PAGE_DIR - PAGE_DIR_SLOT_SIZE *
- page_dir_get_n_slots(block->frame));
+ page_dir_get_n_slots(block->page.frame));
size_t data_common;
/* Copy common data bytes of the preceding record. */
for (; c != c_end && *r == *c; c++, r++);
@@ -1593,14 +1663,14 @@ inc_dir:
if (comp)
mtr->page_insert(*block, reuse,
- cur->rec - block->frame - PAGE_NEW_INFIMUM,
+ cur->rec - block->page.frame - PAGE_NEW_INFIMUM,
info_status, free_offset, hdr_common, data_common,
insert_buf,
extra_size - hdr_common - REC_N_NEW_EXTRA_BYTES,
r, data_size - data_common);
else
mtr->page_insert(*block, reuse,
- cur->rec - block->frame - PAGE_OLD_INFIMUM,
+ cur->rec - block->page.frame - PAGE_OLD_INFIMUM,
info_status, rec_get_n_fields_old(insert_rec) << 1 |
rec_get_1byte_offs_flag(insert_rec),
hdr_common, data_common,
@@ -1619,12 +1689,20 @@ copied:
if (UNIV_UNLIKELY(n_owned == PAGE_DIR_SLOT_MAX_N_OWNED))
{
- const auto owner= page_dir_find_owner_slot(next_rec);
- page_dir_split_slot(*block, page_dir_get_nth_slot(block->frame, owner));
+ const ulint owner= page_dir_find_owner_slot(next_rec);
+ if (UNIV_UNLIKELY(owner == ULINT_UNDEFINED))
+ {
+ page_cur_directory_corrupted(*block, *index);
+ return nullptr;
+ }
+
+ if (page_dir_split_slot(*block, page_dir_get_nth_slot(block->page.frame,
+ owner)))
+ return nullptr;
}
rec_offs_make_valid(insert_buf + extra_size, index,
- page_is_leaf(block->frame), offsets);
+ page_is_leaf(block->page.frame), offsets);
return insert_buf + extra_size;
}
@@ -1688,40 +1766,42 @@ static inline void page_zip_dir_add_slot(buf_block_t *block,
/***********************************************************//**
Inserts a record next to page cursor on a compressed and uncompressed
-page. Returns pointer to inserted record if succeed, i.e.,
-enough space available, NULL otherwise.
-The cursor stays at the same position.
+page.
IMPORTANT: The caller will have to update IBUF_BITMAP_FREE
if this is a compressed leaf page in a secondary index.
This has to be done either within the same mini-transaction,
or by invoking ibuf_reset_free_bits() before mtr_commit().
-@return pointer to record if succeed, NULL otherwise */
+@return pointer to inserted record
+@return nullptr on failure */
rec_t*
page_cur_insert_rec_zip(
/*====================*/
- page_cur_t* cursor, /*!< in/out: page cursor */
- dict_index_t* index, /*!< in: record descriptor */
+ page_cur_t* cursor, /*!< in/out: page cursor,
+ logical position unchanged */
const rec_t* rec, /*!< in: pointer to a physical record */
rec_offs* offsets,/*!< in/out: rec_get_offsets(rec, index) */
mtr_t* mtr) /*!< in/out: mini-transaction */
{
page_zip_des_t * const page_zip= page_cur_get_page_zip(cursor);
+ page_t * const page= cursor->block->page.frame;
+ dict_index_t * const index = cursor->index;
+
ut_ad(page_zip);
ut_ad(rec_offs_validate(rec, index, offsets));
ut_ad(index->table->not_redundant());
- ut_ad(page_is_comp(cursor->block->frame));
+ ut_ad(page_is_comp(page));
ut_ad(rec_offs_comp(offsets));
- ut_ad(fil_page_get_type(cursor->block->frame) == FIL_PAGE_INDEX ||
- fil_page_get_type(cursor->block->frame) == FIL_PAGE_RTREE);
- ut_ad(mach_read_from_8(PAGE_HEADER + PAGE_INDEX_ID + cursor->block->frame) ==
+ ut_ad(fil_page_get_type(page) == FIL_PAGE_INDEX ||
+ fil_page_get_type(page) == FIL_PAGE_RTREE);
+ ut_ad(mach_read_from_8(PAGE_HEADER + PAGE_INDEX_ID + page) ==
index->id || mtr->is_inside_ibuf());
- ut_ad(!page_get_instant(cursor->block->frame));
+ ut_ad(!page_get_instant(page));
ut_ad(!page_cur_is_after_last(cursor));
#ifdef UNIV_ZIP_DEBUG
- ut_a(page_zip_validate(page_zip, cursor->block->frame, index));
+ ut_a(page_zip_validate(page_zip, page, index));
#endif /* UNIV_ZIP_DEBUG */
/* 1. Get the size of the physical record in the page */
@@ -1739,13 +1819,11 @@ page_cur_insert_rec_zip(
MEM_CHECK_DEFINED(rec_start, extra_size);
}
#endif /* HAVE_MEM_CHECK */
- const bool reorg_before_insert= page_has_garbage(cursor->block->frame) &&
- rec_size > page_get_max_insert_size(cursor->block->frame, 1) &&
- rec_size <= page_get_max_insert_size_after_reorganize(cursor->block->frame,
- 1);
+ const bool reorg_before_insert= page_has_garbage(page) &&
+ rec_size > page_get_max_insert_size(page, 1) &&
+ rec_size <= page_get_max_insert_size_after_reorganize(page, 1);
constexpr uint16_t page_free_f= PAGE_FREE + PAGE_HEADER;
- byte* const page_free = my_assume_aligned<4>(page_free_f +
- cursor->block->frame);
+ byte* const page_free = my_assume_aligned<4>(page_free_f + page);
uint16_t free_rec= 0;
/* 2. Try to find suitable space from page memory management */
@@ -1761,15 +1839,14 @@ page_cur_insert_rec_zip(
const rec_t * const cursor_rec= page_cur_get_rec(cursor);
#endif /* UNIV_DEBUG */
- if (page_is_empty(cursor->block->frame))
+ if (page_is_empty(page))
{
ut_ad(page_cur_is_before_first(cursor));
/* This is an empty page. Recreate to remove the modification log. */
page_create_zip(cursor->block, index,
- page_header_get_field(cursor->block->frame, PAGE_LEVEL),
- 0, mtr);
- ut_ad(!page_header_get_ptr(cursor->block->frame, PAGE_FREE));
+ page_header_get_field(page, PAGE_LEVEL), 0, mtr);
+ ut_ad(!page_header_get_ptr(page, PAGE_FREE));
if (page_zip_available(page_zip, index->is_clust(), rec_size, 1))
goto use_heap;
@@ -1778,22 +1855,32 @@ page_cur_insert_rec_zip(
return nullptr;
}
- if (page_zip->m_nonempty || page_has_garbage(cursor->block->frame))
+ if (page_zip->m_nonempty || page_has_garbage(page))
{
ulint pos= page_rec_get_n_recs_before(cursor->rec);
- if (!page_zip_reorganize(cursor->block, index, level, mtr, true))
- {
+ if (UNIV_UNLIKELY(pos == ULINT_UNDEFINED))
+ return nullptr;
+
+ switch (page_zip_reorganize(cursor->block, index, level, mtr, true)) {
+ case DB_FAIL:
ut_ad(cursor->rec == cursor_rec);
return nullptr;
+ case DB_SUCCESS:
+ break;
+ default:
+ return nullptr;
}
- if (pos)
- cursor->rec= page_rec_get_nth(cursor->block->frame, pos);
- else
- ut_ad(cursor->rec == page_get_infimum_rec(cursor->block->frame));
+ if (!pos)
+ ut_ad(cursor->rec == page + PAGE_NEW_INFIMUM);
+ else if (!(cursor->rec= page_rec_get_nth(page, pos)))
+ {
+ cursor->rec= page + PAGE_NEW_SUPREMUM;
+ return nullptr;
+ }
- ut_ad(!page_header_get_ptr(cursor->block->frame, PAGE_FREE));
+ ut_ad(!page_header_get_ptr(page, PAGE_FREE));
if (page_zip_available(page_zip, index->is_clust(), rec_size, 1))
goto use_heap;
@@ -1801,40 +1888,45 @@ page_cur_insert_rec_zip(
/* Try compressing the whole page afterwards. */
const mtr_log_t log_mode= mtr->set_log_mode(MTR_LOG_NONE);
- rec_t *insert_rec= page_cur_insert_rec_low(cursor, index, rec, offsets,
- mtr);
+ rec_t *insert_rec= page_cur_insert_rec_low(cursor, rec, offsets, mtr);
mtr->set_log_mode(log_mode);
if (insert_rec)
{
ulint pos= page_rec_get_n_recs_before(insert_rec);
- ut_ad(pos > 0);
+ if (UNIV_UNLIKELY(!pos || pos == ULINT_UNDEFINED))
+ return nullptr;
/* We are writing entire page images to the log. Reduce the redo
log volume by reorganizing the page at the same time. */
- if (page_zip_reorganize(cursor->block, index, level, mtr))
- {
+ switch (page_zip_reorganize(cursor->block, index, level, mtr)) {
+ case DB_SUCCESS:
/* The page was reorganized: Seek to pos. */
- cursor->rec= pos > 1
- ? page_rec_get_nth(cursor->block->frame, pos - 1)
- : cursor->block->frame + PAGE_NEW_INFIMUM;
- insert_rec= cursor->block->frame + rec_get_next_offs(cursor->rec, 1);
- rec_offs_make_valid(insert_rec, index,
- page_is_leaf(cursor->block->frame), offsets);
- return insert_rec;
+ if (pos <= 1)
+ cursor->rec= page + PAGE_NEW_INFIMUM;
+ else if (!(cursor->rec= page_rec_get_nth(page, pos - 1)))
+ {
+ cursor->rec= page + PAGE_NEW_INFIMUM;
+ return nullptr;
+ }
+ insert_rec= page + rec_get_next_offs(cursor->rec, 1);
+ rec_offs_make_valid(insert_rec, index, page_is_leaf(page), offsets);
+ break;
+ case DB_FAIL:
+ /* Theoretically, we could try one last resort of
+ page_zip_reorganize() followed by page_zip_available(), but that
+ would be very unlikely to succeed. (If the full reorganized page
+ failed to compress, why would it succeed to compress the page,
+ plus log the insert of this record?) */
+
+ /* Out of space: restore the page */
+ if (!page_zip_decompress(page_zip, page, false))
+ ut_error; /* Memory corrupted? */
+ ut_ad(page_validate(page, index));
+ /* fall through */
+ default:
+ insert_rec= nullptr;
}
-
- /* Theoretically, we could try one last resort of
- page_zip_reorganize() followed by page_zip_available(), but that
- would be very unlikely to succeed. (If the full reorganized page
- failed to compress, why would it succeed to compress the page,
- plus log the insert of this record?) */
-
- /* Out of space: restore the page */
- if (!page_zip_decompress(page_zip, cursor->block->frame, false))
- ut_error; /* Memory corrupted? */
- ut_ad(page_validate(cursor->block->frame, index));
- insert_rec= nullptr;
}
return insert_rec;
}
@@ -1848,13 +1940,11 @@ page_cur_insert_rec_zip(
rec_offs_init(foffsets_);
- rec_offs *foffsets= rec_get_offsets(cursor->block->frame + free_rec, index,
- foffsets_,
- page_is_leaf(cursor->block->frame)
+ rec_offs *foffsets= rec_get_offsets(page + free_rec, index, foffsets_,
+ page_is_leaf(page)
? index->n_core_fields : 0,
ULINT_UNDEFINED, &heap);
- insert_buf= cursor->block->frame + free_rec -
- rec_offs_extra_size(foffsets);
+ insert_buf= page + free_rec - rec_offs_extra_size(foffsets);
if (rec_offs_size(foffsets) < rec_size)
{
@@ -1882,21 +1972,27 @@ too_small:
/* Do not allow extra_size to grow */
goto too_small;
- byte *const free_rec_ptr= cursor->block->frame + free_rec;
+ byte *const free_rec_ptr= page + free_rec;
heap_no= rec_get_heap_no_new(free_rec_ptr);
- int16_t next_rec= mach_read_from_2(free_rec_ptr - REC_NEXT);
+ int16_t next_free= mach_read_from_2(free_rec_ptr - REC_NEXT);
/* With innodb_page_size=64k, int16_t would be unsafe to use here,
but that cannot be used with ROW_FORMAT=COMPRESSED. */
static_assert(UNIV_ZIP_SIZE_SHIFT_MAX == 14, "compatibility");
- if (next_rec)
+ if (next_free)
{
- next_rec= static_cast<int16_t>(next_rec + free_rec);
- ut_ad(int{PAGE_NEW_SUPREMUM_END + REC_N_NEW_EXTRA_BYTES} <= next_rec);
- ut_ad(static_cast<uint16_t>(next_rec) < srv_page_size);
+ next_free= static_cast<int16_t>(next_free + free_rec);
+ if (UNIV_UNLIKELY(int{PAGE_NEW_SUPREMUM_END + REC_N_NEW_EXTRA_BYTES} >
+ next_free ||
+ uint16_t(next_free) >= srv_page_size))
+ {
+ if (UNIV_LIKELY_NULL(heap))
+ mem_heap_free(heap);
+ return nullptr;
+ }
}
byte *hdr= my_assume_aligned<4>(&page_zip->data[page_free_f]);
- mach_write_to_2(hdr, static_cast<uint16_t>(next_rec));
+ mach_write_to_2(hdr, static_cast<uint16_t>(next_free));
const byte *const garbage= my_assume_aligned<2>(page_free + 2);
ut_ad(mach_read_from_2(garbage) >= rec_size);
mach_write_to_2(my_assume_aligned<2>(hdr + 2),
@@ -1904,7 +2000,7 @@ too_small:
static_assert(PAGE_GARBAGE == PAGE_FREE + 2, "compatibility");
mtr->memcpy(*cursor->block, page_free, hdr, 4);
- if (!page_is_leaf(cursor->block->frame))
+ if (!page_is_leaf(page))
{
/* Zero out the node pointer of free_rec, in case it will not be
overwritten by insert_rec. */
@@ -1952,26 +2048,26 @@ use_heap:
page_zip_dir_add_slot(cursor->block, index, mtr);
}
+ /* next record after current before the insertion */
+ const rec_t *next_rec = page_rec_get_next_low(cursor->rec, TRUE);
+ if (UNIV_UNLIKELY(!next_rec ||
+ rec_get_status(next_rec) == REC_STATUS_INFIMUM ||
+ rec_get_status(cursor->rec) > REC_STATUS_INFIMUM))
+ return nullptr;
+
/* 3. Create the record */
byte *insert_rec= rec_copy(insert_buf, rec, offsets);
- rec_offs_make_valid(insert_rec, index, page_is_leaf(cursor->block->frame),
- offsets);
+ rec_offs_make_valid(insert_rec, index, page_is_leaf(page), offsets);
/* 4. Insert the record in the linked list of records */
ut_ad(cursor->rec != insert_rec);
-
- /* next record after current before the insertion */
- const rec_t* next_rec = page_rec_get_next_low(cursor->rec, TRUE);
- ut_ad(rec_get_status(cursor->rec) <= REC_STATUS_INFIMUM);
ut_ad(rec_get_status(insert_rec) < REC_STATUS_INFIMUM);
- ut_ad(rec_get_status(next_rec) != REC_STATUS_INFIMUM);
mach_write_to_2(insert_rec - REC_NEXT, static_cast<uint16_t>
(next_rec - insert_rec));
mach_write_to_2(cursor->rec - REC_NEXT, static_cast<uint16_t>
(insert_rec - cursor->rec));
- byte *n_recs= my_assume_aligned<2>(PAGE_N_RECS + PAGE_HEADER +
- cursor->block->frame);
+ byte *n_recs= my_assume_aligned<2>(PAGE_N_RECS + PAGE_HEADER + page);
mtr->write<2>(*cursor->block, n_recs, 1U + mach_read_from_2(n_recs));
memcpy_aligned<2>(&page_zip->data[PAGE_N_RECS + PAGE_HEADER], n_recs, 2);
@@ -1990,7 +2086,7 @@ use_heap:
page_zip->data);
const uint16_t last_insert_rec= mach_read_from_2(last_insert);
ut_ad(!last_insert_rec ||
- rec_get_node_ptr_flag(cursor->block->frame + last_insert_rec) ==
+ rec_get_node_ptr_flag(page + last_insert_rec) ==
rec_get_node_ptr_flag(insert_rec));
mach_write_to_2(last_insert, page_offset(insert_rec));
@@ -2006,15 +2102,14 @@ no_direction:
*dir= PAGE_NO_DIRECTION;
memset(n, 0, 2);
}
- else if (*dir != PAGE_LEFT &&
- cursor->block->frame + last_insert_rec == cursor->rec)
+ else if (*dir != PAGE_LEFT && page + last_insert_rec == cursor->rec)
{
*dir= PAGE_RIGHT;
inc_dir:
mach_write_to_2(n, mach_read_from_2(n) + 1);
}
else if (*dir != PAGE_RIGHT && page_rec_get_next(insert_rec) ==
- cursor->block->frame + last_insert_rec)
+ page + last_insert_rec)
{
*dir= PAGE_LEFT;
goto inc_dir;
@@ -2025,8 +2120,7 @@ inc_dir:
/* Write the header fields in one record. */
mtr->memcpy(*cursor->block,
- my_assume_aligned<8>(PAGE_LAST_INSERT + PAGE_HEADER +
- cursor->block->frame),
+ my_assume_aligned<8>(PAGE_LAST_INSERT + PAGE_HEADER + page),
my_assume_aligned<8>(PAGE_LAST_INSERT + PAGE_HEADER +
page_zip->data),
PAGE_N_RECS - PAGE_LAST_INSERT + 2);
@@ -2034,8 +2128,9 @@ inc_dir:
/* 7. It remains to update the owner record. */
ulint n_owned;
- while (!(n_owned = rec_get_n_owned_new(next_rec)))
- next_rec= page_rec_get_next_low(next_rec, true);
+ while (!(n_owned= rec_get_n_owned_new(next_rec)))
+ if (!(next_rec= page_rec_get_next_low(next_rec, true)))
+ return nullptr;
rec_set_bit_field_1(const_cast<rec_t*>(next_rec), n_owned + 1,
REC_NEW_N_OWNED, REC_N_OWNED_MASK, REC_N_OWNED_SHIFT);
@@ -2046,8 +2141,15 @@ inc_dir:
record. If the number exceeds PAGE_DIR_SLOT_MAX_N_OWNED,
we have to split the corresponding directory slot in two. */
if (UNIV_UNLIKELY(n_owned == PAGE_DIR_SLOT_MAX_N_OWNED))
- page_zip_dir_split_slot(cursor->block,
- page_dir_find_owner_slot(next_rec), mtr);
+ {
+ const ulint owner= page_dir_find_owner_slot(next_rec);
+ if (UNIV_UNLIKELY(owner == ULINT_UNDEFINED))
+ {
+ page_cur_directory_corrupted(*cursor->block, *index);
+ return nullptr;
+ }
+ page_zip_dir_split_slot(cursor->block, owner, mtr);
+ }
page_zip_write_rec(cursor->block, insert_rec, index, offsets, 1, mtr);
return insert_rec;
@@ -2061,13 +2163,13 @@ inc_dir:
static void page_mem_free(const buf_block_t &block, rec_t *rec,
size_t data_size, size_t extra_size)
{
- ut_ad(page_align(rec) == block.frame);
+ ut_ad(page_align(rec) == block.page.frame);
ut_ad(!block.page.zip.data);
- const rec_t *free= page_header_get_ptr(block.frame, PAGE_FREE);
+ const rec_t *free= page_header_get_ptr(block.page.frame, PAGE_FREE);
- const uint16_t n_heap= uint16_t(page_header_get_field(block.frame,
+ const uint16_t n_heap= uint16_t(page_header_get_field(block.page.frame,
PAGE_N_HEAP) - 1);
- ut_ad(page_get_n_recs(block.frame) < (n_heap & 0x7fff));
+ ut_ad(page_get_n_recs(block.page.frame) < (n_heap & 0x7fff));
const bool deleting_top= n_heap == ((n_heap & 0x8000)
? (rec_get_heap_no_new(rec) | 0x8000)
: rec_get_heap_no_old(rec));
@@ -2075,7 +2177,7 @@ static void page_mem_free(const buf_block_t &block, rec_t *rec,
if (deleting_top)
{
byte *page_heap_top= my_assume_aligned<2>(PAGE_HEAP_TOP + PAGE_HEADER +
- block.frame);
+ block.page.frame);
const uint16_t heap_top= mach_read_from_2(page_heap_top);
const size_t extra_savings= heap_top - page_offset(rec + data_size);
ut_ad(extra_savings < heap_top);
@@ -2088,7 +2190,7 @@ static void page_mem_free(const buf_block_t &block, rec_t *rec,
if (extra_savings)
{
byte *page_garbage= my_assume_aligned<2>(PAGE_GARBAGE + PAGE_HEADER +
- block.frame);
+ block.page.frame);
uint16_t garbage= mach_read_from_2(page_garbage);
ut_ad(garbage >= extra_savings);
mach_write_to_2(page_garbage, garbage - extra_savings);
@@ -2097,17 +2199,17 @@ static void page_mem_free(const buf_block_t &block, rec_t *rec,
else
{
byte *page_free= my_assume_aligned<2>(PAGE_FREE + PAGE_HEADER +
- block.frame);
+ block.page.frame);
byte *page_garbage= my_assume_aligned<2>(PAGE_GARBAGE + PAGE_HEADER +
- block.frame);
+ block.page.frame);
mach_write_to_2(page_free, page_offset(rec));
mach_write_to_2(page_garbage, mach_read_from_2(page_garbage) +
extra_size + data_size);
}
- memset_aligned<2>(PAGE_LAST_INSERT + PAGE_HEADER + block.frame, 0, 2);
+ memset_aligned<2>(PAGE_LAST_INSERT + PAGE_HEADER + block.page.frame, 0, 2);
byte *page_n_recs= my_assume_aligned<2>(PAGE_N_RECS + PAGE_HEADER +
- block.frame);
+ block.page.frame);
mach_write_to_2(page_n_recs, mach_read_from_2(page_n_recs) - 1);
const byte* const end= rec + data_size;
@@ -2117,7 +2219,7 @@ static void page_mem_free(const buf_block_t &block, rec_t *rec,
uint16_t next= free
? ((n_heap & 0x8000)
? static_cast<uint16_t>(free - rec)
- : static_cast<uint16_t>(free - block.frame))
+ : static_cast<uint16_t>(free - block.page.frame))
: uint16_t{0};
mach_write_to_2(rec - REC_NEXT, next);
}
@@ -2134,7 +2236,6 @@ void
page_cur_delete_rec(
/*================*/
page_cur_t* cursor, /*!< in/out: a page cursor */
- const dict_index_t* index, /*!< in: record descriptor */
const rec_offs* offsets,/*!< in: rec_get_offsets(
cursor->rec, index) */
mtr_t* mtr) /*!< in/out: mini-transaction */
@@ -2143,24 +2244,26 @@ page_cur_delete_rec(
rec_t* current_rec;
rec_t* prev_rec = NULL;
rec_t* next_rec;
- ulint cur_slot_no;
ulint cur_n_owned;
rec_t* rec;
/* page_zip_validate() will fail here when
btr_cur_pessimistic_delete() invokes btr_set_min_rec_mark().
- Then, both "page_zip" and "block->frame" would have the min-rec-mark
- set on the smallest user record, but "block->frame" would additionally
- have it set on the smallest-but-one record. Because sloppy
+ Then, both "page_zip" and "block->page.frame" would have the
+ min-rec-mark set on the smallest user record, but
+ "block->page.frame" would additionally have it set on the
+ smallest-but-one record. Because sloppy
page_zip_validate_low() only ignores min-rec-flag differences
in the smallest user record, it cannot be used here either. */
current_rec = cursor->rec;
+ const dict_index_t* const index = cursor->index;
buf_block_t* const block = cursor->block;
ut_ad(rec_offs_validate(current_rec, index, offsets));
- ut_ad(!!page_is_comp(block->frame) == index->table->not_redundant());
- ut_ad(fil_page_index_page_check(block->frame));
- ut_ad(mach_read_from_8(PAGE_HEADER + PAGE_INDEX_ID + block->frame)
+ ut_ad(!!page_is_comp(block->page.frame)
+ == index->table->not_redundant());
+ ut_ad(fil_page_index_page_check(block->page.frame));
+ ut_ad(mach_read_from_8(PAGE_HEADER + PAGE_INDEX_ID + block->page.frame)
== index->id
|| mtr->is_inside_ibuf());
ut_ad(mtr->is_named_space(index->table->space));
@@ -2168,26 +2271,32 @@ page_cur_delete_rec(
/* The record must not be the supremum or infimum record. */
ut_ad(page_rec_is_user_rec(current_rec));
- if (page_get_n_recs(block->frame) == 1
+ if (page_get_n_recs(block->page.frame) == 1
&& !rec_is_alter_metadata(current_rec, *index)) {
/* Empty the page. */
- ut_ad(page_is_leaf(block->frame));
+ ut_ad(page_is_leaf(block->page.frame));
/* Usually, this should be the root page,
and the whole index tree should become empty.
However, this could also be a call in
btr_cur_pessimistic_update() to delete the only
record in the page and to insert another one. */
- page_cur_move_to_next(cursor);
- ut_ad(page_cur_is_after_last(cursor));
+ ut_ad(page_rec_is_supremum(page_rec_get_next(cursor->rec)));
+ page_cur_set_after_last(block, cursor);
page_create_empty(page_cur_get_block(cursor),
const_cast<dict_index_t*>(index), mtr);
return;
}
/* Save to local variables some data associated with current_rec */
- cur_slot_no = page_dir_find_owner_slot(current_rec);
- ut_ad(cur_slot_no > 0);
- cur_dir_slot = page_dir_get_nth_slot(block->frame, cur_slot_no);
+ ulint cur_slot_no = page_dir_find_owner_slot(current_rec);
+
+ if (UNIV_UNLIKELY(!cur_slot_no || cur_slot_no == ULINT_UNDEFINED)) {
+ /* Avoid crashing due to a corrupted page. */
+ page_cur_directory_corrupted(*block, *index);
+ return;
+ }
+
+ cur_dir_slot = page_dir_get_nth_slot(block->page.frame, cur_slot_no);
cur_n_owned = page_dir_slot_get_n_owned(cur_dir_slot);
/* The page gets invalid for btr_pcur_restore_pos().
@@ -2207,11 +2316,16 @@ page_cur_delete_rec(
while (current_rec != rec) {
prev_rec = rec;
- rec = page_rec_get_next(rec);
+ if (!(rec = page_rec_get_next(rec))) {
+ /* Avoid crashing due to a corrupted page. */
+ return;
+ }
}
- page_cur_move_to_next(cursor);
- next_rec = cursor->rec;
+ if (!(next_rec = page_cur_move_to_next(cursor))) {
+ /* Avoid crashing due to a corrupted page. */
+ return;
+ }
/* Remove the record from the linked list of records */
/* If the deleted record is pointed to by a dir slot, update the
@@ -2227,7 +2341,7 @@ page_cur_delete_rec(
(page_dir_slot_get_rec(cur_dir_slot));
if (UNIV_LIKELY_NULL(block->page.zip.data)) {
- ut_ad(page_is_comp(block->frame));
+ ut_ad(page_is_comp(block->page.frame));
if (current_rec == slot_rec) {
page_zip_rec_set_owned(block, prev_rec, 1, mtr);
page_zip_rec_set_owned(block, slot_rec, 0, mtr);
@@ -2246,7 +2360,7 @@ page_cur_delete_rec(
page_header_reset_last_insert(block, mtr);
page_zip_dir_delete(block, rec, index, offsets,
- page_header_get_ptr(block->frame,
+ page_header_get_ptr(block->page.frame,
PAGE_FREE),
mtr);
if (cur_n_owned <= PAGE_DIR_SLOT_MIN_N_OWNED) {
@@ -2263,7 +2377,7 @@ page_cur_delete_rec(
const size_t data_size = rec_offs_data_size(offsets);
const size_t extra_size = rec_offs_extra_size(offsets);
- if (page_is_comp(block->frame)) {
+ if (page_is_comp(block->page.frame)) {
mtr->page_delete(*block, page_offset(prev_rec)
- PAGE_NEW_INFIMUM,
extra_size - REC_N_NEW_EXTRA_BYTES,
@@ -2292,9 +2406,9 @@ page_cur_delete_rec(
page_dir_balance_slot(*block, cur_slot_no);
}
- ut_ad(page_is_comp(block->frame)
- ? page_simple_validate_new(block->frame)
- : page_simple_validate_old(block->frame));
+ ut_ad(page_is_comp(block->page.frame)
+ ? page_simple_validate_new(block->page.frame)
+ : page_simple_validate_old(block->page.frame));
}
/** Apply a INSERT_HEAP_REDUNDANT or INSERT_REUSE_REDUNDANT record that was
@@ -2313,18 +2427,17 @@ bool page_apply_insert_redundant(const buf_block_t &block, bool reuse,
size_t hdr_c, size_t data_c,
const void *data, size_t data_len)
{
- const uint16_t n_slots= page_dir_get_n_slots(block.frame);
- byte *page_n_heap= my_assume_aligned<2>(PAGE_N_HEAP + PAGE_HEADER +
- block.frame);
+ page_t * const page= block.page.frame;
+ const uint16_t n_slots= page_dir_get_n_slots(page);
+ byte *page_n_heap= my_assume_aligned<2>(PAGE_N_HEAP + PAGE_HEADER + page);
const uint16_t h= mach_read_from_2(page_n_heap);
const page_id_t id(block.page.id());
if (UNIV_UNLIKELY(n_slots < 2 || h < n_slots || h < PAGE_HEAP_NO_USER_LOW ||
h >= srv_page_size / REC_N_OLD_EXTRA_BYTES ||
- !fil_page_index_page_check(block.frame) ||
- page_get_page_no(block.frame) != id.page_no() ||
+ !fil_page_index_page_check(page) ||
+ page_get_page_no(page) != id.page_no() ||
mach_read_from_2(my_assume_aligned<2>
- (PAGE_OLD_SUPREMUM - REC_NEXT +
- block.frame))))
+ (PAGE_OLD_SUPREMUM - REC_NEXT + page))))
{
corrupted:
ib::error() << (reuse
@@ -2336,19 +2449,19 @@ corrupted:
return true;
}
- byte * const last_slot= page_dir_get_nth_slot(block.frame, n_slots - 1);
+ byte * const last_slot= page_dir_get_nth_slot(page, n_slots - 1);
byte * const page_heap_top= my_assume_aligned<2>
- (PAGE_HEAP_TOP + PAGE_HEADER + block.frame);
- const byte *const heap_bot= &block.frame[PAGE_OLD_SUPREMUM_END];
- byte *heap_top= block.frame + mach_read_from_2(page_heap_top);
+ (PAGE_HEAP_TOP + PAGE_HEADER + page);
+ const byte *const heap_bot= &page[PAGE_OLD_SUPREMUM_END];
+ byte *heap_top= page + mach_read_from_2(page_heap_top);
if (UNIV_UNLIKELY(heap_bot > heap_top || heap_top > last_slot))
goto corrupted;
if (UNIV_UNLIKELY(mach_read_from_2(last_slot) != PAGE_OLD_SUPREMUM))
goto corrupted;
- if (UNIV_UNLIKELY(mach_read_from_2(page_dir_get_nth_slot(block.frame, 0)) !=
+ if (UNIV_UNLIKELY(mach_read_from_2(page_dir_get_nth_slot(page, 0)) !=
PAGE_OLD_INFIMUM))
goto corrupted;
- rec_t * const prev_rec= block.frame + PAGE_OLD_INFIMUM + prev;
+ rec_t * const prev_rec= page + PAGE_OLD_INFIMUM + prev;
if (!prev);
else if (UNIV_UNLIKELY(heap_bot + (REC_N_OLD_EXTRA_BYTES + 1) > prev_rec ||
prev_rec > heap_top))
@@ -2360,7 +2473,7 @@ corrupted:
goto corrupted;
const ulint pextra_size= REC_N_OLD_EXTRA_BYTES +
(rec_get_1byte_offs_flag(prev_rec) ? pn_fields : pn_fields * 2);
- if (prev_rec == &block.frame[PAGE_OLD_INFIMUM]);
+ if (prev_rec == &page[PAGE_OLD_INFIMUM]);
else if (UNIV_UNLIKELY(prev_rec - pextra_size < heap_bot))
goto corrupted;
if (UNIV_UNLIKELY(hdr_c && prev_rec - hdr_c < heap_bot))
@@ -2368,8 +2481,8 @@ corrupted:
const ulint pdata_size= rec_get_data_size_old(prev_rec);
if (UNIV_UNLIKELY(prev_rec + pdata_size > heap_top))
goto corrupted;
- rec_t * const next_rec= block.frame + mach_read_from_2(prev_rec - REC_NEXT);
- if (next_rec == block.frame + PAGE_OLD_SUPREMUM);
+ rec_t * const next_rec= page + mach_read_from_2(prev_rec - REC_NEXT);
+ if (next_rec == page + PAGE_OLD_SUPREMUM);
else if (UNIV_UNLIKELY(heap_bot + REC_N_OLD_EXTRA_BYTES > next_rec ||
next_rec > heap_top))
goto corrupted;
@@ -2394,8 +2507,8 @@ corrupted:
for (ulint ns= PAGE_DIR_SLOT_MAX_N_OWNED;
!(n_owned= rec_get_n_owned_old(owner_rec)); )
{
- owner_rec= block.frame + mach_read_from_2(owner_rec - REC_NEXT);
- if (owner_rec == &block.frame[PAGE_OLD_SUPREMUM]);
+ owner_rec= page + mach_read_from_2(owner_rec - REC_NEXT);
+ if (owner_rec == &page[PAGE_OLD_SUPREMUM]);
else if (UNIV_UNLIKELY(heap_bot + REC_N_OLD_EXTRA_BYTES > owner_rec ||
owner_rec > heap_top))
goto corrupted;
@@ -2409,10 +2522,10 @@ corrupted:
goto corrupted;
else
{
- mach_write_to_2(insert_buf, owner_rec - block.frame);
+ mach_write_to_2(insert_buf, owner_rec - page);
static_assert(PAGE_DIR_SLOT_SIZE == 2, "compatibility");
const page_dir_slot_t * const first_slot=
- page_dir_get_nth_slot(block.frame, 0);
+ page_dir_get_nth_slot(page, 0);
while (memcmp_aligned<2>(owner_slot, insert_buf, 2))
if ((owner_slot+= 2) == first_slot)
@@ -2441,8 +2554,8 @@ corrupted:
if (reuse)
{
byte *page_free= my_assume_aligned<2>(PAGE_FREE + PAGE_HEADER +
- block.frame);
- rec_t *free_rec= block.frame + mach_read_from_2(page_free);
+ page);
+ rec_t *free_rec= page + mach_read_from_2(page_free);
if (UNIV_UNLIKELY(heap_bot + REC_N_OLD_EXTRA_BYTES > free_rec ||
free_rec > heap_top))
goto corrupted;
@@ -2461,9 +2574,9 @@ corrupted:
fextra_size + fdata_size))
goto corrupted;
buf= free_rec - fextra_size;
- const rec_t *const next_free= block.frame +
+ const rec_t *const next_free= page +
mach_read_from_2(free_rec - REC_NEXT);
- if (next_free == block.frame);
+ if (next_free == page);
else if (UNIV_UNLIKELY(next_free < &heap_bot[REC_N_OLD_EXTRA_BYTES + 1] ||
heap_top < next_free))
goto corrupted;
@@ -2487,11 +2600,11 @@ corrupted:
ut_ad(data_size - data_c == data_len - (extra_size - hdr_c));
byte *page_last_insert= my_assume_aligned<2>(PAGE_LAST_INSERT + PAGE_HEADER +
- block.frame);
+ page);
const uint16_t last_insert= mach_read_from_2(page_last_insert);
memcpy(buf, insert_buf, extra_size);
buf+= extra_size;
- mach_write_to_2(page_last_insert, buf - block.frame);
+ mach_write_to_2(page_last_insert, buf - page);
memcpy(prev_rec - REC_NEXT, page_last_insert, 2);
memcpy(buf, prev_rec, data_c);
memcpy(buf + data_c, static_cast<const byte*>(data) + (extra_size - hdr_c),
@@ -2500,25 +2613,25 @@ corrupted:
REC_N_OWNED_MASK, REC_N_OWNED_SHIFT);
/* Update PAGE_DIRECTION_B, PAGE_N_DIRECTION if needed */
- if (block.frame[FIL_PAGE_TYPE + 1] != byte(FIL_PAGE_RTREE))
+ if (page[FIL_PAGE_TYPE + 1] != byte(FIL_PAGE_RTREE))
{
- byte *dir= &block.frame[PAGE_DIRECTION_B + PAGE_HEADER];
+ byte *dir= &page[PAGE_DIRECTION_B + PAGE_HEADER];
byte *n_dir= my_assume_aligned<2>
- (&block.frame[PAGE_N_DIRECTION + PAGE_HEADER]);
+ (&page[PAGE_N_DIRECTION + PAGE_HEADER]);
if (UNIV_UNLIKELY(!last_insert))
{
no_direction:
*dir= static_cast<byte>((*dir & ~((1U << 3) - 1)) | PAGE_NO_DIRECTION);
memset(n_dir, 0, 2);
}
- else if (block.frame + last_insert == prev_rec &&
+ else if (page + last_insert == prev_rec &&
(*dir & ((1U << 3) - 1)) != PAGE_LEFT)
{
*dir= static_cast<byte>((*dir & ~((1U << 3) - 1)) | PAGE_RIGHT);
inc_dir:
mach_write_to_2(n_dir, mach_read_from_2(n_dir) + 1);
}
- else if (next_rec == block.frame + last_insert &&
+ else if (next_rec == page + last_insert &&
(*dir & ((1U << 3) - 1)) != PAGE_RIGHT)
{
*dir= static_cast<byte>((*dir & ~((1U << 3) - 1)) | PAGE_LEFT);
@@ -2529,14 +2642,13 @@ inc_dir:
}
/* Update PAGE_N_RECS. */
- byte *page_n_recs= my_assume_aligned<2>(PAGE_N_RECS + PAGE_HEADER +
- block.frame);
+ byte *page_n_recs= my_assume_aligned<2>(PAGE_N_RECS + PAGE_HEADER + page);
mach_write_to_2(page_n_recs, mach_read_from_2(page_n_recs) + 1);
if (UNIV_UNLIKELY(n_owned == PAGE_DIR_SLOT_MAX_N_OWNED))
- page_dir_split_slot(block, owner_slot);
- ut_ad(page_simple_validate_old(block.frame));
+ return page_dir_split_slot(block, owner_slot);
+ ut_ad(page_simple_validate_old(page));
return false;
}
@@ -2557,21 +2669,20 @@ bool page_apply_insert_dynamic(const buf_block_t &block, bool reuse,
size_t hdr_c, size_t data_c,
const void *data, size_t data_len)
{
- const uint16_t n_slots= page_dir_get_n_slots(block.frame);
- byte *page_n_heap= my_assume_aligned<2>(PAGE_N_HEAP + PAGE_HEADER +
- block.frame);
+ page_t * const page= block.page.frame;
+ const uint16_t n_slots= page_dir_get_n_slots(page);
+ byte *page_n_heap= my_assume_aligned<2>(PAGE_N_HEAP + PAGE_HEADER + page);
ulint h= mach_read_from_2(page_n_heap);
const page_id_t id(block.page.id());
if (UNIV_UNLIKELY(n_slots < 2 || h < (PAGE_HEAP_NO_USER_LOW | 0x8000) ||
(h & 0x7fff) >= srv_page_size / REC_N_NEW_EXTRA_BYTES ||
(h & 0x7fff) < n_slots ||
- !fil_page_index_page_check(block.frame) ||
- page_get_page_no(block.frame) != id.page_no() ||
+ !fil_page_index_page_check(page) ||
+ page_get_page_no(page) != id.page_no() ||
mach_read_from_2(my_assume_aligned<2>
- (PAGE_NEW_SUPREMUM - REC_NEXT +
- block.frame)) ||
+ (PAGE_NEW_SUPREMUM - REC_NEXT + page)) ||
((enc_hdr_l & REC_STATUS_INSTANT) &&
- !page_is_leaf(block.frame)) ||
+ !page_is_leaf(page)) ||
(enc_hdr_l >> 3) > data_len))
{
corrupted:
@@ -2584,42 +2695,42 @@ corrupted:
return true;
}
- byte * const last_slot= page_dir_get_nth_slot(block.frame, n_slots - 1);
+ byte * const last_slot= page_dir_get_nth_slot(page, n_slots - 1);
byte * const page_heap_top= my_assume_aligned<2>
- (PAGE_HEAP_TOP + PAGE_HEADER + block.frame);
- const byte *const heap_bot= &block.frame[PAGE_NEW_SUPREMUM_END];
- byte *heap_top= block.frame + mach_read_from_2(page_heap_top);
+ (PAGE_HEAP_TOP + PAGE_HEADER + page);
+ const byte *const heap_bot= &page[PAGE_NEW_SUPREMUM_END];
+ byte *heap_top= page + mach_read_from_2(page_heap_top);
if (UNIV_UNLIKELY(heap_bot > heap_top || heap_top > last_slot))
goto corrupted;
if (UNIV_UNLIKELY(mach_read_from_2(last_slot) != PAGE_NEW_SUPREMUM))
goto corrupted;
- if (UNIV_UNLIKELY(mach_read_from_2(page_dir_get_nth_slot(block.frame, 0)) !=
+ if (UNIV_UNLIKELY(mach_read_from_2(page_dir_get_nth_slot(page, 0)) !=
PAGE_NEW_INFIMUM))
goto corrupted;
uint16_t n= static_cast<uint16_t>(PAGE_NEW_INFIMUM + prev);
- rec_t *prev_rec= block.frame + n;
+ rec_t *prev_rec= page + n;
n= static_cast<uint16_t>(n + mach_read_from_2(prev_rec - REC_NEXT));
if (!prev);
else if (UNIV_UNLIKELY(heap_bot + REC_N_NEW_EXTRA_BYTES > prev_rec ||
prev_rec > heap_top))
goto corrupted;
- rec_t * const next_rec= block.frame + n;
- if (next_rec == block.frame + PAGE_NEW_SUPREMUM);
+ rec_t * const next_rec= page + n;
+ if (next_rec == page + PAGE_NEW_SUPREMUM);
else if (UNIV_UNLIKELY(heap_bot + REC_N_NEW_EXTRA_BYTES > next_rec ||
next_rec > heap_top))
goto corrupted;
ulint n_owned;
rec_t *owner_rec= next_rec;
- n= static_cast<uint16_t>(next_rec - block.frame);
+ n= static_cast<uint16_t>(next_rec - page);
for (ulint ns= PAGE_DIR_SLOT_MAX_N_OWNED;
!(n_owned= rec_get_n_owned_new(owner_rec)); )
{
n= static_cast<uint16_t>(n + mach_read_from_2(owner_rec - REC_NEXT));
- owner_rec= block.frame + n;
+ owner_rec= page + n;
if (n == PAGE_NEW_SUPREMUM);
else if (UNIV_UNLIKELY(heap_bot + REC_N_NEW_EXTRA_BYTES > owner_rec ||
owner_rec > heap_top))
@@ -2636,9 +2747,9 @@ corrupted:
{
static_assert(PAGE_DIR_SLOT_SIZE == 2, "compatibility");
alignas(2) byte slot_buf[2];
- mach_write_to_2(slot_buf, owner_rec - block.frame);
+ mach_write_to_2(slot_buf, owner_rec - page);
const page_dir_slot_t * const first_slot=
- page_dir_get_nth_slot(block.frame, 0);
+ page_dir_get_nth_slot(page, 0);
while (memcmp_aligned<2>(owner_slot, slot_buf, 2))
if ((owner_slot+= 2) == first_slot)
@@ -2652,9 +2763,8 @@ corrupted:
byte *buf;
if (reuse)
{
- byte *page_free= my_assume_aligned<2>(PAGE_FREE + PAGE_HEADER +
- block.frame);
- rec_t *free_rec= block.frame + mach_read_from_2(page_free);
+ byte *page_free= my_assume_aligned<2>(PAGE_FREE + PAGE_HEADER + page);
+ rec_t *free_rec= page + mach_read_from_2(page_free);
if (UNIV_UNLIKELY(heap_bot + REC_N_NEW_EXTRA_BYTES > free_rec ||
free_rec > heap_top))
goto corrupted;
@@ -2672,9 +2782,9 @@ corrupted:
goto corrupted;
if ((n= mach_read_from_2(free_rec - REC_NEXT)) != 0)
{
- n= static_cast<uint16_t>(n + free_rec - block.frame);
+ n= static_cast<uint16_t>(n + free_rec - page);
if (UNIV_UNLIKELY(n < PAGE_NEW_SUPREMUM_END + REC_N_NEW_EXTRA_BYTES ||
- heap_top < block.frame + n))
+ heap_top < page + n))
goto corrupted;
}
mach_write_to_2(page_free, n);
@@ -2705,7 +2815,7 @@ corrupted:
h= (h & ((1U << 5) - 1)) << 3;
static_assert(REC_STATUS_ORDINARY == 0, "compatibility");
static_assert(REC_STATUS_INSTANT == 4, "compatibility");
- if (page_is_leaf(block.frame))
+ if (page_is_leaf(page))
h|= enc_hdr_l & REC_STATUS_INSTANT;
else
{
@@ -2717,9 +2827,9 @@ corrupted:
buf+= REC_NEXT;
mach_write_to_2(buf - REC_NEXT, static_cast<uint16_t>(next_rec - buf));
byte *page_last_insert= my_assume_aligned<2>(PAGE_LAST_INSERT + PAGE_HEADER +
- block.frame);
+ page);
const uint16_t last_insert= mach_read_from_2(page_last_insert);
- mach_write_to_2(page_last_insert, buf - block.frame);
+ mach_write_to_2(page_last_insert, buf - page);
mach_write_to_2(prev_rec - REC_NEXT, static_cast<uint16_t>(buf - prev_rec));
memcpy(buf, prev_rec, data_c);
buf+= data_c;
@@ -2729,25 +2839,24 @@ corrupted:
REC_N_OWNED_MASK, REC_N_OWNED_SHIFT);
/* Update PAGE_DIRECTION_B, PAGE_N_DIRECTION if needed */
- if (block.frame[FIL_PAGE_TYPE + 1] != byte(FIL_PAGE_RTREE))
+ if (page[FIL_PAGE_TYPE + 1] != byte(FIL_PAGE_RTREE))
{
- byte *dir= &block.frame[PAGE_DIRECTION_B + PAGE_HEADER];
- byte *n_dir= my_assume_aligned<2>
- (&block.frame[PAGE_N_DIRECTION + PAGE_HEADER]);
+ byte *dir= &page[PAGE_DIRECTION_B + PAGE_HEADER];
+ byte *n_dir= my_assume_aligned<2>(&page[PAGE_N_DIRECTION + PAGE_HEADER]);
if (UNIV_UNLIKELY(!last_insert))
{
no_direction:
*dir= static_cast<byte>((*dir & ~((1U << 3) - 1)) | PAGE_NO_DIRECTION);
memset(n_dir, 0, 2);
}
- else if (block.frame + last_insert == prev_rec &&
+ else if (page + last_insert == prev_rec &&
(*dir & ((1U << 3) - 1)) != PAGE_LEFT)
{
*dir= static_cast<byte>((*dir & ~((1U << 3) - 1)) | PAGE_RIGHT);
inc_dir:
mach_write_to_2(n_dir, mach_read_from_2(n_dir) + 1);
}
- else if (next_rec == block.frame + last_insert &&
+ else if (next_rec == page + last_insert &&
(*dir & ((1U << 3) - 1)) != PAGE_RIGHT)
{
*dir= static_cast<byte>((*dir & ~((1U << 3) - 1)) | PAGE_LEFT);
@@ -2758,14 +2867,13 @@ inc_dir:
}
/* Update PAGE_N_RECS. */
- byte *page_n_recs= my_assume_aligned<2>(PAGE_N_RECS + PAGE_HEADER +
- block.frame);
+ byte *page_n_recs= my_assume_aligned<2>(PAGE_N_RECS + PAGE_HEADER + page);
mach_write_to_2(page_n_recs, mach_read_from_2(page_n_recs) + 1);
if (UNIV_UNLIKELY(n_owned == PAGE_DIR_SLOT_MAX_N_OWNED))
- page_dir_split_slot(block, owner_slot);
- ut_ad(page_simple_validate_new(block.frame));
+ return page_dir_split_slot(block, owner_slot);
+ ut_ad(page_simple_validate_new(page));
return false;
}
@@ -2776,17 +2884,17 @@ page_cur_delete_rec() for a ROW_FORMAT=REDUNDANT page.
@return whether the operation failed (inconcistency was noticed) */
bool page_apply_delete_redundant(const buf_block_t &block, ulint prev)
{
- const uint16_t n_slots= page_dir_get_n_slots(block.frame);
- ulint n_recs= page_get_n_recs(block.frame);
+ page_t * const page= block.page.frame;
+ const uint16_t n_slots= page_dir_get_n_slots(page);
+ ulint n_recs= page_get_n_recs(page);
const page_id_t id(block.page.id());
if (UNIV_UNLIKELY(!n_recs || n_slots < 2 ||
- !fil_page_index_page_check(block.frame) ||
- page_get_page_no(block.frame) != id.page_no() ||
+ !fil_page_index_page_check(page) ||
+ page_get_page_no(page) != id.page_no() ||
mach_read_from_2(my_assume_aligned<2>
- (PAGE_OLD_SUPREMUM - REC_NEXT +
- block.frame)) ||
- page_is_comp(block.frame)))
+ (PAGE_OLD_SUPREMUM - REC_NEXT + page)) ||
+ page_is_comp(page)))
{
corrupted:
ib::error() << "Not applying DELETE_ROW_FORMAT_REDUNDANT"
@@ -2794,12 +2902,12 @@ corrupted:
return true;
}
- byte *slot= page_dir_get_nth_slot(block.frame, n_slots - 1);
- rec_t *prev_rec= block.frame + PAGE_OLD_INFIMUM + prev;
+ byte *slot= page_dir_get_nth_slot(page, n_slots - 1);
+ rec_t *prev_rec= page + PAGE_OLD_INFIMUM + prev;
if (UNIV_UNLIKELY(prev_rec > slot))
goto corrupted;
uint16_t n= mach_read_from_2(prev_rec - REC_NEXT);
- rec_t *rec= block.frame + n;
+ rec_t *rec= page + n;
if (UNIV_UNLIKELY(n < PAGE_OLD_SUPREMUM_END + REC_N_OLD_EXTRA_BYTES ||
slot < rec))
goto corrupted;
@@ -2811,7 +2919,7 @@ corrupted:
goto corrupted;
n= mach_read_from_2(rec - REC_NEXT);
- rec_t *next= block.frame + n;
+ rec_t *next= page + n;
if (n == PAGE_OLD_SUPREMUM);
else if (UNIV_UNLIKELY(n < PAGE_OLD_SUPREMUM_END + REC_N_OLD_EXTRA_BYTES ||
slot < next))
@@ -2822,7 +2930,7 @@ corrupted:
for (ulint i= n_recs; !(slot_owned= rec_get_n_owned_old(s)); )
{
n= mach_read_from_2(s - REC_NEXT);
- s= block.frame + n;
+ s= page + n;
if (n == PAGE_OLD_SUPREMUM);
else if (UNIV_UNLIKELY(n < PAGE_OLD_SUPREMUM_END + REC_N_OLD_EXTRA_BYTES ||
slot < s))
@@ -2834,9 +2942,9 @@ corrupted:
/* The first slot is always pointing to the infimum record.
Find the directory slot pointing to s. */
- const byte * const first_slot= block.frame + srv_page_size - (PAGE_DIR + 2);
+ const byte * const first_slot= page + srv_page_size - (PAGE_DIR + 2);
alignas(2) byte slot_offs[2];
- mach_write_to_2(slot_offs, s - block.frame);
+ mach_write_to_2(slot_offs, s - page);
static_assert(PAGE_DIR_SLOT_SIZE == 2, "compatibility");
while (memcmp_aligned<2>(slot, slot_offs, 2))
@@ -2846,7 +2954,7 @@ corrupted:
if (rec == s)
{
s= prev_rec;
- mach_write_to_2(slot, s - block.frame);
+ mach_write_to_2(slot, s - page);
}
memcpy(prev_rec - REC_NEXT, rec - REC_NEXT, 2);
@@ -2858,7 +2966,7 @@ corrupted:
if (slot_owned < PAGE_DIR_SLOT_MIN_N_OWNED)
page_dir_balance_slot(block, (first_slot - slot) / 2);
- ut_ad(page_simple_validate_old(block.frame));
+ ut_ad(page_simple_validate_old(page));
return false;
}
@@ -2872,17 +2980,17 @@ page_cur_delete_rec() for a ROW_FORMAT=COMPACT or DYNAMIC page.
bool page_apply_delete_dynamic(const buf_block_t &block, ulint prev,
size_t hdr_size, size_t data_size)
{
- const uint16_t n_slots= page_dir_get_n_slots(block.frame);
- ulint n_recs= page_get_n_recs(block.frame);
+ page_t * const page= block.page.frame;
+ const uint16_t n_slots= page_dir_get_n_slots(page);
+ ulint n_recs= page_get_n_recs(page);
const page_id_t id(block.page.id());
if (UNIV_UNLIKELY(!n_recs || n_slots < 2 ||
- !fil_page_index_page_check(block.frame) ||
- page_get_page_no(block.frame) != id.page_no() ||
+ !fil_page_index_page_check(page) ||
+ page_get_page_no(page) != id.page_no() ||
mach_read_from_2(my_assume_aligned<2>
- (PAGE_NEW_SUPREMUM - REC_NEXT +
- block.frame)) ||
- !page_is_comp(block.frame)))
+ (PAGE_NEW_SUPREMUM - REC_NEXT + page)) ||
+ !page_is_comp(page)))
{
corrupted:
ib::error() << "Not applying DELETE_ROW_FORMAT_DYNAMIC"
@@ -2890,13 +2998,13 @@ corrupted:
return true;
}
- byte *slot= page_dir_get_nth_slot(block.frame, n_slots - 1);
+ byte *slot= page_dir_get_nth_slot(page, n_slots - 1);
uint16_t n= static_cast<uint16_t>(PAGE_NEW_INFIMUM + prev);
- rec_t *prev_rec= block.frame + n;
+ rec_t *prev_rec= page + n;
if (UNIV_UNLIKELY(prev_rec > slot))
goto corrupted;
n= static_cast<uint16_t>(n + mach_read_from_2(prev_rec - REC_NEXT));
- rec_t *rec= block.frame + n;
+ rec_t *rec= page + n;
if (UNIV_UNLIKELY(n < PAGE_NEW_SUPREMUM_END + REC_N_NEW_EXTRA_BYTES ||
slot < rec))
goto corrupted;
@@ -2905,14 +3013,14 @@ corrupted:
slot < rec + data_size))
goto corrupted;
n= static_cast<uint16_t>(n + mach_read_from_2(rec - REC_NEXT));
- rec_t *next= block.frame + n;
+ rec_t *next= page + n;
if (n == PAGE_NEW_SUPREMUM);
else if (UNIV_UNLIKELY(n < PAGE_NEW_SUPREMUM_END + REC_N_NEW_EXTRA_BYTES ||
slot < next))
goto corrupted;
rec_t *s= rec;
- n= static_cast<uint16_t>(rec - block.frame);
+ n= static_cast<uint16_t>(rec - page);
ulint slot_owned;
for (ulint i= n_recs; !(slot_owned= rec_get_n_owned_new(s)); )
{
@@ -2921,7 +3029,7 @@ corrupted:
next > static_cast<uint16_t>(-REC_N_NEW_EXTRA_BYTES)))
goto corrupted;
n= static_cast<uint16_t>(n + next);
- s= block.frame + n;
+ s= page + n;
if (n == PAGE_NEW_SUPREMUM);
else if (UNIV_UNLIKELY(n < PAGE_NEW_SUPREMUM_END + REC_N_NEW_EXTRA_BYTES ||
slot < s))
@@ -2933,9 +3041,9 @@ corrupted:
/* The first slot is always pointing to the infimum record.
Find the directory slot pointing to s. */
- const byte * const first_slot= block.frame + srv_page_size - (PAGE_DIR + 2);
+ const byte * const first_slot= page + srv_page_size - (PAGE_DIR + 2);
alignas(2) byte slot_offs[2];
- mach_write_to_2(slot_offs, s - block.frame);
+ mach_write_to_2(slot_offs, s - page);
static_assert(PAGE_DIR_SLOT_SIZE == 2, "compatibility");
while (memcmp_aligned<2>(slot, slot_offs, 2))
@@ -2945,7 +3053,7 @@ corrupted:
if (rec == s)
{
s= prev_rec;
- mach_write_to_2(slot, s - block.frame);
+ mach_write_to_2(slot, s - page);
}
mach_write_to_2(prev_rec - REC_NEXT, static_cast<uint16_t>(next - prev_rec));
@@ -2957,7 +3065,7 @@ corrupted:
if (slot_owned < PAGE_DIR_SLOT_MIN_N_OWNED)
page_dir_balance_slot(block, (first_slot - slot) / 2);
- ut_ad(page_simple_validate_new(block.frame));
+ ut_ad(page_simple_validate_new(page));
return false;
}
diff --git a/storage/innobase/page/page0page.cc b/storage/innobase/page/page0page.cc
index f5f7d17f2d5..21b291e3a8b 100644
--- a/storage/innobase/page/page0page.cc
+++ b/storage/innobase/page/page0page.cc
@@ -2,7 +2,7 @@
Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved.
Copyright (c) 2012, Facebook Inc.
-Copyright (c) 2017, 2021, MariaDB Corporation.
+Copyright (c) 2017, 2022, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -82,7 +82,8 @@ is 50 x 4 bytes = 200 bytes. */
/***************************************************************//**
Looks for the directory slot which owns the given record.
-@return the directory slot number */
+@return the directory slot number
+@retval ULINT_UNDEFINED on corruption */
ulint
page_dir_find_owner_slot(
/*=====================*/
@@ -98,44 +99,26 @@ page_dir_find_owner_slot(
if (page_is_comp(page)) {
while (rec_get_n_owned_new(r) == 0) {
- r = rec_get_next_ptr_const(r, TRUE);
- ut_ad(r >= page + PAGE_NEW_SUPREMUM);
- ut_ad(r < page + (srv_page_size - PAGE_DIR));
+ r = page_rec_get_next_low(r, true);
+ if (UNIV_UNLIKELY(r < page + PAGE_NEW_SUPREMUM
+ || r >= slot)) {
+ return ULINT_UNDEFINED;
+ }
}
} else {
while (rec_get_n_owned_old(r) == 0) {
- r = rec_get_next_ptr_const(r, FALSE);
- ut_ad(r >= page + PAGE_OLD_SUPREMUM);
- ut_ad(r < page + (srv_page_size - PAGE_DIR));
+ r = page_rec_get_next_low(r, false);
+ if (UNIV_UNLIKELY(r < page + PAGE_OLD_SUPREMUM
+ || r >= slot)) {
+ return ULINT_UNDEFINED;
+ }
}
}
- uint16 rec_offs_bytes = mach_encode_2(ulint(r - page));
-
- while (UNIV_LIKELY(*(uint16*) slot != rec_offs_bytes)) {
-
+ while (UNIV_LIKELY(*(uint16*) slot
+ != mach_encode_2(ulint(r - page)))) {
if (UNIV_UNLIKELY(slot == first_slot)) {
- ib::error() << "Probable data corruption on page "
- << page_get_page_no(page)
- << ". Original record on that page;";
-
- if (page_is_comp(page)) {
- fputs("(compact record)", stderr);
- } else {
- rec_print_old(stderr, rec);
- }
-
- ib::error() << "Cannot find the dir slot for this"
- " record on that page;";
-
- if (page_is_comp(page)) {
- fputs("(compact record)", stderr);
- } else {
- rec_print_old(stderr, page
- + mach_decode_2(rec_offs_bytes));
- }
-
- ut_error;
+ return ULINT_UNDEFINED;
}
slot += PAGE_DIR_SLOT_SIZE;
@@ -201,7 +184,7 @@ page_set_max_trx_id(
ut_ad(!page_zip || page_zip == &block->page.zip);
static_assert((PAGE_HEADER + PAGE_MAX_TRX_ID) % 8 == 0, "alignment");
byte *max_trx_id= my_assume_aligned<8>(PAGE_MAX_TRX_ID +
- PAGE_HEADER + block->frame);
+ PAGE_HEADER + block->page.frame);
mtr->write<8>(*block, max_trx_id, trx_id);
if (UNIV_LIKELY_NULL(page_zip))
@@ -228,7 +211,7 @@ page_set_autoinc(
MTR_MEMO_PAGE_SX_FIX));
byte *field= my_assume_aligned<8>(PAGE_HEADER + PAGE_ROOT_AUTO_INC +
- block->frame);
+ block->page.frame);
ib_uint64_t old= mach_read_from_8(field);
if (old == autoinc || (old > autoinc && !reset))
return; /* nothing to update */
@@ -283,7 +266,7 @@ void page_create_low(const buf_block_t* block, bool comp)
compile_time_assert(PAGE_BTR_IBUF_FREE_LIST_NODE + FLST_NODE_SIZE
<= PAGE_DATA);
- page = block->frame;
+ page = block->page.frame;
fil_page_set_type(page, FIL_PAGE_INDEX);
@@ -370,13 +353,14 @@ page_create_zip(
page_create_low(block, true);
if (index->is_spatial()) {
- mach_write_to_2(FIL_PAGE_TYPE + block->frame, FIL_PAGE_RTREE);
- memset(block->frame + FIL_RTREE_SPLIT_SEQ_NUM, 0, 8);
+ mach_write_to_2(FIL_PAGE_TYPE + block->page.frame,
+ FIL_PAGE_RTREE);
+ memset(block->page.frame + FIL_RTREE_SPLIT_SEQ_NUM, 0, 8);
memset(block->page.zip.data + FIL_RTREE_SPLIT_SEQ_NUM, 0, 8);
}
- mach_write_to_2(PAGE_HEADER + PAGE_LEVEL + block->frame, level);
- mach_write_to_8(PAGE_HEADER + PAGE_MAX_TRX_ID + block->frame,
+ mach_write_to_2(PAGE_HEADER + PAGE_LEVEL + block->page.frame, level);
+ mach_write_to_8(PAGE_HEADER + PAGE_MAX_TRX_ID + block->page.frame,
max_trx_id);
if (!page_zip_compress(block, index, page_zip_level, mtr)) {
@@ -398,7 +382,7 @@ page_create_empty(
trx_id_t max_trx_id;
page_zip_des_t* page_zip= buf_block_get_page_zip(block);
- ut_ad(fil_page_index_page_check(block->frame));
+ ut_ad(fil_page_index_page_check(block->page.frame));
ut_ad(!index->is_dummy);
ut_ad(block->page.id().space() == index->table->space->id);
@@ -408,12 +392,12 @@ page_create_empty(
for MVCC. */
if (dict_index_is_sec_or_ibuf(index)
&& !index->table->is_temporary()
- && page_is_leaf(block->frame)) {
- max_trx_id = page_get_max_trx_id(block->frame);
+ && page_is_leaf(block->page.frame)) {
+ max_trx_id = page_get_max_trx_id(block->page.frame);
ut_ad(max_trx_id);
} else if (block->page.id().page_no() == index->page) {
/* Preserve PAGE_ROOT_AUTO_INC. */
- max_trx_id = page_get_max_trx_id(block->frame);
+ max_trx_id = page_get_max_trx_id(block->page.frame);
} else {
max_trx_id = 0;
}
@@ -421,7 +405,7 @@ page_create_empty(
if (page_zip) {
ut_ad(!index->table->is_temporary());
page_create_zip(block, index,
- page_header_get_field(block->frame,
+ page_header_get_field(block->page.frame,
PAGE_LEVEL),
max_trx_id, mtr);
} else {
@@ -430,9 +414,10 @@ page_create_empty(
static_assert(((FIL_PAGE_INDEX & 0xff00)
| byte(FIL_PAGE_RTREE))
== FIL_PAGE_RTREE, "compatibility");
- mtr->write<1>(*block, FIL_PAGE_TYPE + 1 + block->frame,
+ mtr->write<1>(*block,
+ FIL_PAGE_TYPE + 1 + block->page.frame,
byte(FIL_PAGE_RTREE));
- if (mach_read_from_8(block->frame
+ if (mach_read_from_8(block->page.frame
+ FIL_RTREE_SPLIT_SEQ_NUM)) {
mtr->memset(block, FIL_RTREE_SPLIT_SEQ_NUM,
8, 0);
@@ -441,7 +426,7 @@ page_create_empty(
if (max_trx_id) {
mtr->write<8>(*block, PAGE_HEADER + PAGE_MAX_TRX_ID
- + block->frame, max_trx_id);
+ + block->page.frame, max_trx_id);
}
}
}
@@ -453,8 +438,10 @@ touch the lock table and max trx id on page or compress the page.
IMPORTANT: The caller will have to update IBUF_BITMAP_FREE
if new_block is a compressed leaf page in a secondary index.
This has to be done either within the same mini-transaction,
-or by invoking ibuf_reset_free_bits() before mtr_commit(). */
-void
+or by invoking ibuf_reset_free_bits() before mtr_commit().
+
+@return error code */
+dberr_t
page_copy_rec_list_end_no_locks(
/*============================*/
buf_block_t* new_block, /*!< in: index page to copy to */
@@ -471,20 +458,24 @@ page_copy_rec_list_end_no_locks(
rec_offs* offsets = offsets_;
rec_offs_init(offsets_);
+ cur1.index = cur2.index = index;
page_cur_position(rec, block, &cur1);
- if (page_cur_is_before_first(&cur1)) {
+ if (page_cur_is_before_first(&cur1) && !page_cur_move_to_next(&cur1)) {
+ return DB_CORRUPTION;
+ }
- page_cur_move_to_next(&cur1);
+ if (UNIV_UNLIKELY(page_is_comp(new_page) != page_rec_is_comp(rec)
+ || mach_read_from_2(new_page + srv_page_size - 10)
+ != ulint(page_is_comp(new_page)
+ ? PAGE_NEW_INFIMUM : PAGE_OLD_INFIMUM))) {
+ return DB_CORRUPTION;
}
- btr_assert_not_corrupted(new_block, index);
- ut_a(page_is_comp(new_page) == page_rec_is_comp(rec));
- ut_a(mach_read_from_2(new_page + srv_page_size - 10) == (ulint)
- (page_is_comp(new_page) ? PAGE_NEW_INFIMUM : PAGE_OLD_INFIMUM));
- const ulint n_core = page_is_leaf(block->frame)
+ const ulint n_core = page_is_leaf(block->page.frame)
? index->n_core_fields : 0;
+ dberr_t err = DB_SUCCESS;
page_cur_set_before_first(new_block, &cur2);
/* Copy records from the original page to the new page */
@@ -493,15 +484,12 @@ page_copy_rec_list_end_no_locks(
rec_t* ins_rec;
offsets = rec_get_offsets(cur1.rec, index, offsets, n_core,
ULINT_UNDEFINED, &heap);
- ins_rec = page_cur_insert_rec_low(&cur2, index,
- cur1.rec, offsets, mtr);
- if (UNIV_UNLIKELY(!ins_rec)) {
- ib::fatal() << "Rec offset " << page_offset(rec)
- << ", cur1 offset " << page_offset(cur1.rec)
- << ", cur2 offset " << page_offset(cur2.rec);
+ ins_rec = page_cur_insert_rec_low(&cur2, cur1.rec, offsets,
+ mtr);
+ if (UNIV_UNLIKELY(!ins_rec || !page_cur_move_to_next(&cur1))) {
+ err = DB_CORRUPTION;
+ break;
}
-
- page_cur_move_to_next(&cur1);
ut_ad(!(rec_get_info_bits(cur1.rec, page_is_comp(new_page))
& REC_INFO_MIN_REC_FLAG));
cur2.rec = ins_rec;
@@ -510,6 +498,8 @@ page_copy_rec_list_end_no_locks(
if (UNIV_LIKELY_NULL(heap)) {
mem_heap_free(heap);
}
+
+ return err;
}
/*************************************************************//**
@@ -520,10 +510,10 @@ The records are copied to the start of the record list on new_page.
IMPORTANT: The caller will have to update IBUF_BITMAP_FREE
if new_block is a compressed leaf page in a secondary index.
This has to be done either within the same mini-transaction,
-or by invoking ibuf_reset_free_bits() before mtr_commit().
+or by invoking ibuf_reset_free_bits() before mtr_t::commit().
-@return pointer to the original successor of the infimum record on
-new_page, or NULL on zip overflow (new_block will be decompressed) */
+@return pointer to the original successor of the infimum record on new_block
+@retval nullptr on ROW_FORMAT=COMPRESSED page overflow */
rec_t*
page_copy_rec_list_end(
/*===================*/
@@ -531,18 +521,22 @@ page_copy_rec_list_end(
buf_block_t* block, /*!< in: index page containing rec */
rec_t* rec, /*!< in: record on page */
dict_index_t* index, /*!< in: record descriptor */
- mtr_t* mtr) /*!< in: mtr */
+ mtr_t* mtr, /*!< in/out: mini-transaction */
+ dberr_t* err) /*!< out: error code */
{
- page_t* new_page = buf_block_get_frame(new_block);
+ page_t* new_page = new_block->page.frame;
page_zip_des_t* new_page_zip = buf_block_get_page_zip(new_block);
- page_t* page = block->frame;
+ page_t* page = block->page.frame;
rec_t* ret = page_rec_get_next(
page_get_infimum_rec(new_page));
ulint num_moved = 0;
- rtr_rec_move_t* rec_move = NULL;
- mem_heap_t* heap = NULL;
ut_ad(page_align(rec) == page);
+ if (UNIV_UNLIKELY(!ret)) {
+ *err = DB_CORRUPTION;
+ return nullptr;
+ }
+
#ifdef UNIV_ZIP_DEBUG
if (new_page_zip) {
page_zip_des_t* page_zip = buf_block_get_page_zip(block);
@@ -568,26 +562,33 @@ page_copy_rec_list_end(
alignas(2) byte h[PAGE_N_DIRECTION + 2 - PAGE_LAST_INSERT];
memcpy_aligned<2>(h, PAGE_HEADER + PAGE_LAST_INSERT + new_page,
sizeof h);
+ mem_heap_t* heap = nullptr;
+ rtr_rec_move_t* rec_move = nullptr;
if (index->is_spatial()) {
ulint max_to_move = page_get_n_recs(
buf_block_get_frame(block));
heap = mem_heap_create(256);
-
- rec_move = static_cast<rtr_rec_move_t*>(
+ rec_move= static_cast<rtr_rec_move_t*>(
mem_heap_alloc(heap, max_to_move * sizeof *rec_move));
-
/* For spatial index, we need to insert recs one by one
to keep recs ordered. */
- rtr_page_copy_rec_list_end_no_locks(new_block,
- block, rec, index,
- heap, rec_move,
- max_to_move,
- &num_moved,
- mtr);
+ *err = rtr_page_copy_rec_list_end_no_locks(new_block,
+ block, rec, index,
+ heap, rec_move,
+ max_to_move,
+ &num_moved,
+ mtr);
} else {
- page_copy_rec_list_end_no_locks(new_block, block, rec,
- index, mtr);
+ *err = page_copy_rec_list_end_no_locks(new_block, block, rec,
+ index, mtr);
+ if (UNIV_UNLIKELY(*err != DB_SUCCESS)) {
+err_exit:
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+ return nullptr;
+ }
if (was_empty) {
mtr->memcpy<mtr_t::MAYBE_NOP>(*new_block, PAGE_HEADER
+ PAGE_LAST_INSERT
@@ -626,40 +627,43 @@ page_copy_rec_list_end(
have at least one predecessor (the predefined
infimum record, or a freshly copied record
that is smaller than "ret"). */
- ut_a(ret_pos > 0);
-
- if (!page_zip_reorganize(new_block, index,
- page_zip_level, mtr)) {
+ if (UNIV_UNLIKELY(!ret_pos
+ || ret_pos == ULINT_UNDEFINED)) {
+ *err = DB_CORRUPTION;
+ goto err_exit;
+ }
+ *err = page_zip_reorganize(new_block, index,
+ page_zip_level, mtr);
+ switch (*err) {
+ case DB_FAIL:
if (!page_zip_decompress(new_page_zip,
new_page, FALSE)) {
ut_error;
}
ut_ad(page_validate(new_page, index));
-
- if (heap) {
- mem_heap_free(heap);
- }
-
- return(NULL);
- } else {
+ /* fall through */
+ default:
+ goto err_exit;
+ case DB_SUCCESS:
/* The page was reorganized:
Seek to ret_pos. */
ret = page_rec_get_nth(new_page, ret_pos);
+ ut_ad(ret);
}
}
}
/* Update the lock table and possible hash index */
- if (dict_table_is_locking_disabled(index->table)) {
- } else if (rec_move && dict_index_is_spatial(index)) {
+ if (!index->has_locking()) {
+ } else if (UNIV_LIKELY_NULL(rec_move)) {
lock_rtr_move_rec_list(new_block, block, rec_move, num_moved);
} else {
lock_move_rec_list_end(new_block, block, rec);
}
- if (heap) {
+ if (UNIV_LIKELY_NULL(heap)) {
mem_heap_free(heap);
}
@@ -678,8 +682,8 @@ if new_block is a compressed leaf page in a secondary index.
This has to be done either within the same mini-transaction,
or by invoking ibuf_reset_free_bits() before mtr_commit().
-@return pointer to the original predecessor of the supremum record on
-new_page, or NULL on zip overflow (new_block will be decompressed) */
+@return pointer to the original predecessor of the supremum record on new_block
+@retval nullptr on ROW_FORMAT=COMPRESSED page overflow */
rec_t*
page_copy_rec_list_start(
/*=====================*/
@@ -687,9 +691,10 @@ page_copy_rec_list_start(
buf_block_t* block, /*!< in: index page containing rec */
rec_t* rec, /*!< in: record on page */
dict_index_t* index, /*!< in: record descriptor */
- mtr_t* mtr) /*!< in: mtr */
+ mtr_t* mtr, /*!< in/out: mini-transaction */
+ dberr_t* err) /*!< out: error code */
{
- ut_ad(page_align(rec) == block->frame);
+ ut_ad(page_align(rec) == block->page.frame);
page_t* new_page = buf_block_get_frame(new_block);
page_zip_des_t* new_page_zip = buf_block_get_page_zip(new_block);
@@ -704,22 +709,32 @@ page_copy_rec_list_start(
rec_offs* offsets = offsets_;
rec_offs_init(offsets_);
+ if (UNIV_UNLIKELY(!ret)) {
+corrupted:
+ *err = DB_CORRUPTION;
+ return nullptr;
+ }
+
/* Here, "ret" may be pointing to a user record or the
predefined infimum record. */
if (page_rec_is_infimum(rec)) {
+ *err = DB_SUCCESS;
return(ret);
}
+ page_cur_set_before_first(block, &cur1);
+ if (UNIV_UNLIKELY(!page_cur_move_to_next(&cur1))) {
+ goto corrupted;
+ }
+
mtr_log_t log_mode = MTR_LOG_NONE;
if (new_page_zip) {
log_mode = mtr_set_log_mode(mtr, MTR_LOG_NONE);
}
- page_cur_set_before_first(block, &cur1);
- page_cur_move_to_next(&cur1);
-
+ cur2.index = index;
page_cur_position(ret, new_block, &cur2);
const ulint n_core = page_rec_is_leaf(rec) ? index->n_core_fields : 0;
@@ -737,21 +752,27 @@ page_copy_rec_list_start(
/* For spatial index, we need to insert recs one by one
to keep recs ordered. */
- rtr_page_copy_rec_list_start_no_locks(new_block,
- block, rec, index, heap,
- rec_move, max_to_move,
- &num_moved, mtr);
+ *err = rtr_page_copy_rec_list_start_no_locks(new_block,
+ block, rec, index,
+ heap, rec_move,
+ max_to_move,
+ &num_moved, mtr);
+ if (*err != DB_SUCCESS) {
+ return nullptr;
+ }
} else {
while (page_cur_get_rec(&cur1) != rec) {
offsets = rec_get_offsets(cur1.rec, index, offsets,
n_core,
ULINT_UNDEFINED, &heap);
- cur2.rec = page_cur_insert_rec_low(&cur2, index,
- cur1.rec, offsets,
- mtr);
- ut_a(cur2.rec);
+ cur2.rec = page_cur_insert_rec_low(&cur2, cur1.rec,
+ offsets, mtr);
+ if (UNIV_UNLIKELY(!cur2.rec
+ || !page_cur_move_to_next(&cur1))) {
+ *err = DB_CORRUPTION;
+ return nullptr;
+ }
- page_cur_move_to_next(&cur1);
ut_ad(!(rec_get_info_bits(cur1.rec,
page_is_comp(new_page))
& REC_INFO_MIN_REC_FLAG));
@@ -767,7 +788,8 @@ page_copy_rec_list_start(
for MVCC. */
if (n_core && !index->is_primary() && !index->table->is_temporary()) {
page_update_max_trx_id(new_block, nullptr,
- page_get_max_trx_id(block->frame), mtr);
+ page_get_max_trx_id(block->page.frame),
+ mtr);
}
if (new_page_zip) {
@@ -778,45 +800,50 @@ page_copy_rec_list_start(
if (!page_zip_compress(new_block, index,
page_zip_level, mtr)) {
- ulint ret_pos;
#ifndef DBUG_OFF
zip_reorganize:
#endif /* DBUG_OFF */
/* Before trying to reorganize the page,
store the number of preceding records on the page. */
- ret_pos = page_rec_get_n_recs_before(ret);
+ ulint ret_pos = page_rec_get_n_recs_before(ret);
/* Before copying, "ret" was the predecessor
of the predefined supremum record. If it was
the predefined infimum record, then it would
still be the infimum, and we would have
ret_pos == 0. */
-
- if (UNIV_UNLIKELY
- (!page_zip_reorganize(new_block, index,
- page_zip_level, mtr))) {
-
+ if (UNIV_UNLIKELY(!ret_pos
+ || ret_pos == ULINT_UNDEFINED)) {
+ *err = DB_CORRUPTION;
+ return nullptr;
+ }
+ *err = page_zip_reorganize(new_block, index,
+ page_zip_level, mtr);
+ switch (*err) {
+ case DB_SUCCESS:
+ ret = page_rec_get_nth(new_page, ret_pos);
+ ut_ad(ret);
+ break;
+ case DB_FAIL:
if (UNIV_UNLIKELY
(!page_zip_decompress(new_page_zip,
new_page, FALSE))) {
ut_error;
}
ut_ad(page_validate(new_page, index));
-
+ /* fall through */
+ default:
if (UNIV_LIKELY_NULL(heap)) {
mem_heap_free(heap);
}
- return(NULL);
+ return nullptr;
}
-
- /* The page was reorganized: Seek to ret_pos. */
- ret = page_rec_get_nth(new_page, ret_pos);
}
}
/* Update the lock table and possible hash index */
- if (dict_table_is_locking_disabled(index->table)) {
+ if (!index->has_locking()) {
} else if (dict_index_is_spatial(index)) {
lock_rtr_move_rec_list(new_block, block, rec_move, num_moved);
} else {
@@ -829,13 +856,14 @@ zip_reorganize:
btr_search_move_or_delete_hash_entries(new_block, block);
+ *err = DB_SUCCESS;
return(ret);
}
/*************************************************************//**
Deletes records from a page from a given record onward, including that record.
The infimum and supremum records are not deleted. */
-void
+dberr_t
page_delete_rec_list_end(
/*=====================*/
rec_t* rec, /*!< in: pointer to record on page */
@@ -848,29 +876,32 @@ page_delete_rec_list_end(
delete, or ULINT_UNDEFINED if not known */
mtr_t* mtr) /*!< in: mtr */
{
+ page_t * const page= block->page.frame;
+
ut_ad(size == ULINT_UNDEFINED || size < srv_page_size);
- ut_ad(page_align(rec) == block->frame);
- ut_ad(index->table->not_redundant() == !!page_is_comp(block->frame));
+ ut_ad(page_align(rec) == page);
+ ut_ad(index->table->not_redundant() == !!page_is_comp(page));
#ifdef UNIV_ZIP_DEBUG
ut_a(!block->page.zip.data ||
- page_zip_validate(&block->page.zip, block->frame, index));
+ page_zip_validate(&block->page.zip, page, index));
#endif /* UNIV_ZIP_DEBUG */
if (page_rec_is_supremum(rec))
{
ut_ad(n_recs == 0 || n_recs == ULINT_UNDEFINED);
/* Nothing to do, there are no records bigger than the page supremum. */
- return;
+ return DB_SUCCESS;
}
- if (page_rec_is_infimum(rec) || n_recs == page_get_n_recs(block->frame) ||
- rec == (page_is_comp(block->frame)
- ? page_rec_get_next_low(block->frame + PAGE_NEW_INFIMUM, 1)
- : page_rec_get_next_low(block->frame + PAGE_OLD_INFIMUM, 0)))
+ if (page_rec_is_infimum(rec) ||
+ n_recs == page_get_n_recs(page) ||
+ rec == (page_is_comp(page)
+ ? page_rec_get_next_low(page + PAGE_NEW_INFIMUM, 1)
+ : page_rec_get_next_low(page + PAGE_OLD_INFIMUM, 0)))
{
/* We are deleting all records. */
page_create_empty(block, index, mtr);
- return;
+ return DB_SUCCESS;
}
#if 0 // FIXME: consider deleting the last record as a special case
@@ -878,14 +909,14 @@ page_delete_rec_list_end(
{
page_cur_t cursor= { index, rec, offsets, block };
page_cur_delete_rec(&cursor, index, offsets, mtr);
- return;
+ return DB_SUCCESS;
}
#endif
/* The page becomes invalid for optimistic searches */
buf_block_modify_clock_inc(block);
- const ulint n_core= page_is_leaf(block->frame) ? index->n_core_fields : 0;
+ const ulint n_core= page_is_leaf(page) ? index->n_core_fields : 0;
mem_heap_t *heap= nullptr;
rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
rec_offs *offsets= offsets_;
@@ -894,29 +925,34 @@ page_delete_rec_list_end(
#if 1 // FIXME: remove this, and write minimal amount of log! */
if (UNIV_LIKELY_NULL(block->page.zip.data))
{
- ut_ad(page_is_comp(block->frame));
+ ut_ad(page_is_comp(page));
do
{
page_cur_t cur;
page_cur_position(rec, block, &cur);
+ cur.index= index;
offsets= rec_get_offsets(rec, index, offsets, n_core,
ULINT_UNDEFINED, &heap);
- rec= rec_get_next_ptr(rec, TRUE);
+ rec= const_cast<rec_t*>(page_rec_get_next_low(rec, true));
#ifdef UNIV_ZIP_DEBUG
- ut_a(page_zip_validate(&block->page.zip, block->frame, index));
+ ut_a(page_zip_validate(&block->page.zip, page, index));
#endif /* UNIV_ZIP_DEBUG */
- page_cur_delete_rec(&cur, index, offsets, mtr);
+ page_cur_delete_rec(&cur, offsets, mtr);
}
while (page_offset(rec) != PAGE_NEW_SUPREMUM);
if (UNIV_LIKELY_NULL(heap))
mem_heap_free(heap);
- return;
+ return DB_SUCCESS;
}
#endif
byte *prev_rec= page_rec_get_prev(rec);
- byte *last_rec= page_rec_get_prev(page_get_supremum_rec(block->frame));
+ if (UNIV_UNLIKELY(!prev_rec))
+ return DB_CORRUPTION;
+ byte *last_rec= page_rec_get_prev(page_get_supremum_rec(page));
+ if (UNIV_UNLIKELY(!last_rec))
+ return DB_CORRUPTION;
// FIXME: consider a special case of shrinking PAGE_HEAP_TOP
@@ -933,7 +969,7 @@ page_delete_rec_list_end(
offsets = rec_get_offsets(rec2, index, offsets, n_core,
ULINT_UNDEFINED, &heap);
ulint s= rec_offs_size(offsets);
- ut_ad(ulint(rec2 - block->frame) + s - rec_offs_extra_size(offsets) <
+ ut_ad(ulint(rec2 - page) + s - rec_offs_extra_size(offsets) <
srv_page_size);
ut_ad(size + s < srv_page_size);
size+= s;
@@ -942,12 +978,15 @@ page_delete_rec_list_end(
if (scrub)
mtr->memset(block, page_offset(rec2), rec_offs_data_size(offsets), 0);
- rec2 = page_rec_get_next(rec2);
+ rec2= page_rec_get_next(rec2);
}
- while (!page_rec_is_supremum(rec2));
+ while (rec2 && !page_rec_is_supremum(rec2));
if (UNIV_LIKELY_NULL(heap))
mem_heap_free(heap);
+
+ if (UNIV_UNLIKELY(!rec))
+ return DB_CORRUPTION;
}
ut_ad(size < srv_page_size);
@@ -957,36 +996,38 @@ page_delete_rec_list_end(
const rec_t *owner_rec= rec;
ulint count= 0;
- if (page_is_comp(block->frame))
+ if (page_is_comp(page))
while (!(n_owned= rec_get_n_owned_new(owner_rec)))
{
count++;
- owner_rec= rec_get_next_ptr_const(owner_rec, TRUE);
+ if (!(owner_rec= page_rec_get_next_low(owner_rec, true)))
+ return DB_CORRUPTION;
}
else
while (!(n_owned= rec_get_n_owned_old(owner_rec)))
{
count++;
- owner_rec= rec_get_next_ptr_const(owner_rec, FALSE);
+ if (!(owner_rec= page_rec_get_next_low(owner_rec, false)))
+ return DB_CORRUPTION;
}
ut_ad(n_owned > count);
n_owned-= count;
slot_index= page_dir_find_owner_slot(owner_rec);
- ut_ad(slot_index > 0);
}
+ if (UNIV_UNLIKELY(!slot_index || slot_index == ULINT_UNDEFINED))
+ return DB_CORRUPTION;
+
mtr->write<2,mtr_t::MAYBE_NOP>(*block, my_assume_aligned<2>
- (PAGE_N_DIR_SLOTS + PAGE_HEADER +
- block->frame), slot_index + 1);
+ (PAGE_N_DIR_SLOTS + PAGE_HEADER + page),
+ slot_index + 1);
mtr->write<2,mtr_t::MAYBE_NOP>(*block, my_assume_aligned<2>
- (PAGE_LAST_INSERT + PAGE_HEADER +
- block->frame), 0U);
+ (PAGE_LAST_INSERT + PAGE_HEADER + page), 0U);
/* Catenate the deleted chain segment to the page free list */
alignas(4) byte page_header[4];
- byte *page_free= my_assume_aligned<4>(PAGE_HEADER + PAGE_FREE +
- block->frame);
- const uint16_t free= page_header_get_field(block->frame, PAGE_FREE);
+ byte *page_free= my_assume_aligned<4>(PAGE_HEADER + PAGE_FREE + page);
+ const uint16_t free= page_header_get_field(page, PAGE_FREE);
static_assert(PAGE_FREE + 2 == PAGE_GARBAGE, "compatibility");
mach_write_to_2(page_header, page_offset(rec));
@@ -995,20 +1036,19 @@ page_delete_rec_list_end(
size);
mtr->memcpy(*block, page_free, page_header, 4);
- byte *page_n_recs= my_assume_aligned<2>(PAGE_N_RECS + PAGE_HEADER +
- block->frame);
+ byte *page_n_recs= my_assume_aligned<2>(PAGE_N_RECS + PAGE_HEADER + page);
mtr->write<2>(*block, page_n_recs,
ulint{mach_read_from_2(page_n_recs)} - n_recs);
/* Update the page directory; there is no need to balance the number
of the records owned by the supremum record, as it is allowed to be
less than PAGE_DIR_SLOT_MIN_N_OWNED */
- page_dir_slot_t *slot= page_dir_get_nth_slot(block->frame, slot_index);
+ page_dir_slot_t *slot= page_dir_get_nth_slot(page, slot_index);
- if (page_is_comp(block->frame))
+ if (page_is_comp(page))
{
mtr->write<2,mtr_t::MAYBE_NOP>(*block, slot, PAGE_NEW_SUPREMUM);
- byte *owned= PAGE_NEW_SUPREMUM - REC_NEW_N_OWNED + block->frame;
+ byte *owned= PAGE_NEW_SUPREMUM - REC_NEW_N_OWNED + page;
byte new_owned= static_cast<byte>((*owned & ~REC_N_OWNED_MASK) |
n_owned << REC_N_OWNED_SHIFT);
#if 0 // FIXME: implement minimal logging for ROW_FORMAT=COMPRESSED
@@ -1016,7 +1056,7 @@ page_delete_rec_list_end(
{
*owned= new_owned;
memcpy_aligned<2>(PAGE_N_DIR_SLOTS + PAGE_HEADER + block->page.zip.data,
- PAGE_N_DIR_SLOTS + PAGE_HEADER + block->frame,
+ PAGE_N_DIR_SLOTS + PAGE_HEADER + page,
PAGE_N_RECS + 2 - PAGE_N_DIR_SLOTS);
// TODO: the equivalent of page_zip_dir_delete() for all records
mach_write_to_2(prev_rec - REC_NEXT, static_cast<uint16_t>
@@ -1024,7 +1064,7 @@ page_delete_rec_list_end(
mach_write_to_2(last_rec - REC_NEXT, free
? static_cast<uint16_t>(free - page_offset(last_rec))
: 0U);
- return;
+ return DB_SUCCESS;
}
#endif
mtr->write<1,mtr_t::MAYBE_NOP>(*block, owned, new_owned);
@@ -1037,13 +1077,15 @@ page_delete_rec_list_end(
else
{
mtr->write<2,mtr_t::MAYBE_NOP>(*block, slot, PAGE_OLD_SUPREMUM);
- byte *owned= PAGE_OLD_SUPREMUM - REC_OLD_N_OWNED + block->frame;
+ byte *owned= PAGE_OLD_SUPREMUM - REC_OLD_N_OWNED + page;
byte new_owned= static_cast<byte>((*owned & ~REC_N_OWNED_MASK) |
n_owned << REC_N_OWNED_SHIFT);
mtr->write<1,mtr_t::MAYBE_NOP>(*block, owned, new_owned);
mtr->write<2>(*block, prev_rec - REC_NEXT, PAGE_OLD_SUPREMUM);
mtr->write<2>(*block, last_rec - REC_NEXT, free);
}
+
+ return DB_SUCCESS;
}
/*************************************************************//**
@@ -1064,7 +1106,7 @@ page_delete_rec_list_start(
rec_offs_init(offsets_);
- ut_ad(page_align(rec) == block->frame);
+ ut_ad(page_align(rec) == block->page.frame);
ut_ad((ibool) !!page_rec_is_comp(rec)
== dict_table_is_comp(index->table));
#ifdef UNIV_ZIP_DEBUG
@@ -1092,8 +1134,12 @@ page_delete_rec_list_start(
return;
}
+ cur1.index = index;
page_cur_set_before_first(block, &cur1);
- page_cur_move_to_next(&cur1);
+ if (UNIV_UNLIKELY(!page_cur_move_to_next(&cur1))) {
+ ut_ad("corrupted page" == 0);
+ return;
+ }
const ulint n_core = page_rec_is_leaf(rec)
? index->n_core_fields : 0;
@@ -1102,7 +1148,7 @@ page_delete_rec_list_start(
offsets = rec_get_offsets(page_cur_get_rec(&cur1), index,
offsets, n_core,
ULINT_UNDEFINED, &heap);
- page_cur_delete_rec(&cur1, index, offsets, mtr);
+ page_cur_delete_rec(&cur1, offsets, mtr);
}
if (UNIV_LIKELY_NULL(heap)) {
@@ -1110,101 +1156,11 @@ page_delete_rec_list_start(
}
}
-/*************************************************************//**
-Moves record list end to another page. Moved records include
-split_rec.
-
-IMPORTANT: The caller will have to update IBUF_BITMAP_FREE
-if new_block is a compressed leaf page in a secondary index.
-This has to be done either within the same mini-transaction,
-or by invoking ibuf_reset_free_bits() before mtr_commit().
-
-@return TRUE on success; FALSE on compression failure (new_block will
-be decompressed) */
-ibool
-page_move_rec_list_end(
-/*===================*/
- buf_block_t* new_block, /*!< in/out: index page where to move */
- buf_block_t* block, /*!< in: index page from where to move */
- rec_t* split_rec, /*!< in: first record to move */
- dict_index_t* index, /*!< in: record descriptor */
- mtr_t* mtr) /*!< in: mtr */
-{
- page_t* new_page = buf_block_get_frame(new_block);
- ulint old_data_size;
- ulint new_data_size;
- ulint old_n_recs;
- ulint new_n_recs;
-
- ut_ad(!dict_index_is_spatial(index));
-
- old_data_size = page_get_data_size(new_page);
- old_n_recs = page_get_n_recs(new_page);
-#ifdef UNIV_ZIP_DEBUG
- {
- page_zip_des_t* new_page_zip
- = buf_block_get_page_zip(new_block);
- page_zip_des_t* page_zip
- = buf_block_get_page_zip(block);
- ut_a(!new_page_zip == !page_zip);
- ut_a(!new_page_zip
- || page_zip_validate(new_page_zip, new_page, index));
- ut_a(!page_zip
- || page_zip_validate(page_zip, page_align(split_rec),
- index));
- }
-#endif /* UNIV_ZIP_DEBUG */
-
- if (UNIV_UNLIKELY(!page_copy_rec_list_end(new_block, block,
- split_rec, index, mtr))) {
- return(FALSE);
- }
-
- new_data_size = page_get_data_size(new_page);
- new_n_recs = page_get_n_recs(new_page);
-
- ut_ad(new_data_size >= old_data_size);
-
- page_delete_rec_list_end(split_rec, block, index,
- new_n_recs - old_n_recs,
- new_data_size - old_data_size, mtr);
-
- return(TRUE);
-}
-
-/*************************************************************//**
-Moves record list start to another page. Moved records do not include
-split_rec.
-
-IMPORTANT: The caller will have to update IBUF_BITMAP_FREE
-if new_block is a compressed leaf page in a secondary index.
-This has to be done either within the same mini-transaction,
-or by invoking ibuf_reset_free_bits() before mtr_commit().
-
-@return TRUE on success; FALSE on compression failure */
-ibool
-page_move_rec_list_start(
-/*=====================*/
- buf_block_t* new_block, /*!< in/out: index page where to move */
- buf_block_t* block, /*!< in/out: page containing split_rec */
- rec_t* split_rec, /*!< in: first record not to move */
- dict_index_t* index, /*!< in: record descriptor */
- mtr_t* mtr) /*!< in: mtr */
-{
- if (UNIV_UNLIKELY(!page_copy_rec_list_start(new_block, block,
- split_rec, index, mtr))) {
- return(FALSE);
- }
-
- page_delete_rec_list_start(split_rec, block, index, mtr);
-
- return(TRUE);
-}
-
/************************************************************//**
Returns the nth record of the record list.
This is the inverse function of page_rec_get_n_recs_before().
-@return nth record */
+@return nth record
+@retval nullptr on corrupted page */
const rec_t*
page_rec_get_nth_const(
/*===================*/
@@ -1223,7 +1179,6 @@ page_rec_get_nth_const(
ut_ad(nth < srv_page_size / (REC_N_NEW_EXTRA_BYTES + 1));
for (i = 0;; i++) {
-
slot = page_dir_get_nth_slot(page, i);
n_owned = page_dir_slot_get_n_owned(slot);
@@ -1234,87 +1189,154 @@ page_rec_get_nth_const(
}
}
- ut_ad(i > 0);
- slot = page_dir_get_nth_slot(page, i - 1);
- rec = page_dir_slot_get_rec(slot);
+ if (UNIV_UNLIKELY(!i)) {
+ return nullptr;
+ }
+ rec = page_dir_slot_get_rec(slot + 2);
if (page_is_comp(page)) {
do {
rec = page_rec_get_next_low(rec, TRUE);
- ut_ad(rec);
- } while (nth--);
+ } while (rec && nth--);
} else {
do {
rec = page_rec_get_next_low(rec, FALSE);
- ut_ad(rec);
- } while (nth--);
+ } while (rec && nth--);
}
return(rec);
}
-/***************************************************************//**
-Returns the number of records before the given record in chain.
-The number includes infimum and supremum records.
-@return number of records */
-ulint
-page_rec_get_n_recs_before(
-/*=======================*/
- const rec_t* rec) /*!< in: the physical record */
+
+/************************************************************//**
+Gets the pointer to the previous record.
+@return pointer to previous record
+@retval nullptr on error */
+const rec_t*
+page_rec_get_prev_const(
+/*====================*/
+ const rec_t* rec) /*!< in: pointer to record, must not be page
+ infimum */
{
- const page_dir_slot_t* slot;
- const rec_t* slot_rec;
- const page_t* page;
- ulint i;
- lint n = 0;
+ const rec_t* rec2;
+ const rec_t* prev_rec = NULL;
ut_ad(page_rec_check(rec));
- page = page_align(rec);
- if (page_is_comp(page)) {
- while (rec_get_n_owned_new(rec) == 0) {
+ const page_t* const page = page_align(rec);
- rec = rec_get_next_ptr_const(rec, TRUE);
- n--;
- }
+ ut_ad(!page_rec_is_infimum(rec));
- for (i = 0; ; i++) {
- slot = page_dir_get_nth_slot(page, i);
- slot_rec = page_dir_slot_get_rec(slot);
+ ulint slot_no = page_dir_find_owner_slot(rec);
- n += lint(rec_get_n_owned_new(slot_rec));
+ if (UNIV_UNLIKELY(!slot_no || slot_no == ULINT_UNDEFINED)) {
+ return nullptr;
+ }
+
+ const page_dir_slot_t* slot = page_dir_get_nth_slot(page, slot_no - 1);
- if (rec == slot_rec) {
+ if (UNIV_UNLIKELY(!(rec2 = page_dir_slot_get_rec_validate(slot)))) {
+ return nullptr;
+ }
+ if (page_is_comp(page)) {
+ while (rec2 && rec != rec2) {
+ prev_rec = rec2;
+ ulint offs = rec_get_next_offs(rec2, TRUE);
+ if (offs < PAGE_NEW_INFIMUM
+ || offs > page_header_get_field(page,
+ PAGE_HEAP_TOP)) {
+ return nullptr;
+ }
+ rec2 = page + offs;
+ }
+ switch (rec_get_status(prev_rec)) {
+ case REC_STATUS_INSTANT:
+ case REC_STATUS_ORDINARY:
+ if (!page_is_leaf(page)) {
+ return nullptr;
+ }
+ break;
+ case REC_STATUS_INFIMUM:
+ break;
+ case REC_STATUS_NODE_PTR:
+ if (!page_is_leaf(page)) {
break;
}
+ /* fall through */
+ default:
+ return nullptr;
}
} else {
- while (rec_get_n_owned_old(rec) == 0) {
-
- rec = rec_get_next_ptr_const(rec, FALSE);
- n--;
+ while (rec2 && rec != rec2) {
+ prev_rec = rec2;
+ ulint offs = rec_get_next_offs(rec2, FALSE);
+ if (offs < PAGE_OLD_INFIMUM
+ || offs > page_header_get_field(page,
+ PAGE_HEAP_TOP)) {
+ return nullptr;
+ }
+ rec2 = page + offs;
}
+ }
- for (i = 0; ; i++) {
- slot = page_dir_get_nth_slot(page, i);
- slot_rec = page_dir_slot_get_rec(slot);
+ return(prev_rec);
+}
- n += lint(rec_get_n_owned_old(slot_rec));
+/** Return the number of preceding records in an index page.
+@param rec index record
+@return number of preceding records, including the infimum pseudo-record
+@retval ULINT_UNDEFINED on corrupted page */
+ulint page_rec_get_n_recs_before(const rec_t *rec)
+{
+ const page_t *const page= page_align(rec);
+ const page_dir_slot_t *slot = page_dir_get_nth_slot(page, 0);
+ const page_dir_slot_t *const end_slot= slot - 2 * page_dir_get_n_slots(page);
- if (rec == slot_rec) {
+ lint n= 0;
- break;
- }
- }
- }
+ ut_ad(page_rec_check(rec));
- n--;
+ if (page_is_comp(page))
+ {
+ for (; rec_get_n_owned_new(rec) == 0; n--)
+ if (UNIV_UNLIKELY(!(rec= page_rec_get_next_low(rec, true))))
+ return ULINT_UNDEFINED;
- ut_ad(n >= 0);
- ut_ad((ulong) n < srv_page_size / (REC_N_NEW_EXTRA_BYTES + 1));
+ do
+ {
+ const rec_t *slot_rec= page_dir_slot_get_rec_validate(slot);
+ if (UNIV_UNLIKELY(!slot_rec))
+ break;
+ n+= lint(rec_get_n_owned_new(slot_rec));
- return((ulint) n);
+ if (rec == slot_rec)
+ goto found;
+ }
+ while ((slot-= 2) > end_slot);
+ }
+ else
+ {
+ for (; rec_get_n_owned_old(rec) == 0; n--)
+ if (UNIV_UNLIKELY(!(rec= page_rec_get_next_low(rec, false))))
+ return ULINT_UNDEFINED;
+
+ do
+ {
+ const rec_t *slot_rec= page_dir_slot_get_rec_validate(slot);
+ if (UNIV_UNLIKELY(!slot_rec))
+ break;
+ n+= lint(rec_get_n_owned_old(slot_rec));
+
+ if (rec == slot_rec)
+ goto found;
+ }
+ while ((slot-= 2) > end_slot);
+ }
+
+ return ULINT_UNDEFINED;
+found:
+ return --n < 0 ? ULINT_UNDEFINED : ulint(n);
}
/************************************************************//**
@@ -1394,7 +1416,7 @@ page_print_list(
dict_index_t* index, /*!< in: dictionary index of the page */
ulint pr_n) /*!< in: print n first and n last entries */
{
- page_t* page = block->frame;
+ page_t* page = block->page.frame;
page_cur_t cur;
ulint count;
ulint n_recs;
@@ -1495,7 +1517,7 @@ page_print(
ulint rn) /*!< in: print rn first and last records
in directory */
{
- page_t* page = block->frame;
+ page_t* page = block->page.frame;
page_header_print(page);
page_dir_print(page, dn);
@@ -1834,15 +1856,14 @@ page_simple_validate_new(
slot_no = 0;
slot = page_dir_get_nth_slot(page, slot_no);
- rec = page_get_infimum_rec(page);
+ rec = page + PAGE_NEW_INFIMUM;
for (;;) {
- if (UNIV_UNLIKELY(rec > rec_heap_top)) {
-
+ if (UNIV_UNLIKELY(rec < page + PAGE_NEW_INFIMUM
+ || rec > rec_heap_top)) {
ib::error() << "Record " << page_offset(rec)
- << " is above rec heap top "
+ << " is out of bounds: "
<< page_offset(rec_heap_top);
-
goto func_exit;
}
@@ -2298,14 +2319,21 @@ wrong_page_type:
}
next_rec:
- if (page_rec_is_supremum(rec)) {
+ old_rec = rec;
+ rec = page_rec_get_next_const(rec);
+
+ if (UNIV_UNLIKELY(!rec != page_rec_is_supremum(old_rec))) {
+ ib::error() << "supremum is not last record: " << offs;
+ ret = FALSE;
+ }
+
+ if (!rec) {
+ rec = old_rec; /* supremum */
break;
}
count++;
own_count++;
- old_rec = rec;
- rec = page_rec_get_next_const(rec);
if (page_rec_is_infimum(old_rec)
&& page_rec_is_user_rec(rec)) {
@@ -2460,37 +2488,36 @@ page_find_rec_with_heap_no(
@param[in] page index tree leaf page
@return the last record, not delete-marked
@retval infimum record if all records are delete-marked */
-const rec_t*
-page_find_rec_max_not_deleted(
- const page_t* page)
+const rec_t *page_find_rec_max_not_deleted(const page_t *page)
{
- const rec_t* rec = page_get_infimum_rec(page);
- const rec_t* prev_rec = NULL; // remove warning
+ ut_ad(page_is_leaf(page));
- /* Because the page infimum is never delete-marked
- and never the metadata pseudo-record (MIN_REC_FLAG)),
- prev_rec will always be assigned to it first. */
- ut_ad(!rec_get_info_bits(rec, page_rec_is_comp(rec)));
- ut_ad(page_is_leaf(page));
-
- if (page_is_comp(page)) {
- do {
- if (!(rec[-REC_NEW_INFO_BITS]
- & (REC_INFO_DELETED_FLAG
- | REC_INFO_MIN_REC_FLAG))) {
- prev_rec = rec;
- }
- rec = page_rec_get_next_low(rec, true);
- } while (rec != page + PAGE_NEW_SUPREMUM);
- } else {
- do {
- if (!(rec[-REC_OLD_INFO_BITS]
- & (REC_INFO_DELETED_FLAG
- | REC_INFO_MIN_REC_FLAG))) {
- prev_rec = rec;
- }
- rec = page_rec_get_next_low(rec, false);
- } while (rec != page + PAGE_OLD_SUPREMUM);
- }
- return(prev_rec);
+ if (page_is_comp(page))
+ {
+ const rec_t *rec= page + PAGE_NEW_INFIMUM;
+ const rec_t *prev_rec= rec;
+ do
+ {
+ if (!(rec[-REC_NEW_INFO_BITS] &
+ (REC_INFO_DELETED_FLAG | REC_INFO_MIN_REC_FLAG)))
+ prev_rec= rec;
+ if (!(rec= page_rec_get_next_low(rec, true)))
+ return page + PAGE_NEW_INFIMUM;
+ } while (rec != page + PAGE_NEW_SUPREMUM);
+ return prev_rec;
+ }
+ else
+ {
+ const rec_t *rec= page + PAGE_OLD_INFIMUM;
+ const rec_t *prev_rec= rec;
+ do
+ {
+ if (!(rec[-REC_OLD_INFO_BITS] &
+ (REC_INFO_DELETED_FLAG | REC_INFO_MIN_REC_FLAG)))
+ prev_rec= rec;
+ if (!(rec= page_rec_get_next_low(rec, false)))
+ return page + PAGE_OLD_INFIMUM;
+ } while (rec != page + PAGE_OLD_SUPREMUM);
+ return prev_rec;
+ }
}
diff --git a/storage/innobase/page/page0zip.cc b/storage/innobase/page/page0zip.cc
index 12e75605dc9..36624f43d10 100644
--- a/storage/innobase/page/page0zip.cc
+++ b/storage/innobase/page/page0zip.cc
@@ -2,7 +2,7 @@
Copyright (c) 2005, 2016, Oracle and/or its affiliates. All Rights Reserved.
Copyright (c) 2012, Facebook Inc.
-Copyright (c) 2014, 2021, MariaDB Corporation.
+Copyright (c) 2014, 2022, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -411,14 +411,10 @@ static void page_zip_compress_write_log(buf_block_t *block,
{
ut_ad(!index->is_ibuf());
- if (mtr->get_log_mode() != MTR_LOG_ALL)
- {
- ut_ad(mtr->get_log_mode() == MTR_LOG_NONE ||
- mtr->get_log_mode() == MTR_LOG_NO_REDO);
+ if (!mtr->is_logged())
return;
- }
- const page_t *page= block->frame;
+ const page_t *page= block->page.frame;
const page_zip_des_t *page_zip= &block->page.zip;
/* Read the number of user records. */
ulint trailer_size= ulint(page_dir_get_n_heap(page_zip->data)) -
@@ -443,7 +439,6 @@ static void page_zip_compress_write_log(buf_block_t *block,
if (trailer_size)
mtr->zmemcpy(*block, page_zip_get_size(page_zip) - trailer_size,
trailer_size);
- block->page.status = buf_page_t::INIT_ON_FLUSH; /* because of mtr_t::init() */
}
/******************************************************//**
@@ -1280,7 +1275,7 @@ page_zip_compress(
my_bool cmp_per_index_enabled;
cmp_per_index_enabled = srv_cmp_per_index_enabled;
- page_t* page = block->frame;
+ page_t* page = block->page.frame;
page_zip_des_t* page_zip = &block->page.zip;
ut_a(page_is_comp(page));
@@ -1348,9 +1343,9 @@ page_zip_compress(
#endif /* PAGE_ZIP_COMPRESS_DBG */
page_zip_stat[page_zip->ssize - 1].compressed++;
if (cmp_per_index_enabled) {
- mutex_enter(&page_zip_stat_per_index_mutex);
+ mysql_mutex_lock(&page_zip_stat_per_index_mutex);
page_zip_stat_per_index[ind_id].compressed++;
- mutex_exit(&page_zip_stat_per_index_mutex);
+ mysql_mutex_unlock(&page_zip_stat_per_index_mutex);
}
if (UNIV_UNLIKELY(n_dense * PAGE_ZIP_DIR_SLOT_SIZE
@@ -1501,10 +1496,10 @@ err_exit:
page_zip_stat[page_zip->ssize - 1].compressed_usec
+= time_diff;
if (cmp_per_index_enabled) {
- mutex_enter(&page_zip_stat_per_index_mutex);
+ mysql_mutex_lock(&page_zip_stat_per_index_mutex);
page_zip_stat_per_index[ind_id].compressed_usec
+= time_diff;
- mutex_exit(&page_zip_stat_per_index_mutex);
+ mysql_mutex_unlock(&page_zip_stat_per_index_mutex);
}
return false;
}
@@ -1568,10 +1563,10 @@ err_exit:
page_zip_stat[page_zip->ssize - 1].compressed_ok++;
page_zip_stat[page_zip->ssize - 1].compressed_usec += time_diff;
if (cmp_per_index_enabled) {
- mutex_enter(&page_zip_stat_per_index_mutex);
+ mysql_mutex_lock(&page_zip_stat_per_index_mutex);
page_zip_stat_per_index[ind_id].compressed_ok++;
page_zip_stat_per_index[ind_id].compressed_usec += time_diff;
- mutex_exit(&page_zip_stat_per_index_mutex);
+ mysql_mutex_unlock(&page_zip_stat_per_index_mutex);
}
if (page_is_leaf(page)) {
@@ -1642,8 +1637,8 @@ page_zip_fields_decode(
return(NULL);
}
- table = dict_mem_table_create("ZIP_DUMMY", NULL, n, 0,
- DICT_TF_COMPACT, 0);
+ table = dict_table_t::create({C_STRING_WITH_LEN("ZIP_DUMMY")},
+ nullptr, n, 0, DICT_TF_COMPACT, 0);
index = dict_mem_index_create(table, "ZIP_DUMMY", 0, n);
index->n_uniq = static_cast<unsigned>(n) & dict_index_t::MAX_N_FIELDS;
/* avoid ut_ad(index->cached) in dict_index_get_n_unique_in_tree */
@@ -3209,10 +3204,10 @@ page_zip_decompress(
index_id_t index_id = btr_page_get_index_id(page);
if (srv_cmp_per_index_enabled) {
- mutex_enter(&page_zip_stat_per_index_mutex);
+ mysql_mutex_lock(&page_zip_stat_per_index_mutex);
page_zip_stat_per_index[index_id].decompressed++;
page_zip_stat_per_index[index_id].decompressed_usec += time_diff;
- mutex_exit(&page_zip_stat_per_index_mutex);
+ mysql_mutex_unlock(&page_zip_stat_per_index_mutex);
}
/* Update the stat counter for LRU policy. */
@@ -3534,7 +3529,7 @@ page_zip_write_rec_ext(
ulint len;
byte* externs = storage;
ulint n_ext = rec_offs_n_extern(offsets);
- const page_t* const page = block->frame;
+ const page_t* const page = block->page.frame;
page_zip_des_t* const page_zip = &block->page.zip;
ut_ad(rec_offs_validate(rec, index, offsets));
@@ -3652,7 +3647,7 @@ void page_zip_write_rec(buf_block_t *block, const byte *rec,
const dict_index_t *index, const rec_offs *offsets,
ulint create, mtr_t *mtr)
{
- const page_t* const page = block->frame;
+ const page_t* const page = block->page.frame;
page_zip_des_t* const page_zip = &block->page.zip;
byte* data;
byte* storage;
@@ -3842,7 +3837,7 @@ page_zip_write_blob_ptr(
{
const byte* field;
byte* externs;
- const page_t* const page = block->frame;
+ const page_t* const page = block->page.frame;
page_zip_des_t* const page_zip = &block->page.zip;
ulint blob_no;
ulint len;
@@ -3906,7 +3901,7 @@ page_zip_write_node_ptr(
byte* storage;
page_zip_des_t* const page_zip = &block->page.zip;
- ut_d(const page_t* const page = block->frame);
+ ut_d(const page_t* const page = block->page.frame);
ut_ad(page_simple_validate_new(page));
ut_ad(page_zip_simple_validate(page_zip));
ut_ad(page_zip_get_size(page_zip)
@@ -3953,7 +3948,7 @@ page_zip_write_trx_id_and_roll_ptr(
{
page_zip_des_t* const page_zip = &block->page.zip;
- ut_d(const page_t* const page = block->frame);
+ ut_d(const page_t* const page = block->page.frame);
ut_ad(page_align(rec) == page);
ut_ad(page_simple_validate_new(page));
ut_ad(page_zip_simple_validate(page_zip));
@@ -4051,7 +4046,7 @@ page_zip_clear_rec(
byte* field;
ulint len;
- ut_ad(page_align(rec) == block->frame);
+ ut_ad(page_align(rec) == block->page.frame);
page_zip_des_t* const page_zip = &block->page.zip;
/* page_zip_validate() would fail here if a record
@@ -4059,7 +4054,7 @@ page_zip_clear_rec(
ut_ad(rec_offs_validate(rec, index, offsets));
ut_ad(!page_zip_dir_find(page_zip, page_offset(rec)));
ut_ad(page_zip_dir_find_free(page_zip, page_offset(rec)));
- ut_ad(page_zip_header_cmp(page_zip, block->frame));
+ ut_ad(page_zip_header_cmp(page_zip, block->page.frame));
heap_no = rec_get_heap_no_new(rec);
ut_ad(heap_no >= PAGE_HEAP_NO_USER_LOW);
@@ -4069,7 +4064,7 @@ page_zip_clear_rec(
MEM_CHECK_DEFINED(rec - rec_offs_extra_size(offsets),
rec_offs_extra_size(offsets));
- if (!page_is_leaf(block->frame)) {
+ if (!page_is_leaf(block->page.frame)) {
/* Clear node_ptr. On the compressed page,
there is an array of node_ptr immediately before the
dense page directory, at the very end of the page. */
@@ -4135,7 +4130,7 @@ clear_page_zip:
void page_zip_rec_set_deleted(buf_block_t *block, rec_t *rec, bool flag,
mtr_t *mtr)
{
- ut_ad(page_align(rec) == block->frame);
+ ut_ad(page_align(rec) == block->page.frame);
byte *slot= page_zip_dir_find(&block->page.zip, page_offset(rec));
byte b= *slot;
if (flag)
@@ -4144,7 +4139,7 @@ void page_zip_rec_set_deleted(buf_block_t *block, rec_t *rec, bool flag,
b&= byte(~(PAGE_ZIP_DIR_SLOT_DEL >> 8));
mtr->zmemcpy<mtr_t::MAYBE_NOP>(*block, slot, &b, 1);
#ifdef UNIV_ZIP_DEBUG
- ut_a(page_zip_validate(&block->page.zip, block->frame, nullptr));
+ ut_a(page_zip_validate(&block->page.zip, block->page.frame, nullptr));
#endif /* UNIV_ZIP_DEBUG */
}
@@ -4159,7 +4154,7 @@ page_zip_rec_set_owned(
ulint flag, /*!< in: the owned flag (nonzero=TRUE) */
mtr_t* mtr) /*!< in/out: mini-transaction */
{
- ut_ad(page_align(rec) == block->frame);
+ ut_ad(page_align(rec) == block->page.frame);
page_zip_des_t *const page_zip= &block->page.zip;
byte *slot= page_zip_dir_find(page_zip, page_offset(rec));
MEM_CHECK_DEFINED(page_zip->data, page_zip_get_size(page_zip));
@@ -4182,8 +4177,8 @@ page_zip_dir_insert(
byte* rec, /*!< in: record to insert */
mtr_t* mtr) /*!< in/out: mini-transaction */
{
- ut_ad(page_align(cursor->rec) == cursor->block->frame);
- ut_ad(page_align(rec) == cursor->block->frame);
+ ut_ad(page_align(cursor->rec) == cursor->block->page.frame);
+ ut_ad(page_align(rec) == cursor->block->page.frame);
page_zip_des_t *const page_zip= &cursor->block->page.zip;
ulint n_dense;
@@ -4276,7 +4271,7 @@ void page_zip_dir_delete(buf_block_t *block, byte *rec,
const dict_index_t *index, const rec_offs *offsets,
const byte *free, mtr_t *mtr)
{
- ut_ad(page_align(rec) == block->frame);
+ ut_ad(page_align(rec) == block->page.frame);
page_zip_des_t *const page_zip= &block->page.zip;
ut_ad(rec_offs_validate(rec, index, offsets));
@@ -4290,22 +4285,22 @@ void page_zip_dir_delete(buf_block_t *block, byte *rec,
mach_write_to_2(rec - REC_NEXT,
free ? static_cast<uint16_t>(free - rec) : 0);
byte *page_free= my_assume_aligned<2>(PAGE_FREE + PAGE_HEADER +
- block->frame);
+ block->page.frame);
mtr->write<2>(*block, page_free, page_offset(rec));
byte *garbage= my_assume_aligned<2>(PAGE_GARBAGE + PAGE_HEADER +
- block->frame);
+ block->page.frame);
mtr->write<2>(*block, garbage, rec_offs_size(offsets) +
mach_read_from_2(garbage));
compile_time_assert(PAGE_GARBAGE == PAGE_FREE + 2);
memcpy_aligned<4>(PAGE_FREE + PAGE_HEADER + page_zip->data, page_free, 4);
byte *slot_rec= page_zip_dir_find(page_zip, page_offset(rec));
ut_a(slot_rec);
- uint16_t n_recs= page_get_n_recs(block->frame);
+ uint16_t n_recs= page_get_n_recs(block->page.frame);
ut_ad(n_recs);
- ut_ad(n_recs > 1 || page_get_page_no(block->frame) == index->page);
+ ut_ad(n_recs > 1 || page_get_page_no(block->page.frame) == index->page);
/* This could not be done before page_zip_dir_find(). */
byte *page_n_recs= my_assume_aligned<2>(PAGE_N_RECS + PAGE_HEADER +
- block->frame);
+ block->page.frame);
mtr->write<2>(*block, page_n_recs, n_recs - 1U);
memcpy_aligned<2>(PAGE_N_RECS + PAGE_HEADER + page_zip->data, page_n_recs,
2);
@@ -4342,14 +4337,14 @@ void page_zip_dir_delete(buf_block_t *block, byte *rec,
if (const ulint n_ext= rec_offs_n_extern(offsets))
{
ut_ad(index->is_primary());
- ut_ad(page_is_leaf(block->frame));
+ ut_ad(page_is_leaf(block->page.frame));
/* Shift and zero fill the array of BLOB pointers. */
ulint blob_no = page_zip_get_n_prev_extern(page_zip, rec, index);
ut_a(blob_no + n_ext <= page_zip->n_blobs);
byte *externs= page_zip->data + page_zip_get_size(page_zip) -
- (page_dir_get_n_heap(block->frame) - PAGE_HEAP_NO_USER_LOW) *
+ (page_dir_get_n_heap(block->page.frame) - PAGE_HEAP_NO_USER_LOW) *
PAGE_ZIP_CLUST_LEAF_SLOT_SIZE;
byte *ext_end= externs - page_zip->n_blobs * FIELD_REF_SIZE;
@@ -4382,9 +4377,9 @@ IMPORTANT: if page_zip_reorganize() is invoked on a leaf page of a
non-clustered index, the caller must update the insert buffer free
bits in the same mini-transaction in such a way that the modification
will be redo-logged.
-@retval true on success
-@retval false on failure; the block will be left intact */
-bool
+@return error code
+@retval DB_FAIL on overflow; the block_zip will be left intact */
+dberr_t
page_zip_reorganize(
buf_block_t* block, /*!< in/out: page with compressed page;
on the compressed page, in: size;
@@ -4414,10 +4409,10 @@ page_zip_reorganize(
temp_block = buf_block_alloc();
btr_search_drop_page_hash_index(block, false);
- temp_page = temp_block->frame;
+ temp_page = temp_block->page.frame;
/* Copy the old page to temporary space */
- memcpy_aligned<UNIV_PAGE_SIZE_MIN>(temp_block->frame, block->frame,
+ memcpy_aligned<UNIV_PAGE_SIZE_MIN>(temp_page, block->page.frame,
srv_page_size);
/* Recreate the page: note that global data on page (possible
@@ -4435,20 +4430,22 @@ page_zip_reorganize(
/* Copy the records from the temporary space to the recreated page;
do not copy the lock bits yet */
- page_copy_rec_list_end_no_locks(block, temp_block,
- page_get_infimum_rec(temp_page),
- index, mtr);
+ dberr_t err = page_copy_rec_list_end_no_locks(
+ block, temp_block, page_get_infimum_rec(temp_page),
+ index, mtr);
/* Copy the PAGE_MAX_TRX_ID or PAGE_ROOT_AUTO_INC. */
memcpy_aligned<8>(page + (PAGE_HEADER + PAGE_MAX_TRX_ID),
temp_page + (PAGE_HEADER + PAGE_MAX_TRX_ID), 8);
/* PAGE_MAX_TRX_ID must be set on secondary index leaf pages. */
- ut_ad(dict_index_is_clust(index) || !page_is_leaf(temp_page)
+ ut_ad(err != DB_SUCCESS
+ || index->is_clust() || !page_is_leaf(temp_page)
|| page_get_max_trx_id(page) != 0);
/* PAGE_MAX_TRX_ID must be zero on non-leaf pages other than
clustered index root pages. */
- ut_ad(page_get_max_trx_id(page) == 0
- || (dict_index_is_clust(index)
+ ut_ad(err != DB_SUCCESS
+ || page_get_max_trx_id(page) == 0
+ || (index->is_clust()
? !page_has_siblings(temp_page)
: page_is_leaf(temp_page)));
@@ -4480,14 +4477,13 @@ page_zip_reorganize(
#endif /* UNIV_DEBUG || UNIV_ZIP_DEBUG */
}
- buf_block_free(temp_block);
- return false;
+ err = DB_FAIL;
+ } else {
+ lock_move_reorganize_page(block, temp_block);
}
- lock_move_reorganize_page(block, temp_block);
-
buf_block_free(temp_block);
- return true;
+ return err;
}
/**********************************************************************//**
@@ -4503,7 +4499,7 @@ page_zip_copy_recs(
dict_index_t* index, /*!< in: index of the B-tree */
mtr_t* mtr) /*!< in: mini-transaction */
{
- page_t* page = block->frame;
+ page_t* page = block->page.frame;
page_zip_des_t* page_zip = &block->page.zip;
ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX));
@@ -4561,7 +4557,7 @@ page_zip_copy_recs(
to the compressed data page. */
{
page_zip_t* data = page_zip->data;
- memcpy(page_zip, src_zip, sizeof *page_zip);
+ new (page_zip) page_zip_des_t(*src_zip, false);
page_zip->data = data;
}
ut_ad(page_zip_get_trailer_len(page_zip, dict_index_is_clust(index))
@@ -4590,37 +4586,26 @@ page_zip_copy_recs(
#endif /* !UNIV_INNOCHECKSUM */
/** Calculate the compressed page checksum.
-@param[in] data compressed page
-@param[in] size size of compressed page
-@param[in] algo algorithm to use
+@param data compressed page
+@param size size of compressed page
+@param use_adler whether to use Adler32 instead of a XOR of 3 CRC-32C
@return page checksum */
-uint32_t
-page_zip_calc_checksum(
- const void* data,
- ulint size,
- srv_checksum_algorithm_t algo)
+uint32_t page_zip_calc_checksum(const void *data, size_t size, bool use_adler)
{
uLong adler;
const Bytef* s = static_cast<const byte*>(data);
/* Exclude FIL_PAGE_SPACE_OR_CHKSUM, FIL_PAGE_LSN,
and FIL_PAGE_FILE_FLUSH_LSN from the checksum. */
+ ut_ad(size > FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
- switch (algo) {
- case SRV_CHECKSUM_ALGORITHM_FULL_CRC32:
- case SRV_CHECKSUM_ALGORITHM_STRICT_FULL_CRC32:
- case SRV_CHECKSUM_ALGORITHM_CRC32:
- case SRV_CHECKSUM_ALGORITHM_STRICT_CRC32:
- ut_ad(size > FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
+ if (!use_adler) {
return ut_crc32(s + FIL_PAGE_OFFSET,
FIL_PAGE_LSN - FIL_PAGE_OFFSET)
^ ut_crc32(s + FIL_PAGE_TYPE, 2)
^ ut_crc32(s + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID,
size - FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
- case SRV_CHECKSUM_ALGORITHM_INNODB:
- case SRV_CHECKSUM_ALGORITHM_STRICT_INNODB:
- ut_ad(size > FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
-
+ } else {
adler = adler32(0L, s + FIL_PAGE_OFFSET,
FIL_PAGE_LSN - FIL_PAGE_OFFSET);
adler = adler32(adler, s + FIL_PAGE_TYPE, 2);
@@ -4630,15 +4615,7 @@ page_zip_calc_checksum(
- FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
return(uint32_t(adler));
- case SRV_CHECKSUM_ALGORITHM_NONE:
- case SRV_CHECKSUM_ALGORITHM_STRICT_NONE:
- return(BUF_NO_CHECKSUM_MAGIC);
- /* no default so the compiler will emit a warning if new enum
- is added and not handled here */
}
-
- ut_error;
- return(0);
}
/** Validate the checksum on a ROW_FORMAT=COMPRESSED page.
@@ -4647,13 +4624,6 @@ page_zip_calc_checksum(
@return whether the stored checksum matches innodb_checksum_algorithm */
bool page_zip_verify_checksum(const byte *data, size_t size)
{
- const srv_checksum_algorithm_t curr_algo =
- static_cast<srv_checksum_algorithm_t>(srv_checksum_algorithm);
-
- if (curr_algo == SRV_CHECKSUM_ALGORITHM_NONE) {
- return true;
- }
-
if (buf_is_zeroes(span<const byte>(data, size))) {
return true;
}
@@ -4661,62 +4631,37 @@ bool page_zip_verify_checksum(const byte *data, size_t size)
const uint32_t stored = mach_read_from_4(
data + FIL_PAGE_SPACE_OR_CHKSUM);
- uint32_t calc = page_zip_calc_checksum(data, size, curr_algo);
+ uint32_t calc = page_zip_calc_checksum(data, size, false);
#ifdef UNIV_INNOCHECKSUM
+ extern FILE* log_file;
+ extern uint32_t cur_page_num;
+
if (log_file) {
fprintf(log_file, "page::" UINT32PF ";"
- " %s checksum: calculated = " UINT32PF ";"
+ " checksum: calculated = " UINT32PF ";"
" recorded = " UINT32PF "\n", cur_page_num,
- buf_checksum_algorithm_name(
- static_cast<srv_checksum_algorithm_t>(
- srv_checksum_algorithm)),
calc, stored);
}
-
- if (!strict_verify) {
- const uint32_t crc32 = page_zip_calc_checksum(
- data, size, SRV_CHECKSUM_ALGORITHM_CRC32);
-
- if (log_file) {
- fprintf(log_file, "page::" UINT32PF ": crc32 checksum:"
- " calculated = " UINT32PF "; recorded = " UINT32PF "\n",
- cur_page_num, crc32, stored);
- fprintf(log_file, "page::" UINT32PF ": none checksum:"
- " calculated = %lu; recorded = " UINT32PF "\n",
- cur_page_num, BUF_NO_CHECKSUM_MAGIC, stored);
- }
- }
#endif /* UNIV_INNOCHECKSUM */
if (stored == calc) {
return(TRUE);
}
- switch (curr_algo) {
+#ifndef UNIV_INNOCHECKSUM
+ switch (srv_checksum_algorithm) {
case SRV_CHECKSUM_ALGORITHM_STRICT_FULL_CRC32:
case SRV_CHECKSUM_ALGORITHM_STRICT_CRC32:
- case SRV_CHECKSUM_ALGORITHM_STRICT_INNODB:
- case SRV_CHECKSUM_ALGORITHM_STRICT_NONE:
- return FALSE;
- case SRV_CHECKSUM_ALGORITHM_FULL_CRC32:
- case SRV_CHECKSUM_ALGORITHM_CRC32:
+ break;
+ default:
if (stored == BUF_NO_CHECKSUM_MAGIC) {
return(TRUE);
}
- return stored == page_zip_calc_checksum(
- data, size, SRV_CHECKSUM_ALGORITHM_INNODB);
- case SRV_CHECKSUM_ALGORITHM_INNODB:
- if (stored == BUF_NO_CHECKSUM_MAGIC) {
- return TRUE;
- }
-
- return stored == page_zip_calc_checksum(
- data, size, SRV_CHECKSUM_ALGORITHM_CRC32);
- case SRV_CHECKSUM_ALGORITHM_NONE:
- return TRUE;
+ return stored == page_zip_calc_checksum(data, size, true);
}
+#endif /* !UNIV_INNOCHECKSUM */
return FALSE;
}
diff --git a/storage/innobase/pars/make_bison.sh b/storage/innobase/pars/make_bison.sh
index 6b3cb693978..25c967aceda 100755
--- a/storage/innobase/pars/make_bison.sh
+++ b/storage/innobase/pars/make_bison.sh
@@ -25,6 +25,7 @@ mv pars0grm.tab.h ../include/pars0grm.h
sed -e '
s/'"$TMPFILE"'/'"$OUTFILE"'/;
+s/'"pars0grm.tab.h"'/'"pars0grm.h"'/;
s/^\(\(YYSTYPE\|int\) yy\(char\|nerrs\)\)/static \1/;
' < "$TMPFILE" > "$OUTFILE"
diff --git a/storage/innobase/pars/pars0grm.cc b/storage/innobase/pars/pars0grm.cc
index 1e10a61f5ad..75d7089fb5e 100644
--- a/storage/innobase/pars/pars0grm.cc
+++ b/storage/innobase/pars/pars0grm.cc
@@ -1,8 +1,8 @@
-/* A Bison parser, made by GNU Bison 3.4.2. */
+/* A Bison parser, made by GNU Bison 3.7.6. */
/* Bison implementation for Yacc-like parsers in C
- Copyright (C) 1984, 1989-1990, 2000-2015, 2018-2019 Free Software Foundation,
+ Copyright (C) 1984, 1989-1990, 2000-2015, 2018-2021 Free Software Foundation,
Inc.
This program is free software: you can redistribute it and/or modify
@@ -16,7 +16,7 @@
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
- along with this program. If not, see <http://www.gnu.org/licenses/>. */
+ along with this program. If not, see <https://www.gnu.org/licenses/>. */
/* As a special exception, you may create a larger work that contains
part or all of the Bison parser skeleton and distribute that work
@@ -34,6 +34,10 @@
/* C LALR(1) parser skeleton written by Richard Stallman, by
simplifying the original so-called "semantic" parser. */
+/* DO NOT RELY ON FEATURES THAT ARE NOT DOCUMENTED in the manual,
+ especially those whose name start with YY_ or yy_. They are
+ private implementation details that can be changed or removed. */
+
/* All symbols defined below should begin with yy or YY, to avoid
infringing on user name space. This should be done even for local
variables, as they might otherwise be expanded by user macros.
@@ -41,14 +45,11 @@
define necessary library symbols; they are noted "INFRINGES ON
USER NAME SPACE" below. */
-/* Undocumented macros, especially those whose name start with YY_,
- are private implementation details. Do not rely on them. */
-
-/* Identify Bison output. */
-#define YYBISON 1
+/* Identify Bison output, and Bison version. */
+#define YYBISON 30706
-/* Bison version. */
-#define YYBISON_VERSION "3.4.2"
+/* Bison version string. */
+#define YYBISON_VERSION "3.7.6"
/* Skeleton name. */
#define YYSKELETON_NAME "yacc.c"
@@ -89,8 +90,17 @@ que_node_t */
int
yylex(void);
-#line 89 "pars0grm.cc"
+#line 90 "pars0grm.cc"
+# ifndef YY_CAST
+# ifdef __cplusplus
+# define YY_CAST(Type, Val) static_cast<Type> (Val)
+# define YY_REINTERPRET_CAST(Type, Val) reinterpret_cast<Type> (Val)
+# else
+# define YY_CAST(Type, Val) ((Type) (Val))
+# define YY_REINTERPRET_CAST(Type, Val) ((Type) (Val))
+# endif
+# endif
# ifndef YY_NULLPTR
# if defined __cplusplus
# if 201103L <= __cplusplus
@@ -103,154 +113,257 @@ yylex(void);
# endif
# endif
-/* Enabling verbose error messages. */
-#ifdef YYERROR_VERBOSE
-# undef YYERROR_VERBOSE
-# define YYERROR_VERBOSE 1
-#else
-# define YYERROR_VERBOSE 0
-#endif
-
-/* Use api.header.include to #include this header
- instead of duplicating it here. */
-#ifndef YY_YY_PARS0GRM_TAB_H_INCLUDED
-# define YY_YY_PARS0GRM_TAB_H_INCLUDED
-/* Debug traces. */
-#ifndef YYDEBUG
-# define YYDEBUG 0
-#endif
-#if YYDEBUG
-extern int yydebug;
-#endif
-
-/* Token type. */
-#ifndef YYTOKENTYPE
-# define YYTOKENTYPE
- enum yytokentype
- {
- PARS_INT_LIT = 258,
- PARS_FLOAT_LIT = 259,
- PARS_STR_LIT = 260,
- PARS_NULL_LIT = 261,
- PARS_ID_TOKEN = 262,
- PARS_AND_TOKEN = 263,
- PARS_OR_TOKEN = 264,
- PARS_NOT_TOKEN = 265,
- PARS_GE_TOKEN = 266,
- PARS_LE_TOKEN = 267,
- PARS_NE_TOKEN = 268,
- PARS_PROCEDURE_TOKEN = 269,
- PARS_IN_TOKEN = 270,
- PARS_INT_TOKEN = 271,
- PARS_CHAR_TOKEN = 272,
- PARS_IS_TOKEN = 273,
- PARS_BEGIN_TOKEN = 274,
- PARS_END_TOKEN = 275,
- PARS_IF_TOKEN = 276,
- PARS_THEN_TOKEN = 277,
- PARS_ELSE_TOKEN = 278,
- PARS_ELSIF_TOKEN = 279,
- PARS_LOOP_TOKEN = 280,
- PARS_WHILE_TOKEN = 281,
- PARS_RETURN_TOKEN = 282,
- PARS_SELECT_TOKEN = 283,
- PARS_COUNT_TOKEN = 284,
- PARS_FROM_TOKEN = 285,
- PARS_WHERE_TOKEN = 286,
- PARS_FOR_TOKEN = 287,
- PARS_DDOT_TOKEN = 288,
- PARS_ORDER_TOKEN = 289,
- PARS_BY_TOKEN = 290,
- PARS_ASC_TOKEN = 291,
- PARS_DESC_TOKEN = 292,
- PARS_INSERT_TOKEN = 293,
- PARS_INTO_TOKEN = 294,
- PARS_VALUES_TOKEN = 295,
- PARS_UPDATE_TOKEN = 296,
- PARS_SET_TOKEN = 297,
- PARS_DELETE_TOKEN = 298,
- PARS_CURRENT_TOKEN = 299,
- PARS_OF_TOKEN = 300,
- PARS_CREATE_TOKEN = 301,
- PARS_TABLE_TOKEN = 302,
- PARS_INDEX_TOKEN = 303,
- PARS_UNIQUE_TOKEN = 304,
- PARS_CLUSTERED_TOKEN = 305,
- PARS_ON_TOKEN = 306,
- PARS_ASSIGN_TOKEN = 307,
- PARS_DECLARE_TOKEN = 308,
- PARS_CURSOR_TOKEN = 309,
- PARS_SQL_TOKEN = 310,
- PARS_OPEN_TOKEN = 311,
- PARS_FETCH_TOKEN = 312,
- PARS_CLOSE_TOKEN = 313,
- PARS_NOTFOUND_TOKEN = 314,
- PARS_TO_BINARY_TOKEN = 315,
- PARS_SUBSTR_TOKEN = 316,
- PARS_CONCAT_TOKEN = 317,
- PARS_INSTR_TOKEN = 318,
- PARS_LENGTH_TOKEN = 319,
- PARS_COMMIT_TOKEN = 320,
- PARS_ROLLBACK_TOKEN = 321,
- PARS_WORK_TOKEN = 322,
- PARS_EXIT_TOKEN = 323,
- PARS_FUNCTION_TOKEN = 324,
- PARS_LOCK_TOKEN = 325,
- PARS_SHARE_TOKEN = 326,
- PARS_MODE_TOKEN = 327,
- PARS_LIKE_TOKEN = 328,
- PARS_LIKE_TOKEN_EXACT = 329,
- PARS_LIKE_TOKEN_PREFIX = 330,
- PARS_LIKE_TOKEN_SUFFIX = 331,
- PARS_LIKE_TOKEN_SUBSTR = 332,
- PARS_TABLE_NAME_TOKEN = 333,
- PARS_BIGINT_TOKEN = 334,
- NEG = 335
- };
-#endif
+#include "pars0grm.h"
+/* Symbol kind. */
+enum yysymbol_kind_t
+{
+ YYSYMBOL_YYEMPTY = -2,
+ YYSYMBOL_YYEOF = 0, /* "end of file" */
+ YYSYMBOL_YYerror = 1, /* error */
+ YYSYMBOL_YYUNDEF = 2, /* "invalid token" */
+ YYSYMBOL_PARS_INT_LIT = 3, /* PARS_INT_LIT */
+ YYSYMBOL_PARS_FLOAT_LIT = 4, /* PARS_FLOAT_LIT */
+ YYSYMBOL_PARS_STR_LIT = 5, /* PARS_STR_LIT */
+ YYSYMBOL_PARS_NULL_LIT = 6, /* PARS_NULL_LIT */
+ YYSYMBOL_PARS_ID_TOKEN = 7, /* PARS_ID_TOKEN */
+ YYSYMBOL_PARS_AND_TOKEN = 8, /* PARS_AND_TOKEN */
+ YYSYMBOL_PARS_OR_TOKEN = 9, /* PARS_OR_TOKEN */
+ YYSYMBOL_PARS_NOT_TOKEN = 10, /* PARS_NOT_TOKEN */
+ YYSYMBOL_PARS_GE_TOKEN = 11, /* PARS_GE_TOKEN */
+ YYSYMBOL_PARS_LE_TOKEN = 12, /* PARS_LE_TOKEN */
+ YYSYMBOL_PARS_NE_TOKEN = 13, /* PARS_NE_TOKEN */
+ YYSYMBOL_PARS_PROCEDURE_TOKEN = 14, /* PARS_PROCEDURE_TOKEN */
+ YYSYMBOL_PARS_IN_TOKEN = 15, /* PARS_IN_TOKEN */
+ YYSYMBOL_PARS_INT_TOKEN = 16, /* PARS_INT_TOKEN */
+ YYSYMBOL_PARS_CHAR_TOKEN = 17, /* PARS_CHAR_TOKEN */
+ YYSYMBOL_PARS_IS_TOKEN = 18, /* PARS_IS_TOKEN */
+ YYSYMBOL_PARS_BEGIN_TOKEN = 19, /* PARS_BEGIN_TOKEN */
+ YYSYMBOL_PARS_END_TOKEN = 20, /* PARS_END_TOKEN */
+ YYSYMBOL_PARS_IF_TOKEN = 21, /* PARS_IF_TOKEN */
+ YYSYMBOL_PARS_THEN_TOKEN = 22, /* PARS_THEN_TOKEN */
+ YYSYMBOL_PARS_ELSE_TOKEN = 23, /* PARS_ELSE_TOKEN */
+ YYSYMBOL_PARS_ELSIF_TOKEN = 24, /* PARS_ELSIF_TOKEN */
+ YYSYMBOL_PARS_LOOP_TOKEN = 25, /* PARS_LOOP_TOKEN */
+ YYSYMBOL_PARS_WHILE_TOKEN = 26, /* PARS_WHILE_TOKEN */
+ YYSYMBOL_PARS_RETURN_TOKEN = 27, /* PARS_RETURN_TOKEN */
+ YYSYMBOL_PARS_SELECT_TOKEN = 28, /* PARS_SELECT_TOKEN */
+ YYSYMBOL_PARS_COUNT_TOKEN = 29, /* PARS_COUNT_TOKEN */
+ YYSYMBOL_PARS_FROM_TOKEN = 30, /* PARS_FROM_TOKEN */
+ YYSYMBOL_PARS_WHERE_TOKEN = 31, /* PARS_WHERE_TOKEN */
+ YYSYMBOL_PARS_FOR_TOKEN = 32, /* PARS_FOR_TOKEN */
+ YYSYMBOL_PARS_DDOT_TOKEN = 33, /* PARS_DDOT_TOKEN */
+ YYSYMBOL_PARS_ORDER_TOKEN = 34, /* PARS_ORDER_TOKEN */
+ YYSYMBOL_PARS_BY_TOKEN = 35, /* PARS_BY_TOKEN */
+ YYSYMBOL_PARS_ASC_TOKEN = 36, /* PARS_ASC_TOKEN */
+ YYSYMBOL_PARS_DESC_TOKEN = 37, /* PARS_DESC_TOKEN */
+ YYSYMBOL_PARS_INSERT_TOKEN = 38, /* PARS_INSERT_TOKEN */
+ YYSYMBOL_PARS_INTO_TOKEN = 39, /* PARS_INTO_TOKEN */
+ YYSYMBOL_PARS_VALUES_TOKEN = 40, /* PARS_VALUES_TOKEN */
+ YYSYMBOL_PARS_UPDATE_TOKEN = 41, /* PARS_UPDATE_TOKEN */
+ YYSYMBOL_PARS_SET_TOKEN = 42, /* PARS_SET_TOKEN */
+ YYSYMBOL_PARS_DELETE_TOKEN = 43, /* PARS_DELETE_TOKEN */
+ YYSYMBOL_PARS_CURRENT_TOKEN = 44, /* PARS_CURRENT_TOKEN */
+ YYSYMBOL_PARS_OF_TOKEN = 45, /* PARS_OF_TOKEN */
+ YYSYMBOL_PARS_CREATE_TOKEN = 46, /* PARS_CREATE_TOKEN */
+ YYSYMBOL_PARS_TABLE_TOKEN = 47, /* PARS_TABLE_TOKEN */
+ YYSYMBOL_PARS_INDEX_TOKEN = 48, /* PARS_INDEX_TOKEN */
+ YYSYMBOL_PARS_UNIQUE_TOKEN = 49, /* PARS_UNIQUE_TOKEN */
+ YYSYMBOL_PARS_CLUSTERED_TOKEN = 50, /* PARS_CLUSTERED_TOKEN */
+ YYSYMBOL_PARS_ON_TOKEN = 51, /* PARS_ON_TOKEN */
+ YYSYMBOL_PARS_ASSIGN_TOKEN = 52, /* PARS_ASSIGN_TOKEN */
+ YYSYMBOL_PARS_DECLARE_TOKEN = 53, /* PARS_DECLARE_TOKEN */
+ YYSYMBOL_PARS_CURSOR_TOKEN = 54, /* PARS_CURSOR_TOKEN */
+ YYSYMBOL_PARS_SQL_TOKEN = 55, /* PARS_SQL_TOKEN */
+ YYSYMBOL_PARS_OPEN_TOKEN = 56, /* PARS_OPEN_TOKEN */
+ YYSYMBOL_PARS_FETCH_TOKEN = 57, /* PARS_FETCH_TOKEN */
+ YYSYMBOL_PARS_CLOSE_TOKEN = 58, /* PARS_CLOSE_TOKEN */
+ YYSYMBOL_PARS_NOTFOUND_TOKEN = 59, /* PARS_NOTFOUND_TOKEN */
+ YYSYMBOL_PARS_TO_BINARY_TOKEN = 60, /* PARS_TO_BINARY_TOKEN */
+ YYSYMBOL_PARS_SUBSTR_TOKEN = 61, /* PARS_SUBSTR_TOKEN */
+ YYSYMBOL_PARS_CONCAT_TOKEN = 62, /* PARS_CONCAT_TOKEN */
+ YYSYMBOL_PARS_INSTR_TOKEN = 63, /* PARS_INSTR_TOKEN */
+ YYSYMBOL_PARS_LENGTH_TOKEN = 64, /* PARS_LENGTH_TOKEN */
+ YYSYMBOL_PARS_COMMIT_TOKEN = 65, /* PARS_COMMIT_TOKEN */
+ YYSYMBOL_PARS_ROLLBACK_TOKEN = 66, /* PARS_ROLLBACK_TOKEN */
+ YYSYMBOL_PARS_WORK_TOKEN = 67, /* PARS_WORK_TOKEN */
+ YYSYMBOL_PARS_EXIT_TOKEN = 68, /* PARS_EXIT_TOKEN */
+ YYSYMBOL_PARS_FUNCTION_TOKEN = 69, /* PARS_FUNCTION_TOKEN */
+ YYSYMBOL_PARS_LOCK_TOKEN = 70, /* PARS_LOCK_TOKEN */
+ YYSYMBOL_PARS_SHARE_TOKEN = 71, /* PARS_SHARE_TOKEN */
+ YYSYMBOL_PARS_MODE_TOKEN = 72, /* PARS_MODE_TOKEN */
+ YYSYMBOL_PARS_LIKE_TOKEN = 73, /* PARS_LIKE_TOKEN */
+ YYSYMBOL_PARS_LIKE_TOKEN_EXACT = 74, /* PARS_LIKE_TOKEN_EXACT */
+ YYSYMBOL_PARS_LIKE_TOKEN_PREFIX = 75, /* PARS_LIKE_TOKEN_PREFIX */
+ YYSYMBOL_PARS_LIKE_TOKEN_SUFFIX = 76, /* PARS_LIKE_TOKEN_SUFFIX */
+ YYSYMBOL_PARS_LIKE_TOKEN_SUBSTR = 77, /* PARS_LIKE_TOKEN_SUBSTR */
+ YYSYMBOL_PARS_TABLE_NAME_TOKEN = 78, /* PARS_TABLE_NAME_TOKEN */
+ YYSYMBOL_PARS_BIGINT_TOKEN = 79, /* PARS_BIGINT_TOKEN */
+ YYSYMBOL_80_ = 80, /* '=' */
+ YYSYMBOL_81_ = 81, /* '<' */
+ YYSYMBOL_82_ = 82, /* '>' */
+ YYSYMBOL_83_ = 83, /* '-' */
+ YYSYMBOL_84_ = 84, /* '+' */
+ YYSYMBOL_85_ = 85, /* '*' */
+ YYSYMBOL_86_ = 86, /* '/' */
+ YYSYMBOL_NEG = 87, /* NEG */
+ YYSYMBOL_88_ = 88, /* '%' */
+ YYSYMBOL_89_ = 89, /* ';' */
+ YYSYMBOL_90_ = 90, /* '(' */
+ YYSYMBOL_91_ = 91, /* ')' */
+ YYSYMBOL_92_ = 92, /* ',' */
+ YYSYMBOL_YYACCEPT = 93, /* $accept */
+ YYSYMBOL_top_statement = 94, /* top_statement */
+ YYSYMBOL_statement = 95, /* statement */
+ YYSYMBOL_statement_list = 96, /* statement_list */
+ YYSYMBOL_exp = 97, /* exp */
+ YYSYMBOL_function_name = 98, /* function_name */
+ YYSYMBOL_user_function_call = 99, /* user_function_call */
+ YYSYMBOL_table_list = 100, /* table_list */
+ YYSYMBOL_variable_list = 101, /* variable_list */
+ YYSYMBOL_exp_list = 102, /* exp_list */
+ YYSYMBOL_select_item = 103, /* select_item */
+ YYSYMBOL_select_item_list = 104, /* select_item_list */
+ YYSYMBOL_select_list = 105, /* select_list */
+ YYSYMBOL_search_condition = 106, /* search_condition */
+ YYSYMBOL_for_update_clause = 107, /* for_update_clause */
+ YYSYMBOL_lock_shared_clause = 108, /* lock_shared_clause */
+ YYSYMBOL_order_direction = 109, /* order_direction */
+ YYSYMBOL_order_by_clause = 110, /* order_by_clause */
+ YYSYMBOL_select_statement = 111, /* select_statement */
+ YYSYMBOL_insert_statement_start = 112, /* insert_statement_start */
+ YYSYMBOL_insert_statement = 113, /* insert_statement */
+ YYSYMBOL_column_assignment = 114, /* column_assignment */
+ YYSYMBOL_column_assignment_list = 115, /* column_assignment_list */
+ YYSYMBOL_cursor_positioned = 116, /* cursor_positioned */
+ YYSYMBOL_update_statement_start = 117, /* update_statement_start */
+ YYSYMBOL_update_statement_searched = 118, /* update_statement_searched */
+ YYSYMBOL_update_statement_positioned = 119, /* update_statement_positioned */
+ YYSYMBOL_delete_statement_start = 120, /* delete_statement_start */
+ YYSYMBOL_delete_statement_searched = 121, /* delete_statement_searched */
+ YYSYMBOL_delete_statement_positioned = 122, /* delete_statement_positioned */
+ YYSYMBOL_assignment_statement = 123, /* assignment_statement */
+ YYSYMBOL_elsif_element = 124, /* elsif_element */
+ YYSYMBOL_elsif_list = 125, /* elsif_list */
+ YYSYMBOL_else_part = 126, /* else_part */
+ YYSYMBOL_if_statement = 127, /* if_statement */
+ YYSYMBOL_while_statement = 128, /* while_statement */
+ YYSYMBOL_for_statement = 129, /* for_statement */
+ YYSYMBOL_exit_statement = 130, /* exit_statement */
+ YYSYMBOL_return_statement = 131, /* return_statement */
+ YYSYMBOL_open_cursor_statement = 132, /* open_cursor_statement */
+ YYSYMBOL_close_cursor_statement = 133, /* close_cursor_statement */
+ YYSYMBOL_fetch_statement = 134, /* fetch_statement */
+ YYSYMBOL_column_def = 135, /* column_def */
+ YYSYMBOL_column_def_list = 136, /* column_def_list */
+ YYSYMBOL_opt_column_len = 137, /* opt_column_len */
+ YYSYMBOL_opt_not_null = 138, /* opt_not_null */
+ YYSYMBOL_create_table = 139, /* create_table */
+ YYSYMBOL_column_list = 140, /* column_list */
+ YYSYMBOL_unique_def = 141, /* unique_def */
+ YYSYMBOL_clustered_def = 142, /* clustered_def */
+ YYSYMBOL_create_index = 143, /* create_index */
+ YYSYMBOL_table_name = 144, /* table_name */
+ YYSYMBOL_commit_statement = 145, /* commit_statement */
+ YYSYMBOL_rollback_statement = 146, /* rollback_statement */
+ YYSYMBOL_type_name = 147, /* type_name */
+ YYSYMBOL_variable_declaration = 148, /* variable_declaration */
+ YYSYMBOL_variable_declaration_list = 149, /* variable_declaration_list */
+ YYSYMBOL_cursor_declaration = 150, /* cursor_declaration */
+ YYSYMBOL_function_declaration = 151, /* function_declaration */
+ YYSYMBOL_declaration = 152, /* declaration */
+ YYSYMBOL_declaration_list = 153, /* declaration_list */
+ YYSYMBOL_procedure_definition = 154 /* procedure_definition */
+};
+typedef enum yysymbol_kind_t yysymbol_kind_t;
-/* Value type. */
-#if ! defined YYSTYPE && ! defined YYSTYPE_IS_DECLARED
-typedef int YYSTYPE;
-# define YYSTYPE_IS_TRIVIAL 1
-# define YYSTYPE_IS_DECLARED 1
-#endif
-extern YYSTYPE yylval;
-int yyparse (void);
+#ifdef short
+# undef short
+#endif
-#endif /* !YY_YY_PARS0GRM_TAB_H_INCLUDED */
+/* On compilers that do not define __PTRDIFF_MAX__ etc., make sure
+ <limits.h> and (if available) <stdint.h> are included
+ so that the code can choose integer types of a good width. */
+#ifndef __PTRDIFF_MAX__
+# include <limits.h> /* INFRINGES ON USER NAME SPACE */
+# if defined __STDC_VERSION__ && 199901 <= __STDC_VERSION__
+# include <stdint.h> /* INFRINGES ON USER NAME SPACE */
+# define YY_STDINT_H
+# endif
+#endif
+/* Narrow types that promote to a signed type and that can represent a
+ signed or unsigned integer of at least N bits. In tables they can
+ save space and decrease cache pressure. Promoting to a signed type
+ helps avoid bugs in integer arithmetic. */
-#ifdef short
-# undef short
+#ifdef __INT_LEAST8_MAX__
+typedef __INT_LEAST8_TYPE__ yytype_int8;
+#elif defined YY_STDINT_H
+typedef int_least8_t yytype_int8;
+#else
+typedef signed char yytype_int8;
#endif
-#ifdef YYTYPE_UINT8
-typedef YYTYPE_UINT8 yytype_uint8;
+#ifdef __INT_LEAST16_MAX__
+typedef __INT_LEAST16_TYPE__ yytype_int16;
+#elif defined YY_STDINT_H
+typedef int_least16_t yytype_int16;
#else
-typedef unsigned char yytype_uint8;
+typedef short yytype_int16;
#endif
-#ifdef YYTYPE_INT8
-typedef YYTYPE_INT8 yytype_int8;
-#else
-typedef signed char yytype_int8;
+/* Work around bug in HP-UX 11.23, which defines these macros
+ incorrectly for preprocessor constants. This workaround can likely
+ be removed in 2023, as HPE has promised support for HP-UX 11.23
+ (aka HP-UX 11i v2) only through the end of 2022; see Table 2 of
+ <https://h20195.www2.hpe.com/V2/getpdf.aspx/4AA4-7673ENW.pdf>. */
+#ifdef __hpux
+# undef UINT_LEAST8_MAX
+# undef UINT_LEAST16_MAX
+# define UINT_LEAST8_MAX 255
+# define UINT_LEAST16_MAX 65535
#endif
-#ifdef YYTYPE_UINT16
-typedef YYTYPE_UINT16 yytype_uint16;
+#if defined __UINT_LEAST8_MAX__ && __UINT_LEAST8_MAX__ <= __INT_MAX__
+typedef __UINT_LEAST8_TYPE__ yytype_uint8;
+#elif (!defined __UINT_LEAST8_MAX__ && defined YY_STDINT_H \
+ && UINT_LEAST8_MAX <= INT_MAX)
+typedef uint_least8_t yytype_uint8;
+#elif !defined __UINT_LEAST8_MAX__ && UCHAR_MAX <= INT_MAX
+typedef unsigned char yytype_uint8;
#else
-typedef unsigned short yytype_uint16;
+typedef short yytype_uint8;
#endif
-#ifdef YYTYPE_INT16
-typedef YYTYPE_INT16 yytype_int16;
+#if defined __UINT_LEAST16_MAX__ && __UINT_LEAST16_MAX__ <= __INT_MAX__
+typedef __UINT_LEAST16_TYPE__ yytype_uint16;
+#elif (!defined __UINT_LEAST16_MAX__ && defined YY_STDINT_H \
+ && UINT_LEAST16_MAX <= INT_MAX)
+typedef uint_least16_t yytype_uint16;
+#elif !defined __UINT_LEAST16_MAX__ && USHRT_MAX <= INT_MAX
+typedef unsigned short yytype_uint16;
#else
-typedef short yytype_int16;
+typedef int yytype_uint16;
+#endif
+
+#ifndef YYPTRDIFF_T
+# if defined __PTRDIFF_TYPE__ && defined __PTRDIFF_MAX__
+# define YYPTRDIFF_T __PTRDIFF_TYPE__
+# define YYPTRDIFF_MAXIMUM __PTRDIFF_MAX__
+# elif defined PTRDIFF_MAX
+# ifndef ptrdiff_t
+# include <stddef.h> /* INFRINGES ON USER NAME SPACE */
+# endif
+# define YYPTRDIFF_T ptrdiff_t
+# define YYPTRDIFF_MAXIMUM PTRDIFF_MAX
+# else
+# define YYPTRDIFF_T long
+# define YYPTRDIFF_MAXIMUM LONG_MAX
+# endif
#endif
#ifndef YYSIZE_T
@@ -258,7 +371,7 @@ typedef short yytype_int16;
# define YYSIZE_T __SIZE_TYPE__
# elif defined size_t
# define YYSIZE_T size_t
-# elif ! defined YYSIZE_T
+# elif defined __STDC_VERSION__ && 199901 <= __STDC_VERSION__
# include <stddef.h> /* INFRINGES ON USER NAME SPACE */
# define YYSIZE_T size_t
# else
@@ -266,7 +379,20 @@ typedef short yytype_int16;
# endif
#endif
-#define YYSIZE_MAXIMUM ((YYSIZE_T) -1)
+#define YYSIZE_MAXIMUM \
+ YY_CAST (YYPTRDIFF_T, \
+ (YYPTRDIFF_MAXIMUM < YY_CAST (YYSIZE_T, -1) \
+ ? YYPTRDIFF_MAXIMUM \
+ : YY_CAST (YYSIZE_T, -1)))
+
+#define YYSIZEOF(X) YY_CAST (YYPTRDIFF_T, sizeof (X))
+
+
+/* Stored state numbers (used for stacks). */
+typedef yytype_int16 yy_state_t;
+
+/* State numbers in computations. */
+typedef int yy_state_fast_t;
#ifndef YY_
# if defined YYENABLE_NLS && YYENABLE_NLS
@@ -280,38 +406,37 @@ typedef short yytype_int16;
# endif
#endif
-#ifndef YY_ATTRIBUTE
-# if (defined __GNUC__ \
- && (2 < __GNUC__ || (__GNUC__ == 2 && 96 <= __GNUC_MINOR__))) \
- || defined __SUNPRO_C && 0x5110 <= __SUNPRO_C
-# define YY_ATTRIBUTE(Spec) __attribute__(Spec)
-# else
-# define YY_ATTRIBUTE(Spec) /* empty */
-# endif
-#endif
#ifndef YY_ATTRIBUTE_PURE
-# define YY_ATTRIBUTE_PURE YY_ATTRIBUTE ((__pure__))
+# if defined __GNUC__ && 2 < __GNUC__ + (96 <= __GNUC_MINOR__)
+# define YY_ATTRIBUTE_PURE __attribute__ ((__pure__))
+# else
+# define YY_ATTRIBUTE_PURE
+# endif
#endif
#ifndef YY_ATTRIBUTE_UNUSED
-# define YY_ATTRIBUTE_UNUSED YY_ATTRIBUTE ((__unused__))
+# if defined __GNUC__ && 2 < __GNUC__ + (7 <= __GNUC_MINOR__)
+# define YY_ATTRIBUTE_UNUSED __attribute__ ((__unused__))
+# else
+# define YY_ATTRIBUTE_UNUSED
+# endif
#endif
/* Suppress unused-variable warnings by "using" E. */
#if ! defined lint || defined __GNUC__
-# define YYUSE(E) ((void) (E))
+# define YY_USE(E) ((void) (E))
#else
-# define YYUSE(E) /* empty */
+# define YY_USE(E) /* empty */
#endif
#if defined __GNUC__ && ! defined __ICC && 407 <= __GNUC__ * 100 + __GNUC_MINOR__
/* Suppress an incorrect diagnostic about yylval being uninitialized. */
-# define YY_IGNORE_MAYBE_UNINITIALIZED_BEGIN \
- _Pragma ("GCC diagnostic push") \
- _Pragma ("GCC diagnostic ignored \"-Wuninitialized\"")\
+# define YY_IGNORE_MAYBE_UNINITIALIZED_BEGIN \
+ _Pragma ("GCC diagnostic push") \
+ _Pragma ("GCC diagnostic ignored \"-Wuninitialized\"") \
_Pragma ("GCC diagnostic ignored \"-Wmaybe-uninitialized\"")
-# define YY_IGNORE_MAYBE_UNINITIALIZED_END \
+# define YY_IGNORE_MAYBE_UNINITIALIZED_END \
_Pragma ("GCC diagnostic pop")
#else
# define YY_INITIAL_VALUE(Value) Value
@@ -324,10 +449,22 @@ typedef short yytype_int16;
# define YY_INITIAL_VALUE(Value) /* Nothing. */
#endif
+#if defined __cplusplus && defined __GNUC__ && ! defined __ICC && 6 <= __GNUC__
+# define YY_IGNORE_USELESS_CAST_BEGIN \
+ _Pragma ("GCC diagnostic push") \
+ _Pragma ("GCC diagnostic ignored \"-Wuseless-cast\"")
+# define YY_IGNORE_USELESS_CAST_END \
+ _Pragma ("GCC diagnostic pop")
+#endif
+#ifndef YY_IGNORE_USELESS_CAST_BEGIN
+# define YY_IGNORE_USELESS_CAST_BEGIN
+# define YY_IGNORE_USELESS_CAST_END
+#endif
+
#define YY_ASSERT(E) ((void) (0 && (E)))
-#if ! defined yyoverflow || YYERROR_VERBOSE
+#if !defined yyoverflow
/* The parser invokes alloca or malloc; define the necessary symbols. */
@@ -392,8 +529,7 @@ void free (void *); /* INFRINGES ON USER NAME SPACE */
# endif
# endif
# endif
-#endif /* ! defined yyoverflow || YYERROR_VERBOSE */
-
+#endif /* !defined yyoverflow */
#if (! defined yyoverflow \
&& (! defined __cplusplus \
@@ -402,17 +538,17 @@ void free (void *); /* INFRINGES ON USER NAME SPACE */
/* A type that is properly aligned for any stack member. */
union yyalloc
{
- yytype_int16 yyss_alloc;
+ yy_state_t yyss_alloc;
YYSTYPE yyvs_alloc;
};
/* The size of the maximum gap between one aligned stack and the next. */
-# define YYSTACK_GAP_MAXIMUM (sizeof (union yyalloc) - 1)
+# define YYSTACK_GAP_MAXIMUM (YYSIZEOF (union yyalloc) - 1)
/* The size of an array large to enough to hold all stacks, each with
N elements. */
# define YYSTACK_BYTES(N) \
- ((N) * (sizeof (yytype_int16) + sizeof (YYSTYPE)) \
+ ((N) * (YYSIZEOF (yy_state_t) + YYSIZEOF (YYSTYPE)) \
+ YYSTACK_GAP_MAXIMUM)
# define YYCOPY_NEEDED 1
@@ -425,11 +561,11 @@ union yyalloc
# define YYSTACK_RELOCATE(Stack_alloc, Stack) \
do \
{ \
- YYSIZE_T yynewbytes; \
+ YYPTRDIFF_T yynewbytes; \
YYCOPY (&yyptr->Stack_alloc, Stack, yysize); \
Stack = &yyptr->Stack_alloc; \
- yynewbytes = yystacksize * sizeof (*Stack) + YYSTACK_GAP_MAXIMUM; \
- yyptr += yynewbytes / sizeof (*yyptr); \
+ yynewbytes = yystacksize * YYSIZEOF (*Stack) + YYSTACK_GAP_MAXIMUM; \
+ yyptr += yynewbytes / YYSIZEOF (*yyptr); \
} \
while (0)
@@ -441,12 +577,12 @@ union yyalloc
# ifndef YYCOPY
# if defined __GNUC__ && 1 < __GNUC__
# define YYCOPY(Dst, Src, Count) \
- __builtin_memcpy (Dst, Src, (Count) * sizeof (*(Src)))
+ __builtin_memcpy (Dst, Src, YY_CAST (YYSIZE_T, (Count)) * sizeof (*(Src)))
# else
# define YYCOPY(Dst, Src, Count) \
do \
{ \
- YYSIZE_T yyi; \
+ YYPTRDIFF_T yyi; \
for (yyi = 0; yyi < (Count); yyi++) \
(Dst)[yyi] = (Src)[yyi]; \
} \
@@ -458,42 +594,45 @@ union yyalloc
/* YYFINAL -- State number of the termination state. */
#define YYFINAL 5
/* YYLAST -- Last index in YYTABLE. */
-#define YYLAST 603
+#define YYLAST 611
/* YYNTOKENS -- Number of terminals. */
-#define YYNTOKENS 96
+#define YYNTOKENS 93
/* YYNNTS -- Number of nonterminals. */
-#define YYNNTS 64
+#define YYNNTS 62
/* YYNRULES -- Number of rules. */
-#define YYNRULES 150
+#define YYNRULES 145
/* YYNSTATES -- Number of states. */
-#define YYNSTATES 300
+#define YYNSTATES 290
-#define YYUNDEFTOK 2
+/* YYMAXUTOK -- Last valid token kind. */
#define YYMAXUTOK 335
+
/* YYTRANSLATE(TOKEN-NUM) -- Symbol number corresponding to TOKEN-NUM
as returned by yylex, with out-of-bounds checking. */
-#define YYTRANSLATE(YYX) \
- ((unsigned) (YYX) <= YYMAXUTOK ? yytranslate[YYX] : YYUNDEFTOK)
+#define YYTRANSLATE(YYX) \
+ (0 <= (YYX) && (YYX) <= YYMAXUTOK \
+ ? YY_CAST (yysymbol_kind_t, yytranslate[YYX]) \
+ : YYSYMBOL_YYUNDEF)
/* YYTRANSLATE[TOKEN-NUM] -- Symbol number corresponding to TOKEN-NUM
as returned by yylex. */
-static const yytype_uint8 yytranslate[] =
+static const yytype_int8 yytranslate[] =
{
0, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 88, 2, 2,
- 90, 91, 85, 84, 93, 83, 2, 86, 2, 2,
+ 90, 91, 85, 84, 92, 83, 2, 86, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 89,
- 81, 80, 82, 92, 2, 2, 2, 2, 2, 2,
+ 81, 80, 82, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
- 2, 2, 2, 94, 2, 95, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
@@ -519,62 +658,68 @@ static const yytype_uint8 yytranslate[] =
#if YYDEBUG
/* YYRLINE[YYN] -- Source line where rule number YYN was defined. */
-static const yytype_uint16 yyrline[] =
+static const yytype_int16 yyrline[] =
{
0, 140, 140, 143, 144, 145, 146, 147, 148, 149,
150, 151, 152, 153, 154, 155, 156, 157, 158, 159,
- 160, 161, 162, 166, 167, 172, 173, 175, 176, 177,
- 178, 179, 180, 181, 182, 183, 184, 185, 186, 187,
- 189, 190, 191, 192, 193, 194, 195, 196, 197, 199,
- 204, 205, 206, 207, 208, 211, 213, 214, 218, 224,
- 228, 229, 234, 235, 236, 241, 242, 243, 247, 248,
- 256, 257, 258, 263, 265, 268, 272, 273, 277, 278,
- 283, 284, 289, 290, 291, 295, 296, 303, 318, 323,
- 326, 334, 340, 341, 346, 352, 361, 369, 377, 384,
- 392, 400, 407, 413, 414, 419, 420, 422, 426, 433,
- 439, 449, 453, 457, 464, 471, 475, 483, 492, 493,
- 498, 499, 504, 505, 511, 519, 520, 525, 526, 530,
- 531, 535, 549, 550, 554, 559, 564, 565, 566, 570,
- 576, 578, 579, 583, 591, 597, 598, 601, 603, 604,
- 608
+ 160, 161, 165, 166, 171, 172, 174, 175, 176, 177,
+ 178, 179, 180, 181, 182, 183, 184, 185, 186, 188,
+ 189, 190, 191, 192, 193, 194, 195, 196, 198, 203,
+ 204, 205, 206, 207, 211, 215, 216, 221, 222, 223,
+ 228, 229, 230, 234, 235, 243, 244, 245, 250, 252,
+ 255, 259, 260, 264, 265, 270, 271, 276, 277, 278,
+ 282, 283, 290, 305, 310, 313, 321, 327, 328, 333,
+ 339, 348, 356, 364, 371, 379, 387, 394, 400, 401,
+ 406, 407, 409, 413, 420, 426, 436, 440, 444, 451,
+ 458, 462, 470, 479, 480, 485, 486, 491, 492, 498,
+ 506, 507, 512, 513, 517, 518, 522, 536, 537, 541,
+ 546, 551, 552, 553, 557, 563, 565, 566, 570, 578,
+ 584, 585, 588, 590, 591, 595
};
#endif
-#if YYDEBUG || YYERROR_VERBOSE || 0
+/** Accessing symbol of state STATE. */
+#define YY_ACCESSING_SYMBOL(State) YY_CAST (yysymbol_kind_t, yystos[State])
+
+#if YYDEBUG || 0
+/* The user-facing name of the symbol whose (internal) number is
+ YYSYMBOL. No bounds checking. */
+static const char *yysymbol_name (yysymbol_kind_t yysymbol) YY_ATTRIBUTE_UNUSED;
+
/* YYTNAME[SYMBOL-NUM] -- String name of the symbol SYMBOL-NUM.
First, the terminals, then, starting at YYNTOKENS, nonterminals. */
static const char *const yytname[] =
{
- "$end", "error", "$undefined", "PARS_INT_LIT", "PARS_FLOAT_LIT",
- "PARS_STR_LIT", "PARS_NULL_LIT", "PARS_ID_TOKEN", "PARS_AND_TOKEN",
- "PARS_OR_TOKEN", "PARS_NOT_TOKEN", "PARS_GE_TOKEN", "PARS_LE_TOKEN",
- "PARS_NE_TOKEN", "PARS_PROCEDURE_TOKEN", "PARS_IN_TOKEN",
- "PARS_INT_TOKEN", "PARS_CHAR_TOKEN", "PARS_IS_TOKEN", "PARS_BEGIN_TOKEN",
- "PARS_END_TOKEN", "PARS_IF_TOKEN", "PARS_THEN_TOKEN", "PARS_ELSE_TOKEN",
- "PARS_ELSIF_TOKEN", "PARS_LOOP_TOKEN", "PARS_WHILE_TOKEN",
- "PARS_RETURN_TOKEN", "PARS_SELECT_TOKEN", "PARS_COUNT_TOKEN",
- "PARS_FROM_TOKEN", "PARS_WHERE_TOKEN", "PARS_FOR_TOKEN",
- "PARS_DDOT_TOKEN", "PARS_ORDER_TOKEN", "PARS_BY_TOKEN", "PARS_ASC_TOKEN",
- "PARS_DESC_TOKEN", "PARS_INSERT_TOKEN", "PARS_INTO_TOKEN",
- "PARS_VALUES_TOKEN", "PARS_UPDATE_TOKEN", "PARS_SET_TOKEN",
- "PARS_DELETE_TOKEN", "PARS_CURRENT_TOKEN", "PARS_OF_TOKEN",
- "PARS_CREATE_TOKEN", "PARS_TABLE_TOKEN", "PARS_INDEX_TOKEN",
- "PARS_UNIQUE_TOKEN", "PARS_CLUSTERED_TOKEN", "PARS_ON_TOKEN",
- "PARS_ASSIGN_TOKEN", "PARS_DECLARE_TOKEN", "PARS_CURSOR_TOKEN",
- "PARS_SQL_TOKEN", "PARS_OPEN_TOKEN", "PARS_FETCH_TOKEN",
- "PARS_CLOSE_TOKEN", "PARS_NOTFOUND_TOKEN", "PARS_TO_BINARY_TOKEN",
- "PARS_SUBSTR_TOKEN", "PARS_CONCAT_TOKEN", "PARS_INSTR_TOKEN",
- "PARS_LENGTH_TOKEN", "PARS_COMMIT_TOKEN", "PARS_ROLLBACK_TOKEN",
- "PARS_WORK_TOKEN", "PARS_EXIT_TOKEN", "PARS_FUNCTION_TOKEN",
- "PARS_LOCK_TOKEN", "PARS_SHARE_TOKEN", "PARS_MODE_TOKEN",
- "PARS_LIKE_TOKEN", "PARS_LIKE_TOKEN_EXACT", "PARS_LIKE_TOKEN_PREFIX",
- "PARS_LIKE_TOKEN_SUFFIX", "PARS_LIKE_TOKEN_SUBSTR",
- "PARS_TABLE_NAME_TOKEN", "PARS_BIGINT_TOKEN", "'='", "'<'", "'>'", "'-'",
- "'+'", "'*'", "'/'", "NEG", "'%'", "';'", "'('", "')'", "'?'", "','",
- "'{'", "'}'", "$accept", "top_statement", "statement", "statement_list",
- "exp", "function_name", "question_mark_list", "stored_procedure_call",
- "user_function_call", "table_list", "variable_list", "exp_list",
- "select_item", "select_item_list", "select_list", "search_condition",
+ "\"end of file\"", "error", "\"invalid token\"", "PARS_INT_LIT",
+ "PARS_FLOAT_LIT", "PARS_STR_LIT", "PARS_NULL_LIT", "PARS_ID_TOKEN",
+ "PARS_AND_TOKEN", "PARS_OR_TOKEN", "PARS_NOT_TOKEN", "PARS_GE_TOKEN",
+ "PARS_LE_TOKEN", "PARS_NE_TOKEN", "PARS_PROCEDURE_TOKEN",
+ "PARS_IN_TOKEN", "PARS_INT_TOKEN", "PARS_CHAR_TOKEN", "PARS_IS_TOKEN",
+ "PARS_BEGIN_TOKEN", "PARS_END_TOKEN", "PARS_IF_TOKEN", "PARS_THEN_TOKEN",
+ "PARS_ELSE_TOKEN", "PARS_ELSIF_TOKEN", "PARS_LOOP_TOKEN",
+ "PARS_WHILE_TOKEN", "PARS_RETURN_TOKEN", "PARS_SELECT_TOKEN",
+ "PARS_COUNT_TOKEN", "PARS_FROM_TOKEN", "PARS_WHERE_TOKEN",
+ "PARS_FOR_TOKEN", "PARS_DDOT_TOKEN", "PARS_ORDER_TOKEN", "PARS_BY_TOKEN",
+ "PARS_ASC_TOKEN", "PARS_DESC_TOKEN", "PARS_INSERT_TOKEN",
+ "PARS_INTO_TOKEN", "PARS_VALUES_TOKEN", "PARS_UPDATE_TOKEN",
+ "PARS_SET_TOKEN", "PARS_DELETE_TOKEN", "PARS_CURRENT_TOKEN",
+ "PARS_OF_TOKEN", "PARS_CREATE_TOKEN", "PARS_TABLE_TOKEN",
+ "PARS_INDEX_TOKEN", "PARS_UNIQUE_TOKEN", "PARS_CLUSTERED_TOKEN",
+ "PARS_ON_TOKEN", "PARS_ASSIGN_TOKEN", "PARS_DECLARE_TOKEN",
+ "PARS_CURSOR_TOKEN", "PARS_SQL_TOKEN", "PARS_OPEN_TOKEN",
+ "PARS_FETCH_TOKEN", "PARS_CLOSE_TOKEN", "PARS_NOTFOUND_TOKEN",
+ "PARS_TO_BINARY_TOKEN", "PARS_SUBSTR_TOKEN", "PARS_CONCAT_TOKEN",
+ "PARS_INSTR_TOKEN", "PARS_LENGTH_TOKEN", "PARS_COMMIT_TOKEN",
+ "PARS_ROLLBACK_TOKEN", "PARS_WORK_TOKEN", "PARS_EXIT_TOKEN",
+ "PARS_FUNCTION_TOKEN", "PARS_LOCK_TOKEN", "PARS_SHARE_TOKEN",
+ "PARS_MODE_TOKEN", "PARS_LIKE_TOKEN", "PARS_LIKE_TOKEN_EXACT",
+ "PARS_LIKE_TOKEN_PREFIX", "PARS_LIKE_TOKEN_SUFFIX",
+ "PARS_LIKE_TOKEN_SUBSTR", "PARS_TABLE_NAME_TOKEN", "PARS_BIGINT_TOKEN",
+ "'='", "'<'", "'>'", "'-'", "'+'", "'*'", "'/'", "NEG", "'%'", "';'",
+ "'('", "')'", "','", "$accept", "top_statement", "statement",
+ "statement_list", "exp", "function_name", "user_function_call",
+ "table_list", "variable_list", "exp_list", "select_item",
+ "select_item_list", "select_list", "search_condition",
"for_update_clause", "lock_shared_clause", "order_direction",
"order_by_clause", "select_statement", "insert_statement_start",
"insert_statement", "column_assignment", "column_assignment_list",
@@ -592,12 +737,18 @@ static const char *const yytname[] =
"cursor_declaration", "function_declaration", "declaration",
"declaration_list", "procedure_definition", YY_NULLPTR
};
+
+static const char *
+yysymbol_name (yysymbol_kind_t yysymbol)
+{
+ return yytname[yysymbol];
+}
#endif
-# ifdef YYPRINT
+#ifdef YYPRINT
/* YYTOKNUM[NUM] -- (External) token number corresponding to the
(internal) symbol number NUM (which must be that of a token). */
-static const yytype_uint16 yytoknum[] =
+static const yytype_int16 yytoknum[] =
{
0, 256, 257, 258, 259, 260, 261, 262, 263, 264,
265, 266, 267, 268, 269, 270, 271, 272, 273, 274,
@@ -608,54 +759,53 @@ static const yytype_uint16 yytoknum[] =
315, 316, 317, 318, 319, 320, 321, 322, 323, 324,
325, 326, 327, 328, 329, 330, 331, 332, 333, 334,
61, 60, 62, 45, 43, 42, 47, 335, 37, 59,
- 40, 41, 63, 44, 123, 125
+ 40, 41, 44
};
-# endif
+#endif
-#define YYPACT_NINF -129
+#define YYPACT_NINF (-146)
-#define yypact_value_is_default(Yystate) \
- (!!((Yystate) == (-129)))
+#define yypact_value_is_default(Yyn) \
+ ((Yyn) == YYPACT_NINF)
-#define YYTABLE_NINF -1
+#define YYTABLE_NINF (-1)
-#define yytable_value_is_error(Yytable_value) \
+#define yytable_value_is_error(Yyn) \
0
/* YYPACT[STATE-NUM] -- Index in YYTABLE of the portion describing
STATE-NUM. */
static const yytype_int16 yypact[] =
{
- 5, 34, 46, -28, -41, -129, -129, -12, 45, 57,
- 23, -129, 9, -129, -129, -129, 20, -9, -129, -129,
- -129, -129, 2, -129, 83, 87, 278, -129, 93, 28,
- 71, 427, 427, -129, 335, 105, 85, -1, 104, -27,
- 129, 132, 133, 76, 77, -129, 141, -129, 149, -129,
- 61, 19, 62, 118, 65, 66, 118, 68, 69, 70,
- 72, 73, 74, 75, 78, 79, 82, 84, 89, 90,
- 91, 94, 138, -129, 427, -129, -129, -129, -129, 86,
- 427, 96, -129, -129, -129, -129, -129, 427, 427, 438,
- 92, 454, 95, -129, 1, -129, -24, 130, 157, -1,
- -129, -129, 144, -1, -1, -129, 139, -129, 154, -129,
- -129, -129, 98, -129, -129, -129, 108, -129, -129, 345,
- -129, -129, -129, -129, -129, -129, -129, -129, -129, -129,
- -129, -129, -129, -129, -129, -129, -129, -129, -129, -129,
- -129, 112, 1, 135, 285, 143, -8, 15, 427, 427,
- 427, 427, 427, 278, 203, 427, 427, 427, 427, 427,
- 427, 427, 427, 278, 124, 204, 381, -1, 427, -129,
- 209, -129, 120, -129, 173, 215, 131, 427, 180, 1,
- -129, -129, -129, -129, 285, 285, 30, 30, 1, 10,
- -129, 30, 30, 30, 60, 60, -8, -8, 1, -39,
- 192, 137, -129, 136, -129, -13, -129, 472, 146, -129,
- 147, 225, 227, 151, -129, 136, -129, -21, 0, 229,
- 278, 427, -129, 213, 219, -129, 427, 220, -129, 237,
- 427, -1, 214, 427, 427, 209, 23, -129, 14, 196,
- 160, 158, 162, -129, -129, 278, 486, -129, 231, 1,
- -129, -129, -129, 218, 194, 517, 1, -129, 175, -129,
- 225, -1, -129, -129, -129, 278, -129, -129, 251, 234,
- 278, 266, 260, -129, 181, 278, 201, 239, -129, 235,
- 184, 271, -129, 272, 208, 275, 258, -129, -129, -129,
- 17, -129, -7, -129, -129, 277, -129, -129, -129, -129
+ 8, 25, 40, -44, -43, -146, -146, -41, 37, 54,
+ 9, -146, 44, -146, -146, -146, -24, -30, -146, -146,
+ -146, -146, -5, -146, 63, 97, 543, -146, 93, 24,
+ 79, 148, 148, -146, 13, 126, 98, 0, 111, -3,
+ 135, 136, 138, 80, 83, -146, -146, 414, 67, -7,
+ 70, 130, 84, 85, 130, 86, 87, 88, 89, 90,
+ 91, 92, 94, 95, 99, 100, 104, 105, 107, 108,
+ 141, -146, 148, -146, -146, -146, -146, 112, 148, 119,
+ -146, -146, -146, -146, -146, 148, 148, 193, 123, 208,
+ 124, -146, 304, -146, -26, 152, 172, 0, -146, -146,
+ 181, 0, 0, -146, 174, -146, 186, -146, -146, -146,
+ -146, -146, -146, 137, -146, -146, 102, -146, -146, -146,
+ -146, -146, -146, -146, -146, -146, -146, -146, -146, -146,
+ -146, -146, -146, -146, -146, -146, -146, -146, 139, 304,
+ 167, -1, 170, 17, 159, 148, 148, 148, 148, 148,
+ 543, 225, 148, 148, 148, 148, 148, 148, 148, 148,
+ 543, 149, 228, 31, 0, 148, -146, 229, -146, 147,
+ -146, 198, 240, 148, 203, 304, -146, -146, -146, -146,
+ -1, -1, 16, 16, 304, 371, -146, 16, 16, 16,
+ 49, 49, 17, 17, 304, -64, 457, 158, -146, 160,
+ -146, -25, -146, 247, 171, -146, 161, 250, 254, 164,
+ -146, 160, -38, 255, 543, 148, -146, 239, 244, -146,
+ 148, 242, -146, 258, 148, 0, 236, 148, 148, 229,
+ 9, -146, -33, 218, 179, -146, -146, 543, 274, -146,
+ 251, 304, -146, -146, -146, 230, 214, 289, 304, -146,
+ 205, -146, 250, 0, -146, 543, -146, -146, 284, 269,
+ 543, 301, 295, -146, 216, 543, 237, 272, -146, 500,
+ 219, 303, -146, 311, 249, 312, 286, -146, -146, -146,
+ -28, -146, 103, -146, -146, 315, -146, -146, -146, -146
};
/* YYDEFACT[STATE-NUM] -- Default reduction number in state STATE-NUM.
@@ -663,278 +813,276 @@ static const yytype_int16 yypact[] =
means the default is an error. */
static const yytype_uint8 yydefact[] =
{
- 0, 0, 0, 0, 0, 1, 2, 0, 0, 140,
- 0, 141, 147, 136, 138, 137, 0, 0, 142, 145,
- 146, 148, 0, 139, 0, 0, 0, 149, 0, 0,
- 0, 0, 0, 112, 70, 0, 0, 0, 0, 127,
- 0, 0, 0, 0, 0, 111, 0, 23, 0, 3,
- 0, 0, 0, 76, 0, 0, 76, 0, 0, 0,
+ 0, 0, 0, 0, 0, 1, 2, 0, 0, 135,
+ 0, 136, 142, 131, 133, 132, 0, 0, 137, 140,
+ 141, 143, 0, 134, 0, 0, 0, 144, 0, 0,
+ 0, 0, 0, 107, 65, 0, 0, 0, 0, 122,
+ 0, 0, 0, 0, 0, 106, 22, 0, 0, 0,
+ 0, 71, 0, 0, 71, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 144, 0, 27, 28, 29, 30, 25,
- 0, 31, 50, 51, 52, 53, 54, 0, 0, 0,
- 0, 0, 0, 73, 68, 71, 75, 0, 0, 0,
- 132, 133, 0, 0, 0, 128, 129, 113, 0, 114,
- 134, 135, 0, 150, 24, 10, 0, 90, 11, 0,
- 96, 97, 14, 15, 99, 100, 12, 13, 9, 7,
- 4, 5, 6, 8, 16, 18, 17, 21, 22, 19,
- 20, 0, 101, 0, 47, 0, 36, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 65, 0, 0, 62, 0, 0, 0, 88,
- 0, 98, 0, 130, 0, 62, 55, 65, 0, 77,
- 143, 48, 49, 37, 45, 46, 42, 43, 44, 105,
- 39, 38, 40, 41, 33, 32, 34, 35, 66, 0,
- 0, 0, 63, 74, 72, 76, 60, 0, 0, 92,
- 95, 0, 0, 63, 116, 115, 56, 0, 0, 0,
- 0, 0, 103, 107, 0, 26, 0, 0, 69, 0,
- 0, 0, 78, 0, 0, 0, 0, 118, 0, 0,
- 0, 0, 0, 89, 94, 106, 0, 104, 0, 67,
- 109, 64, 61, 0, 80, 0, 91, 93, 120, 124,
- 0, 0, 59, 58, 57, 0, 108, 79, 0, 85,
- 0, 0, 122, 119, 0, 102, 0, 0, 87, 0,
- 0, 0, 117, 0, 0, 0, 0, 121, 123, 125,
- 0, 81, 82, 110, 131, 0, 83, 84, 86, 126
+ 0, 139, 0, 26, 27, 28, 29, 24, 0, 30,
+ 49, 50, 51, 52, 53, 0, 0, 0, 0, 0,
+ 0, 68, 63, 66, 70, 0, 0, 0, 127, 128,
+ 0, 0, 0, 123, 124, 108, 0, 109, 129, 130,
+ 145, 23, 9, 0, 85, 10, 0, 91, 92, 13,
+ 14, 94, 95, 11, 12, 8, 6, 3, 4, 5,
+ 7, 15, 17, 16, 20, 21, 18, 19, 0, 96,
+ 0, 46, 0, 35, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 60,
+ 0, 0, 57, 0, 0, 0, 83, 0, 93, 0,
+ 125, 0, 57, 60, 0, 72, 138, 47, 48, 36,
+ 44, 45, 41, 42, 43, 100, 38, 37, 39, 40,
+ 32, 31, 33, 34, 61, 0, 0, 0, 58, 69,
+ 67, 71, 55, 0, 0, 87, 90, 0, 0, 58,
+ 111, 110, 0, 0, 0, 0, 98, 102, 0, 25,
+ 0, 0, 64, 0, 0, 0, 73, 0, 0, 0,
+ 0, 113, 0, 0, 0, 84, 89, 101, 0, 99,
+ 0, 62, 104, 59, 56, 0, 75, 0, 86, 88,
+ 115, 119, 0, 0, 54, 0, 103, 74, 0, 80,
+ 0, 0, 117, 114, 0, 97, 0, 0, 82, 0,
+ 0, 0, 112, 0, 0, 0, 0, 116, 118, 120,
+ 0, 76, 77, 105, 126, 0, 78, 79, 81, 121
};
/* YYPGOTO[NTERM-NUM]. */
static const yytype_int16 yypgoto[] =
{
- -129, -129, -48, -128, -30, -129, -129, -129, -129, -129,
- 113, 110, 123, -129, -129, -52, -129, -129, -129, -129,
- -40, -129, -129, 55, -129, 238, -129, -129, -129, -129,
- -129, -129, -129, 88, -129, -129, -129, -129, -129, -129,
- -129, -129, -129, -129, 35, -129, -129, -129, -129, -129,
- -129, -129, -129, -96, -129, -129, 81, 290, -129, -129,
- -129, 286, -129, -129
+ -146, -146, -47, -145, -29, -146, -146, -146, 151, 153,
+ 162, -146, -146, -53, -146, -146, -146, -146, -18, -146,
+ -146, 106, -146, 270, -146, -146, -146, -146, -146, -146,
+ -146, 117, -146, -146, -146, -146, -146, -146, -146, -146,
+ -146, -146, 96, -146, -146, -146, -146, -146, -146, -146,
+ -146, -93, -146, -146, 109, 324, -146, -146, -146, 316,
+ -146, -146
};
/* YYDEFGOTO[NTERM-NUM]. */
static const yytype_int16 yydefgoto[] =
{
- -1, 2, 47, 48, 94, 90, 217, 49, 214, 205,
- 203, 199, 95, 96, 97, 120, 254, 269, 298, 278,
- 50, 51, 52, 209, 210, 121, 53, 54, 55, 56,
- 57, 58, 59, 222, 223, 224, 60, 61, 62, 63,
- 64, 65, 66, 67, 237, 238, 272, 282, 68, 290,
- 106, 174, 69, 102, 70, 71, 16, 11, 12, 19,
- 20, 21, 22, 3
+ 0, 2, 46, 47, 92, 88, 210, 201, 199, 195,
+ 93, 94, 95, 117, 246, 259, 288, 268, 48, 49,
+ 50, 205, 206, 118, 51, 52, 53, 54, 55, 56,
+ 57, 216, 217, 218, 58, 59, 60, 61, 62, 63,
+ 64, 65, 231, 232, 262, 272, 66, 280, 104, 171,
+ 67, 100, 68, 69, 16, 11, 12, 19, 20, 21,
+ 22, 3
};
/* YYTABLE[YYPACT[STATE-NUM]] -- What to do in state STATE-NUM. If
positive, shift that token. If negative, reduce the rule whose
number is the opposite. If YYTABLE_NINF, syntax error. */
-static const yytype_uint16 yytable[] =
+static const yytype_int16 yytable[] =
{
- 114, 89, 91, 169, 124, 152, 100, 171, 172, 148,
- 149, 117, 150, 151, 152, 165, 10, 30, 230, 1,
- 104, 26, 105, 148, 149, 189, 150, 151, 152, 296,
- 297, 31, 141, 220, 221, 200, 32, 33, 34, 13,
- 14, 4, 35, 152, 142, 24, 5, 34, 36, 7,
- 144, 37, 225, 38, 226, 17, 39, 146, 147, 116,
- 25, 6, 17, 9, 10, 154, 40, 41, 42, 166,
- 241, 206, 242, 152, 154, 43, 44, 101, 45, 8,
- 231, 155, 156, 157, 158, 159, 160, 161, 154, 179,
- 28, 243, 245, 226, 29, 155, 156, 157, 158, 159,
- 160, 161, 15, 154, 46, 259, 183, 260, 294, 23,
- 295, 72, 98, 158, 159, 160, 161, 73, 184, 185,
- 186, 187, 188, 74, 99, 191, 192, 193, 194, 195,
- 196, 197, 198, 154, 103, 252, 107, 275, 207, 108,
- 109, 114, 279, 110, 111, 160, 161, 198, 112, 119,
- 115, 118, 114, 232, 122, 123, 30, 126, 127, 128,
- 167, 129, 130, 131, 132, 274, 34, 133, 134, 113,
- 31, 135, 168, 136, 143, 32, 33, 34, 137, 138,
- 139, 35, 162, 140, 145, 164, 170, 36, 176, 173,
- 37, 246, 38, 175, 181, 39, 249, 114, 177, 30,
- 179, 180, 182, 255, 256, 40, 41, 42, 190, 201,
- 211, 202, 227, 31, 43, 44, 208, 45, 32, 33,
- 34, 212, 213, 216, 35, 219, 234, 114, 228, 229,
- 36, 114, 236, 37, 239, 38, 244, 221, 39, 248,
- 235, 240, 30, 46, 251, 250, 253, 261, 40, 41,
- 42, 262, 266, 263, 264, 286, 31, 43, 44, 267,
- 45, 32, 33, 34, 268, 271, 276, 35, 277, 280,
- 281, 283, 284, 36, 285, 287, 37, 288, 38, 289,
- 291, 39, 292, 293, 299, 30, 46, 218, 215, 204,
- 257, 40, 41, 42, 125, 273, 150, 151, 152, 31,
- 43, 44, 18, 45, 32, 33, 34, 0, 27, 0,
- 35, 247, 0, 0, 0, 0, 36, 258, 0, 37,
- 0, 38, 0, 0, 39, 0, 0, 0, 0, 46,
- 0, 0, 0, 0, 40, 41, 42, 0, 75, 76,
- 77, 78, 79, 43, 44, 80, 45, 0, 75, 76,
- 77, 78, 79, 0, 0, 80, 0, 0, 154, 0,
- 0, 0, 0, 0, 92, 155, 156, 157, 158, 159,
- 160, 161, 46, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 75, 76, 77, 78, 79, 178,
- 81, 80, 0, 0, 0, 82, 83, 84, 85, 86,
- 81, 0, 0, 0, 0, 82, 83, 84, 85, 86,
- 92, 0, 0, 0, 0, 0, 0, 0, 87, 0,
- 93, 0, 0, 0, 0, 88, 0, 0, 87, 0,
- 75, 76, 77, 78, 79, 88, 81, 80, 0, 0,
- 0, 82, 83, 84, 85, 86, 148, 149, 0, 150,
- 151, 152, 0, 0, 0, 0, 0, 0, 0, 0,
- 153, 0, 148, 149, 87, 150, 151, 152, 0, 0,
- 0, 88, 0, 0, 0, 0, 0, 0, 0, 163,
- 148, 149, 81, 150, 151, 152, 0, 82, 83, 84,
- 85, 86, 0, 0, 148, 149, 0, 150, 151, 152,
- 0, 0, 0, 0, 0, 233, 0, 0, 265, 0,
- 87, 154, 0, 0, 0, 0, 0, 88, 155, 156,
- 157, 158, 159, 160, 161, 148, 149, 154, 150, 151,
- 152, 0, 0, 0, 155, 156, 157, 158, 159, 160,
- 161, 0, 270, 0, 0, 154, 0, 0, 0, 0,
- 0, 0, 155, 156, 157, 158, 159, 160, 161, 154,
- 0, 0, 0, 0, 0, 0, 155, 156, 157, 158,
- 159, 160, 161, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 154, 0, 0, 0, 0, 0, 0, 155, 156, 157,
- 158, 159, 160, 161
+ 111, 121, 87, 89, 166, 185, 224, 98, 168, 169,
+ 147, 148, 149, 162, 26, 196, 73, 74, 75, 76,
+ 77, 34, 1, 78, 24, 13, 14, 219, 220, 149,
+ 149, 114, 4, 113, 73, 74, 75, 76, 77, 25,
+ 5, 78, 90, 139, 102, 6, 103, 7, 17, 141,
+ 8, 10, 138, 235, 220, 9, 143, 144, 251, 252,
+ 90, 10, 149, 284, 285, 23, 163, 225, 79, 237,
+ 28, 202, 151, 80, 81, 82, 83, 84, 99, 152,
+ 153, 154, 155, 156, 157, 158, 79, 175, 15, 151,
+ 151, 80, 81, 82, 83, 84, 85, 17, 91, 155,
+ 156, 157, 158, 86, 29, 73, 74, 75, 76, 77,
+ 265, 70, 78, 71, 85, 269, 180, 181, 182, 183,
+ 184, 86, 151, 187, 188, 189, 190, 191, 192, 193,
+ 194, 72, 244, 96, 157, 158, 203, 97, 111, 286,
+ 287, 101, 105, 106, 194, 107, 174, 108, 226, 111,
+ 109, 73, 74, 75, 76, 77, 112, 79, 78, 115,
+ 264, 116, 80, 81, 82, 83, 84, 145, 146, 34,
+ 147, 148, 149, 119, 120, 123, 124, 125, 126, 127,
+ 128, 129, 164, 130, 131, 85, 238, 165, 132, 133,
+ 111, 241, 86, 134, 135, 175, 136, 137, 247, 248,
+ 140, 145, 146, 79, 147, 148, 149, 142, 80, 81,
+ 82, 83, 84, 159, 161, 150, 145, 146, 111, 147,
+ 148, 149, 111, 167, 170, 172, 177, 173, 176, 178,
+ 186, 85, 151, 160, 197, 198, 204, 207, 86, 152,
+ 153, 154, 155, 156, 157, 158, 208, 209, 213, 222,
+ 179, 228, 223, 229, 234, 145, 146, 230, 147, 148,
+ 149, 233, 236, 215, 240, 243, 151, 242, 245, 253,
+ 254, 257, 256, 152, 153, 154, 155, 156, 157, 158,
+ 227, 151, 145, 146, 258, 147, 148, 149, 152, 153,
+ 154, 155, 156, 157, 158, 261, 255, 145, 146, 266,
+ 147, 148, 149, 267, 270, 271, 273, 275, 274, 278,
+ 277, 283, 145, 146, 260, 147, 148, 149, 279, 282,
+ 151, 281, 289, 211, 122, 200, 212, 152, 153, 154,
+ 155, 156, 157, 158, 239, 249, 18, 0, 27, 250,
+ 0, 0, 0, 0, 0, 0, 0, 151, 263, 0,
+ 0, 0, 0, 0, 152, 153, 154, 155, 156, 157,
+ 158, 0, 151, 0, 0, 0, 0, 0, 0, 152,
+ 153, 154, 155, 156, 157, 158, 0, 151, 30, 0,
+ 0, 0, 0, 0, 152, 153, 154, 155, 156, 157,
+ 158, 0, 31, 0, 214, 215, 0, 32, 33, 34,
+ 0, 0, 0, 35, 0, 0, 0, 0, 0, 36,
+ 0, 0, 37, 0, 38, 0, 0, 39, 0, 0,
+ 0, 30, 0, 0, 0, 0, 0, 40, 41, 42,
+ 0, 0, 0, 0, 110, 31, 43, 44, 0, 45,
+ 32, 33, 34, 0, 0, 0, 35, 0, 0, 0,
+ 0, 0, 36, 0, 0, 37, 0, 38, 0, 0,
+ 39, 0, 0, 0, 30, 0, 0, 0, 0, 0,
+ 40, 41, 42, 0, 0, 0, 0, 221, 31, 43,
+ 44, 0, 45, 32, 33, 34, 0, 0, 0, 35,
+ 0, 0, 0, 0, 0, 36, 0, 0, 37, 0,
+ 38, 0, 0, 39, 0, 0, 0, 30, 0, 0,
+ 0, 0, 0, 40, 41, 42, 0, 0, 0, 0,
+ 276, 31, 43, 44, 0, 45, 32, 33, 34, 0,
+ 0, 0, 35, 0, 0, 0, 0, 0, 36, 0,
+ 0, 37, 0, 38, 0, 0, 39, 0, 0, 0,
+ 30, 0, 0, 0, 0, 0, 40, 41, 42, 0,
+ 0, 0, 0, 0, 31, 43, 44, 0, 45, 32,
+ 33, 34, 0, 0, 0, 35, 0, 0, 0, 0,
+ 0, 36, 0, 0, 37, 0, 38, 0, 0, 39,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 40,
+ 41, 42, 0, 0, 0, 0, 0, 0, 43, 44,
+ 0, 45
};
static const yytype_int16 yycheck[] =
{
- 48, 31, 32, 99, 56, 13, 7, 103, 104, 8,
- 9, 51, 11, 12, 13, 39, 7, 7, 31, 14,
- 47, 19, 49, 8, 9, 153, 11, 12, 13, 36,
- 37, 21, 72, 23, 24, 163, 26, 27, 28, 16,
- 17, 7, 32, 13, 74, 54, 0, 28, 38, 90,
- 80, 41, 91, 43, 93, 53, 46, 87, 88, 40,
- 69, 89, 53, 18, 7, 73, 56, 57, 58, 93,
- 91, 167, 93, 13, 73, 65, 66, 78, 68, 91,
- 93, 80, 81, 82, 83, 84, 85, 86, 73, 119,
- 7, 91, 220, 93, 7, 80, 81, 82, 83, 84,
- 85, 86, 79, 73, 94, 91, 91, 93, 91, 89,
- 93, 18, 7, 83, 84, 85, 86, 89, 148, 149,
- 150, 151, 152, 52, 39, 155, 156, 157, 158, 159,
- 160, 161, 162, 73, 30, 231, 7, 265, 168, 7,
- 7, 189, 270, 67, 67, 85, 86, 177, 7, 31,
- 89, 89, 200, 205, 89, 89, 7, 89, 89, 89,
- 30, 89, 89, 89, 89, 261, 28, 89, 89, 20,
- 21, 89, 15, 89, 88, 26, 27, 28, 89, 89,
- 89, 32, 90, 89, 88, 90, 42, 38, 90, 50,
- 41, 221, 43, 39, 59, 46, 226, 245, 90, 7,
- 230, 89, 59, 233, 234, 56, 57, 58, 5, 85,
- 90, 7, 20, 21, 65, 66, 7, 68, 26, 27,
- 28, 48, 7, 92, 32, 45, 80, 275, 91, 93,
- 38, 279, 7, 41, 7, 43, 7, 24, 46, 20,
- 93, 90, 7, 94, 7, 25, 32, 51, 56, 57,
- 58, 91, 21, 95, 92, 20, 21, 65, 66, 41,
- 68, 26, 27, 28, 70, 90, 15, 32, 34, 3,
- 10, 90, 71, 38, 35, 91, 41, 6, 43, 7,
- 72, 46, 7, 25, 7, 7, 94, 177, 175, 166,
- 235, 56, 57, 58, 56, 260, 11, 12, 13, 21,
- 65, 66, 12, 68, 26, 27, 28, -1, 22, -1,
- 32, 223, -1, -1, -1, -1, 38, 236, -1, 41,
- -1, 43, -1, -1, 46, -1, -1, -1, -1, 94,
- -1, -1, -1, -1, 56, 57, 58, -1, 3, 4,
- 5, 6, 7, 65, 66, 10, 68, -1, 3, 4,
- 5, 6, 7, -1, -1, 10, -1, -1, 73, -1,
- -1, -1, -1, -1, 29, 80, 81, 82, 83, 84,
- 85, 86, 94, -1, -1, -1, -1, -1, -1, -1,
- -1, -1, -1, -1, 3, 4, 5, 6, 7, 44,
- 55, 10, -1, -1, -1, 60, 61, 62, 63, 64,
- 55, -1, -1, -1, -1, 60, 61, 62, 63, 64,
- 29, -1, -1, -1, -1, -1, -1, -1, 83, -1,
- 85, -1, -1, -1, -1, 90, -1, -1, 83, -1,
- 3, 4, 5, 6, 7, 90, 55, 10, -1, -1,
- -1, 60, 61, 62, 63, 64, 8, 9, -1, 11,
- 12, 13, -1, -1, -1, -1, -1, -1, -1, -1,
- 22, -1, 8, 9, 83, 11, 12, 13, -1, -1,
- -1, 90, -1, -1, -1, -1, -1, -1, -1, 25,
- 8, 9, 55, 11, 12, 13, -1, 60, 61, 62,
- 63, 64, -1, -1, 8, 9, -1, 11, 12, 13,
- -1, -1, -1, -1, -1, 33, -1, -1, 22, -1,
- 83, 73, -1, -1, -1, -1, -1, 90, 80, 81,
- 82, 83, 84, 85, 86, 8, 9, 73, 11, 12,
- 13, -1, -1, -1, 80, 81, 82, 83, 84, 85,
- 86, -1, 25, -1, -1, 73, -1, -1, -1, -1,
- -1, -1, 80, 81, 82, 83, 84, 85, 86, 73,
- -1, -1, -1, -1, -1, -1, 80, 81, 82, 83,
- 84, 85, 86, -1, -1, -1, -1, -1, -1, -1,
- -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
- 73, -1, -1, -1, -1, -1, -1, 80, 81, 82,
- 83, 84, 85, 86
+ 47, 54, 31, 32, 97, 150, 31, 7, 101, 102,
+ 11, 12, 13, 39, 19, 160, 3, 4, 5, 6,
+ 7, 28, 14, 10, 54, 16, 17, 91, 92, 13,
+ 13, 49, 7, 40, 3, 4, 5, 6, 7, 69,
+ 0, 10, 29, 72, 47, 89, 49, 90, 53, 78,
+ 91, 7, 70, 91, 92, 18, 85, 86, 91, 92,
+ 29, 7, 13, 91, 92, 89, 92, 92, 55, 214,
+ 7, 164, 73, 60, 61, 62, 63, 64, 78, 80,
+ 81, 82, 83, 84, 85, 86, 55, 116, 79, 73,
+ 73, 60, 61, 62, 63, 64, 83, 53, 85, 83,
+ 84, 85, 86, 90, 7, 3, 4, 5, 6, 7,
+ 255, 18, 10, 89, 83, 260, 145, 146, 147, 148,
+ 149, 90, 73, 152, 153, 154, 155, 156, 157, 158,
+ 159, 52, 225, 7, 85, 86, 165, 39, 185, 36,
+ 37, 30, 7, 7, 173, 7, 44, 67, 201, 196,
+ 67, 3, 4, 5, 6, 7, 89, 55, 10, 89,
+ 253, 31, 60, 61, 62, 63, 64, 8, 9, 28,
+ 11, 12, 13, 89, 89, 89, 89, 89, 89, 89,
+ 89, 89, 30, 89, 89, 83, 215, 15, 89, 89,
+ 237, 220, 90, 89, 89, 224, 89, 89, 227, 228,
+ 88, 8, 9, 55, 11, 12, 13, 88, 60, 61,
+ 62, 63, 64, 90, 90, 22, 8, 9, 265, 11,
+ 12, 13, 269, 42, 50, 39, 59, 90, 89, 59,
+ 5, 83, 73, 25, 85, 7, 7, 90, 90, 80,
+ 81, 82, 83, 84, 85, 86, 48, 7, 45, 91,
+ 91, 80, 92, 92, 90, 8, 9, 7, 11, 12,
+ 13, 7, 7, 24, 20, 7, 73, 25, 32, 51,
+ 91, 41, 21, 80, 81, 82, 83, 84, 85, 86,
+ 33, 73, 8, 9, 70, 11, 12, 13, 80, 81,
+ 82, 83, 84, 85, 86, 90, 22, 8, 9, 15,
+ 11, 12, 13, 34, 3, 10, 90, 35, 71, 6,
+ 91, 25, 8, 9, 25, 11, 12, 13, 7, 7,
+ 73, 72, 7, 172, 54, 163, 173, 80, 81, 82,
+ 83, 84, 85, 86, 217, 229, 12, -1, 22, 230,
+ -1, -1, -1, -1, -1, -1, -1, 73, 252, -1,
+ -1, -1, -1, -1, 80, 81, 82, 83, 84, 85,
+ 86, -1, 73, -1, -1, -1, -1, -1, -1, 80,
+ 81, 82, 83, 84, 85, 86, -1, 73, 7, -1,
+ -1, -1, -1, -1, 80, 81, 82, 83, 84, 85,
+ 86, -1, 21, -1, 23, 24, -1, 26, 27, 28,
+ -1, -1, -1, 32, -1, -1, -1, -1, -1, 38,
+ -1, -1, 41, -1, 43, -1, -1, 46, -1, -1,
+ -1, 7, -1, -1, -1, -1, -1, 56, 57, 58,
+ -1, -1, -1, -1, 20, 21, 65, 66, -1, 68,
+ 26, 27, 28, -1, -1, -1, 32, -1, -1, -1,
+ -1, -1, 38, -1, -1, 41, -1, 43, -1, -1,
+ 46, -1, -1, -1, 7, -1, -1, -1, -1, -1,
+ 56, 57, 58, -1, -1, -1, -1, 20, 21, 65,
+ 66, -1, 68, 26, 27, 28, -1, -1, -1, 32,
+ -1, -1, -1, -1, -1, 38, -1, -1, 41, -1,
+ 43, -1, -1, 46, -1, -1, -1, 7, -1, -1,
+ -1, -1, -1, 56, 57, 58, -1, -1, -1, -1,
+ 20, 21, 65, 66, -1, 68, 26, 27, 28, -1,
+ -1, -1, 32, -1, -1, -1, -1, -1, 38, -1,
+ -1, 41, -1, 43, -1, -1, 46, -1, -1, -1,
+ 7, -1, -1, -1, -1, -1, 56, 57, 58, -1,
+ -1, -1, -1, -1, 21, 65, 66, -1, 68, 26,
+ 27, 28, -1, -1, -1, 32, -1, -1, -1, -1,
+ -1, 38, -1, -1, 41, -1, 43, -1, -1, 46,
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, 56,
+ 57, 58, -1, -1, -1, -1, -1, -1, 65, 66,
+ -1, 68
};
/* YYSTOS[STATE-NUM] -- The (internal number of the) accessing
symbol of state STATE-NUM. */
static const yytype_uint8 yystos[] =
{
- 0, 14, 97, 159, 7, 0, 89, 90, 91, 18,
- 7, 153, 154, 16, 17, 79, 152, 53, 153, 155,
- 156, 157, 158, 89, 54, 69, 19, 157, 7, 7,
+ 0, 14, 94, 154, 7, 0, 89, 90, 91, 18,
+ 7, 148, 149, 16, 17, 79, 147, 53, 148, 150,
+ 151, 152, 153, 89, 54, 69, 19, 152, 7, 7,
7, 21, 26, 27, 28, 32, 38, 41, 43, 46,
- 56, 57, 58, 65, 66, 68, 94, 98, 99, 103,
- 116, 117, 118, 122, 123, 124, 125, 126, 127, 128,
- 132, 133, 134, 135, 136, 137, 138, 139, 144, 148,
- 150, 151, 18, 89, 52, 3, 4, 5, 6, 7,
- 10, 55, 60, 61, 62, 63, 64, 83, 90, 100,
- 101, 100, 29, 85, 100, 108, 109, 110, 7, 39,
- 7, 78, 149, 30, 47, 49, 146, 7, 7, 7,
- 67, 67, 7, 20, 98, 89, 40, 116, 89, 31,
- 111, 121, 89, 89, 111, 121, 89, 89, 89, 89,
- 89, 89, 89, 89, 89, 89, 89, 89, 89, 89,
- 89, 116, 100, 88, 100, 88, 100, 100, 8, 9,
- 11, 12, 13, 22, 73, 80, 81, 82, 83, 84,
- 85, 86, 90, 25, 90, 39, 93, 30, 15, 149,
- 42, 149, 149, 50, 147, 39, 90, 90, 44, 100,
- 89, 59, 59, 91, 100, 100, 100, 100, 100, 99,
- 5, 100, 100, 100, 100, 100, 100, 100, 100, 107,
- 99, 85, 7, 106, 108, 105, 149, 100, 7, 119,
- 120, 90, 48, 7, 104, 106, 92, 102, 107, 45,
- 23, 24, 129, 130, 131, 91, 93, 20, 91, 93,
- 31, 93, 111, 33, 80, 93, 7, 140, 141, 7,
- 90, 91, 93, 91, 7, 99, 100, 129, 20, 100,
- 25, 7, 149, 32, 112, 100, 100, 119, 152, 91,
- 93, 51, 91, 95, 92, 22, 21, 41, 70, 113,
- 25, 90, 142, 140, 149, 99, 15, 34, 115, 99,
- 3, 10, 143, 90, 71, 35, 20, 91, 6, 7,
- 145, 72, 7, 25, 91, 93, 36, 37, 114, 7
+ 56, 57, 58, 65, 66, 68, 95, 96, 111, 112,
+ 113, 117, 118, 119, 120, 121, 122, 123, 127, 128,
+ 129, 130, 131, 132, 133, 134, 139, 143, 145, 146,
+ 18, 89, 52, 3, 4, 5, 6, 7, 10, 55,
+ 60, 61, 62, 63, 64, 83, 90, 97, 98, 97,
+ 29, 85, 97, 103, 104, 105, 7, 39, 7, 78,
+ 144, 30, 47, 49, 141, 7, 7, 7, 67, 67,
+ 20, 95, 89, 40, 111, 89, 31, 106, 116, 89,
+ 89, 106, 116, 89, 89, 89, 89, 89, 89, 89,
+ 89, 89, 89, 89, 89, 89, 89, 89, 111, 97,
+ 88, 97, 88, 97, 97, 8, 9, 11, 12, 13,
+ 22, 73, 80, 81, 82, 83, 84, 85, 86, 90,
+ 25, 90, 39, 92, 30, 15, 144, 42, 144, 144,
+ 50, 142, 39, 90, 44, 97, 89, 59, 59, 91,
+ 97, 97, 97, 97, 97, 96, 5, 97, 97, 97,
+ 97, 97, 97, 97, 97, 102, 96, 85, 7, 101,
+ 103, 100, 144, 97, 7, 114, 115, 90, 48, 7,
+ 99, 101, 102, 45, 23, 24, 124, 125, 126, 91,
+ 92, 20, 91, 92, 31, 92, 106, 33, 80, 92,
+ 7, 135, 136, 7, 90, 91, 7, 96, 97, 124,
+ 20, 97, 25, 7, 144, 32, 107, 97, 97, 114,
+ 147, 91, 92, 51, 91, 22, 21, 41, 70, 108,
+ 25, 90, 137, 135, 144, 96, 15, 34, 110, 96,
+ 3, 10, 138, 90, 71, 35, 20, 91, 6, 7,
+ 140, 72, 7, 25, 91, 92, 36, 37, 109, 7
};
/* YYR1[YYN] -- Symbol number of symbol that rule YYN derives. */
static const yytype_uint8 yyr1[] =
{
- 0, 96, 97, 98, 98, 98, 98, 98, 98, 98,
- 98, 98, 98, 98, 98, 98, 98, 98, 98, 98,
- 98, 98, 98, 99, 99, 100, 100, 100, 100, 100,
- 100, 100, 100, 100, 100, 100, 100, 100, 100, 100,
- 100, 100, 100, 100, 100, 100, 100, 100, 100, 100,
- 101, 101, 101, 101, 101, 102, 102, 102, 103, 104,
- 105, 105, 106, 106, 106, 107, 107, 107, 108, 108,
- 109, 109, 109, 110, 110, 110, 111, 111, 112, 112,
- 113, 113, 114, 114, 114, 115, 115, 116, 117, 118,
- 118, 119, 120, 120, 121, 122, 123, 124, 125, 126,
- 127, 128, 129, 130, 130, 131, 131, 131, 132, 133,
- 134, 135, 136, 137, 138, 139, 139, 140, 141, 141,
- 142, 142, 143, 143, 144, 145, 145, 146, 146, 147,
- 147, 148, 149, 149, 150, 151, 152, 152, 152, 153,
- 154, 154, 154, 155, 156, 157, 157, 158, 158, 158,
- 159
+ 0, 93, 94, 95, 95, 95, 95, 95, 95, 95,
+ 95, 95, 95, 95, 95, 95, 95, 95, 95, 95,
+ 95, 95, 96, 96, 97, 97, 97, 97, 97, 97,
+ 97, 97, 97, 97, 97, 97, 97, 97, 97, 97,
+ 97, 97, 97, 97, 97, 97, 97, 97, 97, 98,
+ 98, 98, 98, 98, 99, 100, 100, 101, 101, 101,
+ 102, 102, 102, 103, 103, 104, 104, 104, 105, 105,
+ 105, 106, 106, 107, 107, 108, 108, 109, 109, 109,
+ 110, 110, 111, 112, 113, 113, 114, 115, 115, 116,
+ 117, 118, 119, 120, 121, 122, 123, 124, 125, 125,
+ 126, 126, 126, 127, 128, 129, 130, 131, 132, 133,
+ 134, 134, 135, 136, 136, 137, 137, 138, 138, 139,
+ 140, 140, 141, 141, 142, 142, 143, 144, 144, 145,
+ 146, 147, 147, 147, 148, 149, 149, 149, 150, 151,
+ 152, 152, 153, 153, 153, 154
};
/* YYR2[YYN] -- Number of symbols on the right hand side of rule YYN. */
-static const yytype_uint8 yyr2[] =
+static const yytype_int8 yyr2[] =
{
- 0, 2, 2, 1, 2, 2, 2, 2, 2, 2,
+ 0, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
- 2, 2, 2, 1, 2, 1, 4, 1, 1, 1,
- 1, 1, 3, 3, 3, 3, 2, 3, 3, 3,
- 3, 3, 3, 3, 3, 3, 3, 2, 3, 3,
- 1, 1, 1, 1, 1, 0, 1, 3, 6, 3,
- 1, 3, 0, 1, 3, 0, 1, 3, 1, 4,
- 0, 1, 3, 1, 3, 1, 0, 2, 0, 2,
- 0, 4, 0, 1, 1, 0, 4, 8, 3, 5,
- 2, 3, 1, 3, 4, 4, 2, 2, 3, 2,
- 2, 3, 4, 1, 2, 0, 2, 1, 7, 6,
- 10, 1, 1, 2, 2, 4, 4, 4, 1, 3,
- 0, 3, 0, 2, 6, 1, 3, 0, 1, 0,
- 1, 10, 1, 1, 2, 2, 1, 1, 1, 3,
- 0, 1, 2, 6, 4, 1, 1, 0, 1, 2,
- 10
+ 2, 2, 1, 2, 1, 4, 1, 1, 1, 1,
+ 1, 3, 3, 3, 3, 2, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 2, 3, 3, 1,
+ 1, 1, 1, 1, 3, 1, 3, 0, 1, 3,
+ 0, 1, 3, 1, 4, 0, 1, 3, 1, 3,
+ 1, 0, 2, 0, 2, 0, 4, 0, 1, 1,
+ 0, 4, 8, 3, 5, 2, 3, 1, 3, 4,
+ 4, 2, 2, 3, 2, 2, 3, 4, 1, 2,
+ 0, 2, 1, 7, 6, 10, 1, 1, 2, 2,
+ 4, 4, 4, 1, 3, 0, 3, 0, 2, 6,
+ 1, 3, 0, 1, 0, 1, 10, 1, 1, 2,
+ 2, 1, 1, 1, 3, 0, 1, 2, 6, 4,
+ 1, 1, 0, 1, 2, 10
};
+enum { YYENOMEM = -2 };
+
#define yyerrok (yyerrstatus = 0)
#define yyclearin (yychar = YYEMPTY)
-#define YYEMPTY (-2)
-#define YYEOF 0
#define YYACCEPT goto yyacceptlab
#define YYABORT goto yyabortlab
@@ -960,10 +1108,9 @@ static const yytype_uint8 yyr2[] =
} \
while (0)
-/* Error token number */
-#define YYTERROR 1
-#define YYERRCODE 256
-
+/* Backward compatibility with an undocumented macro.
+ Use YYerror or YYUNDEF. */
+#define YYERRCODE YYUNDEF
/* Enable debugging if requested. */
@@ -981,18 +1128,18 @@ do { \
} while (0)
/* This macro is provided for backward compatibility. */
-#ifndef YY_LOCATION_PRINT
-# define YY_LOCATION_PRINT(File, Loc) ((void) 0)
-#endif
+# ifndef YY_LOCATION_PRINT
+# define YY_LOCATION_PRINT(File, Loc) ((void) 0)
+# endif
-# define YY_SYMBOL_PRINT(Title, Type, Value, Location) \
+# define YY_SYMBOL_PRINT(Title, Kind, Value, Location) \
do { \
if (yydebug) \
{ \
YYFPRINTF (stderr, "%s ", Title); \
yy_symbol_print (stderr, \
- Type, Value); \
+ Kind, Value); \
YYFPRINTF (stderr, "\n"); \
} \
} while (0)
@@ -1003,18 +1150,19 @@ do { \
`-----------------------------------*/
static void
-yy_symbol_value_print (FILE *yyo, int yytype, YYSTYPE const * const yyvaluep)
+yy_symbol_value_print (FILE *yyo,
+ yysymbol_kind_t yykind, YYSTYPE const * const yyvaluep)
{
FILE *yyoutput = yyo;
- YYUSE (yyoutput);
+ YY_USE (yyoutput);
if (!yyvaluep)
return;
# ifdef YYPRINT
- if (yytype < YYNTOKENS)
- YYPRINT (yyo, yytoknum[yytype], *yyvaluep);
+ if (yykind < YYNTOKENS)
+ YYPRINT (yyo, yytoknum[yykind], *yyvaluep);
# endif
YY_IGNORE_MAYBE_UNINITIALIZED_BEGIN
- YYUSE (yytype);
+ YY_USE (yykind);
YY_IGNORE_MAYBE_UNINITIALIZED_END
}
@@ -1024,12 +1172,13 @@ yy_symbol_value_print (FILE *yyo, int yytype, YYSTYPE const * const yyvaluep)
`---------------------------*/
static void
-yy_symbol_print (FILE *yyo, int yytype, YYSTYPE const * const yyvaluep)
+yy_symbol_print (FILE *yyo,
+ yysymbol_kind_t yykind, YYSTYPE const * const yyvaluep)
{
YYFPRINTF (yyo, "%s %s (",
- yytype < YYNTOKENS ? "token" : "nterm", yytname[yytype]);
+ yykind < YYNTOKENS ? "token" : "nterm", yysymbol_name (yykind));
- yy_symbol_value_print (yyo, yytype, yyvaluep);
+ yy_symbol_value_print (yyo, yykind, yyvaluep);
YYFPRINTF (yyo, ")");
}
@@ -1039,7 +1188,7 @@ yy_symbol_print (FILE *yyo, int yytype, YYSTYPE const * const yyvaluep)
`------------------------------------------------------------------*/
static void
-yy_stack_print (yytype_int16 *yybottom, yytype_int16 *yytop)
+yy_stack_print (yy_state_t *yybottom, yy_state_t *yytop)
{
YYFPRINTF (stderr, "Stack now");
for (; yybottom <= yytop; yybottom++)
@@ -1062,21 +1211,21 @@ do { \
`------------------------------------------------*/
static void
-yy_reduce_print (yytype_int16 *yyssp, YYSTYPE *yyvsp, int yyrule)
+yy_reduce_print (yy_state_t *yyssp, YYSTYPE *yyvsp,
+ int yyrule)
{
- unsigned long yylno = yyrline[yyrule];
+ int yylno = yyrline[yyrule];
int yynrhs = yyr2[yyrule];
int yyi;
- YYFPRINTF (stderr, "Reducing stack by rule %d (line %lu):\n",
+ YYFPRINTF (stderr, "Reducing stack by rule %d (line %d):\n",
yyrule - 1, yylno);
/* The symbols being reduced. */
for (yyi = 0; yyi < yynrhs; yyi++)
{
YYFPRINTF (stderr, " $%d = ", yyi + 1);
yy_symbol_print (stderr,
- yystos[yyssp[yyi + 1 - yynrhs]],
- &yyvsp[(yyi + 1) - (yynrhs)]
- );
+ YY_ACCESSING_SYMBOL (+yyssp[yyi + 1 - yynrhs]),
+ &yyvsp[(yyi + 1) - (yynrhs)]);
YYFPRINTF (stderr, "\n");
}
}
@@ -1091,8 +1240,8 @@ do { \
multiple parsers can coexist. */
int yydebug;
#else /* !YYDEBUG */
-# define YYDPRINTF(Args)
-# define YY_SYMBOL_PRINT(Title, Type, Value, Location)
+# define YYDPRINTF(Args) ((void) 0)
+# define YY_SYMBOL_PRINT(Title, Kind, Value, Location)
# define YY_STACK_PRINT(Bottom, Top)
# define YY_REDUCE_PRINT(Rule)
#endif /* !YYDEBUG */
@@ -1115,254 +1264,30 @@ int yydebug;
#endif
-#if YYERROR_VERBOSE
-# ifndef yystrlen
-# if defined __GLIBC__ && defined _STRING_H
-# define yystrlen strlen
-# else
-/* Return the length of YYSTR. */
-static YYSIZE_T
-yystrlen (const char *yystr)
-{
- YYSIZE_T yylen;
- for (yylen = 0; yystr[yylen]; yylen++)
- continue;
- return yylen;
-}
-# endif
-# endif
-
-# ifndef yystpcpy
-# if defined __GLIBC__ && defined _STRING_H && defined _GNU_SOURCE
-# define yystpcpy stpcpy
-# else
-/* Copy YYSRC to YYDEST, returning the address of the terminating '\0' in
- YYDEST. */
-static char *
-yystpcpy (char *yydest, const char *yysrc)
-{
- char *yyd = yydest;
- const char *yys = yysrc;
-
- while ((*yyd++ = *yys++) != '\0')
- continue;
-
- return yyd - 1;
-}
-# endif
-# endif
-
-# ifndef yytnamerr
-/* Copy to YYRES the contents of YYSTR after stripping away unnecessary
- quotes and backslashes, so that it's suitable for yyerror. The
- heuristic is that double-quoting is unnecessary unless the string
- contains an apostrophe, a comma, or backslash (other than
- backslash-backslash). YYSTR is taken from yytname. If YYRES is
- null, do not copy; instead, return the length of what the result
- would have been. */
-static YYSIZE_T
-yytnamerr (char *yyres, const char *yystr)
-{
- if (*yystr == '"')
- {
- YYSIZE_T yyn = 0;
- char const *yyp = yystr;
-
- for (;;)
- switch (*++yyp)
- {
- case '\'':
- case ',':
- goto do_not_strip_quotes;
-
- case '\\':
- if (*++yyp != '\\')
- goto do_not_strip_quotes;
- else
- goto append;
-
- append:
- default:
- if (yyres)
- yyres[yyn] = *yyp;
- yyn++;
- break;
-
- case '"':
- if (yyres)
- yyres[yyn] = '\0';
- return yyn;
- }
- do_not_strip_quotes: ;
- }
- if (! yyres)
- return yystrlen (yystr);
-
- return (YYSIZE_T) (yystpcpy (yyres, yystr) - yyres);
-}
-# endif
-/* Copy into *YYMSG, which is of size *YYMSG_ALLOC, an error message
- about the unexpected token YYTOKEN for the state stack whose top is
- YYSSP.
-
- Return 0 if *YYMSG was successfully written. Return 1 if *YYMSG is
- not large enough to hold the message. In that case, also set
- *YYMSG_ALLOC to the required number of bytes. Return 2 if the
- required number of bytes is too large to store. */
-static int
-yysyntax_error (YYSIZE_T *yymsg_alloc, char **yymsg,
- yytype_int16 *yyssp, int yytoken)
-{
- YYSIZE_T yysize0 = yytnamerr (YY_NULLPTR, yytname[yytoken]);
- YYSIZE_T yysize = yysize0;
- enum { YYERROR_VERBOSE_ARGS_MAXIMUM = 5 };
- /* Internationalized format string. */
- const char *yyformat = YY_NULLPTR;
- /* Arguments of yyformat. */
- char const *yyarg[YYERROR_VERBOSE_ARGS_MAXIMUM];
- /* Number of reported tokens (one for the "unexpected", one per
- "expected"). */
- int yycount = 0;
-
- /* There are many possibilities here to consider:
- - If this state is a consistent state with a default action, then
- the only way this function was invoked is if the default action
- is an error action. In that case, don't check for expected
- tokens because there are none.
- - The only way there can be no lookahead present (in yychar) is if
- this state is a consistent state with a default action. Thus,
- detecting the absence of a lookahead is sufficient to determine
- that there is no unexpected or expected token to report. In that
- case, just report a simple "syntax error".
- - Don't assume there isn't a lookahead just because this state is a
- consistent state with a default action. There might have been a
- previous inconsistent state, consistent state with a non-default
- action, or user semantic action that manipulated yychar.
- - Of course, the expected token list depends on states to have
- correct lookahead information, and it depends on the parser not
- to perform extra reductions after fetching a lookahead from the
- scanner and before detecting a syntax error. Thus, state merging
- (from LALR or IELR) and default reductions corrupt the expected
- token list. However, the list is correct for canonical LR with
- one exception: it will still contain any token that will not be
- accepted due to an error action in a later state.
- */
- if (yytoken != YYEMPTY)
- {
- int yyn = yypact[*yyssp];
- yyarg[yycount++] = yytname[yytoken];
- if (!yypact_value_is_default (yyn))
- {
- /* Start YYX at -YYN if negative to avoid negative indexes in
- YYCHECK. In other words, skip the first -YYN actions for
- this state because they are default actions. */
- int yyxbegin = yyn < 0 ? -yyn : 0;
- /* Stay within bounds of both yycheck and yytname. */
- int yychecklim = YYLAST - yyn + 1;
- int yyxend = yychecklim < YYNTOKENS ? yychecklim : YYNTOKENS;
- int yyx;
-
- for (yyx = yyxbegin; yyx < yyxend; ++yyx)
- if (yycheck[yyx + yyn] == yyx && yyx != YYTERROR
- && !yytable_value_is_error (yytable[yyx + yyn]))
- {
- if (yycount == YYERROR_VERBOSE_ARGS_MAXIMUM)
- {
- yycount = 1;
- yysize = yysize0;
- break;
- }
- yyarg[yycount++] = yytname[yyx];
- {
- YYSIZE_T yysize1 = yysize + yytnamerr (YY_NULLPTR, yytname[yyx]);
- if (yysize <= yysize1 && yysize1 <= YYSTACK_ALLOC_MAXIMUM)
- yysize = yysize1;
- else
- return 2;
- }
- }
- }
- }
-
- switch (yycount)
- {
-# define YYCASE_(N, S) \
- case N: \
- yyformat = S; \
- break
- default: /* Avoid compiler warnings. */
- YYCASE_(0, YY_("syntax error"));
- YYCASE_(1, YY_("syntax error, unexpected %s"));
- YYCASE_(2, YY_("syntax error, unexpected %s, expecting %s"));
- YYCASE_(3, YY_("syntax error, unexpected %s, expecting %s or %s"));
- YYCASE_(4, YY_("syntax error, unexpected %s, expecting %s or %s or %s"));
- YYCASE_(5, YY_("syntax error, unexpected %s, expecting %s or %s or %s or %s"));
-# undef YYCASE_
- }
-
- {
- YYSIZE_T yysize1 = yysize + yystrlen (yyformat);
- if (yysize <= yysize1 && yysize1 <= YYSTACK_ALLOC_MAXIMUM)
- yysize = yysize1;
- else
- return 2;
- }
-
- if (*yymsg_alloc < yysize)
- {
- *yymsg_alloc = 2 * yysize;
- if (! (yysize <= *yymsg_alloc
- && *yymsg_alloc <= YYSTACK_ALLOC_MAXIMUM))
- *yymsg_alloc = YYSTACK_ALLOC_MAXIMUM;
- return 1;
- }
-
- /* Avoid sprintf, as that infringes on the user's name space.
- Don't have undefined behavior even if the translation
- produced a string with the wrong number of "%s"s. */
- {
- char *yyp = *yymsg;
- int yyi = 0;
- while ((*yyp = *yyformat) != '\0')
- if (*yyp == '%' && yyformat[1] == 's' && yyi < yycount)
- {
- yyp += yytnamerr (yyp, yyarg[yyi++]);
- yyformat += 2;
- }
- else
- {
- yyp++;
- yyformat++;
- }
- }
- return 0;
-}
-#endif /* YYERROR_VERBOSE */
/*-----------------------------------------------.
| Release the memory associated to this symbol. |
`-----------------------------------------------*/
static void
-yydestruct (const char *yymsg, int yytype, YYSTYPE *yyvaluep)
+yydestruct (const char *yymsg,
+ yysymbol_kind_t yykind, YYSTYPE *yyvaluep)
{
- YYUSE (yyvaluep);
+ YY_USE (yyvaluep);
if (!yymsg)
yymsg = "Deleting";
- YY_SYMBOL_PRINT (yymsg, yytype, yyvaluep, yylocationp);
+ YY_SYMBOL_PRINT (yymsg, yykind, yyvaluep, yylocationp);
YY_IGNORE_MAYBE_UNINITIALIZED_BEGIN
- YYUSE (yytype);
+ YY_USE (yykind);
YY_IGNORE_MAYBE_UNINITIALIZED_END
}
-
-
-/* The lookahead symbol. */
+/* Lookahead token kind. */
static int yychar;
/* The semantic value of the lookahead symbol. */
@@ -1371,6 +1296,8 @@ YYSTYPE yylval;
static int yynerrs;
+
+
/*----------.
| yyparse. |
`----------*/
@@ -1378,43 +1305,36 @@ static int yynerrs;
int
yyparse (void)
{
- int yystate;
+ yy_state_fast_t yystate = 0;
/* Number of tokens to shift before error messages enabled. */
- int yyerrstatus;
+ int yyerrstatus = 0;
- /* The stacks and their tools:
- 'yyss': related to states.
- 'yyvs': related to semantic values.
-
- Refer to the stacks through separate pointers, to allow yyoverflow
+ /* Refer to the stacks through separate pointers, to allow yyoverflow
to reallocate them elsewhere. */
- /* The state stack. */
- yytype_int16 yyssa[YYINITDEPTH];
- yytype_int16 *yyss;
- yytype_int16 *yyssp;
+ /* Their size. */
+ YYPTRDIFF_T yystacksize = YYINITDEPTH;
- /* The semantic value stack. */
- YYSTYPE yyvsa[YYINITDEPTH];
- YYSTYPE *yyvs;
- YYSTYPE *yyvsp;
+ /* The state stack: array, bottom, top. */
+ yy_state_t yyssa[YYINITDEPTH];
+ yy_state_t *yyss = yyssa;
+ yy_state_t *yyssp = yyss;
- YYSIZE_T yystacksize;
+ /* The semantic value stack: array, bottom, top. */
+ YYSTYPE yyvsa[YYINITDEPTH];
+ YYSTYPE *yyvs = yyvsa;
+ YYSTYPE *yyvsp = yyvs;
int yyn;
+ /* The return value of yyparse. */
int yyresult;
- /* Lookahead token as an internal (translated) token number. */
- int yytoken = 0;
+ /* Lookahead symbol kind. */
+ yysymbol_kind_t yytoken = YYSYMBOL_YYEMPTY;
/* The variables used to return semantic value and location from the
action routines. */
YYSTYPE yyval;
-#if YYERROR_VERBOSE
- /* Buffer for error messages, and its allocated size. */
- char yymsgbuf[128];
- char *yymsg = yymsgbuf;
- YYSIZE_T yymsg_alloc = sizeof yymsgbuf;
-#endif
+
#define YYPOPSTACK(N) (yyvsp -= (N), yyssp -= (N))
@@ -1422,15 +1342,8 @@ yyparse (void)
Keep to zero when no symbol should be popped. */
int yylen = 0;
- yyssp = yyss = yyssa;
- yyvsp = yyvs = yyvsa;
- yystacksize = YYINITDEPTH;
-
YYDPRINTF ((stderr, "Starting parse\n"));
- yystate = 0;
- yyerrstatus = 0;
- yynerrs = 0;
yychar = YYEMPTY; /* Cause a token to be read. */
goto yysetstate;
@@ -1445,12 +1358,15 @@ yynewstate:
/*--------------------------------------------------------------------.
-| yynewstate -- set current state (the top of the stack) to yystate. |
+| yysetstate -- set current state (the top of the stack) to yystate. |
`--------------------------------------------------------------------*/
yysetstate:
YYDPRINTF ((stderr, "Entering state %d\n", yystate));
YY_ASSERT (0 <= yystate && yystate < YYNSTATES);
- *yyssp = (yytype_int16) yystate;
+ YY_IGNORE_USELESS_CAST_BEGIN
+ *yyssp = YY_CAST (yy_state_t, yystate);
+ YY_IGNORE_USELESS_CAST_END
+ YY_STACK_PRINT (yyss, yyssp);
if (yyss + yystacksize - 1 <= yyssp)
#if !defined yyoverflow && !defined YYSTACK_RELOCATE
@@ -1458,23 +1374,23 @@ yysetstate:
#else
{
/* Get the current used size of the three stacks, in elements. */
- YYSIZE_T yysize = (YYSIZE_T) (yyssp - yyss + 1);
+ YYPTRDIFF_T yysize = yyssp - yyss + 1;
# if defined yyoverflow
{
/* Give user a chance to reallocate the stack. Use copies of
these so that the &'s don't force the real ones into
memory. */
+ yy_state_t *yyss1 = yyss;
YYSTYPE *yyvs1 = yyvs;
- yytype_int16 *yyss1 = yyss;
/* Each stack pointer address is followed by the size of the
data in use in that stack, in bytes. This used to be a
conditional around just the two extra args, but that might
be undefined if yyoverflow is a macro. */
yyoverflow (YY_("memory exhausted"),
- &yyss1, yysize * sizeof (*yyssp),
- &yyvs1, yysize * sizeof (*yyvsp),
+ &yyss1, yysize * YYSIZEOF (*yyssp),
+ &yyvs1, yysize * YYSIZEOF (*yyvsp),
&yystacksize);
yyss = yyss1;
yyvs = yyvs1;
@@ -1488,14 +1404,15 @@ yysetstate:
yystacksize = YYMAXDEPTH;
{
- yytype_int16 *yyss1 = yyss;
+ yy_state_t *yyss1 = yyss;
union yyalloc *yyptr =
- (union yyalloc *) YYSTACK_ALLOC (YYSTACK_BYTES (yystacksize));
+ YY_CAST (union yyalloc *,
+ YYSTACK_ALLOC (YY_CAST (YYSIZE_T, YYSTACK_BYTES (yystacksize))));
if (! yyptr)
goto yyexhaustedlab;
YYSTACK_RELOCATE (yyss_alloc, yyss);
YYSTACK_RELOCATE (yyvs_alloc, yyvs);
-# undef YYSTACK_RELOCATE
+# undef YYSTACK_RELOCATE
if (yyss1 != yyssa)
YYSTACK_FREE (yyss1);
}
@@ -1504,8 +1421,10 @@ yysetstate:
yyssp = yyss + yysize - 1;
yyvsp = yyvs + yysize - 1;
- YYDPRINTF ((stderr, "Stack size increased to %lu\n",
- (unsigned long) yystacksize));
+ YY_IGNORE_USELESS_CAST_BEGIN
+ YYDPRINTF ((stderr, "Stack size increased to %ld\n",
+ YY_CAST (long, yystacksize)));
+ YY_IGNORE_USELESS_CAST_END
if (yyss + yystacksize - 1 <= yyssp)
YYABORT;
@@ -1532,18 +1451,29 @@ yybackup:
/* Not known => get a lookahead token if don't already have one. */
- /* YYCHAR is either YYEMPTY or YYEOF or a valid lookahead symbol. */
+ /* YYCHAR is either empty, or end-of-input, or a valid lookahead. */
if (yychar == YYEMPTY)
{
- YYDPRINTF ((stderr, "Reading a token: "));
+ YYDPRINTF ((stderr, "Reading a token\n"));
yychar = yylex ();
}
if (yychar <= YYEOF)
{
- yychar = yytoken = YYEOF;
+ yychar = YYEOF;
+ yytoken = YYSYMBOL_YYEOF;
YYDPRINTF ((stderr, "Now at end of input.\n"));
}
+ else if (yychar == YYerror)
+ {
+ /* The scanner already issued an error message, process directly
+ to error recovery. But do not keep the error token as
+ lookahead, it is too special and may lead us to an endless
+ loop in error recovery. */
+ yychar = YYUNDEF;
+ yytoken = YYSYMBOL_YYerror;
+ goto yyerrlab1;
+ }
else
{
yytoken = YYTRANSLATE (yychar);
@@ -1571,14 +1501,13 @@ yybackup:
/* Shift the lookahead token. */
YY_SYMBOL_PRINT ("Shifting", yytoken, &yylval, &yylloc);
-
- /* Discard the shifted token. */
- yychar = YYEMPTY;
-
yystate = yyn;
YY_IGNORE_MAYBE_UNINITIALIZED_BEGIN
*++yyvsp = yylval;
YY_IGNORE_MAYBE_UNINITIALIZED_END
+
+ /* Discard the shifted token. */
+ yychar = YYEMPTY;
goto yynewstate;
@@ -1613,778 +1542,771 @@ yyreduce:
YY_REDUCE_PRINT (yyn);
switch (yyn)
{
- case 23:
-#line 166 "pars0grm.y"
- { yyval = que_node_list_add_last(NULL, yyvsp[0]); }
-#line 1616 "pars0grm.cc"
+ case 22: /* statement_list: statement */
+#line 165 "pars0grm.y"
+ { yyval = que_node_list_add_last(NULL, yyvsp[0]); }
+#line 1545 "pars0grm.cc"
+ break;
+
+ case 23: /* statement_list: statement_list statement */
+#line 167 "pars0grm.y"
+ { yyval = que_node_list_add_last(yyvsp[-1], yyvsp[0]); }
+#line 1551 "pars0grm.cc"
break;
- case 24:
-#line 168 "pars0grm.y"
- { yyval = que_node_list_add_last(yyvsp[-1], yyvsp[0]); }
-#line 1622 "pars0grm.cc"
+ case 24: /* exp: PARS_ID_TOKEN */
+#line 171 "pars0grm.y"
+ { yyval = yyvsp[0];}
+#line 1557 "pars0grm.cc"
break;
- case 25:
-#line 172 "pars0grm.y"
- { yyval = yyvsp[0];}
-#line 1628 "pars0grm.cc"
+ case 25: /* exp: function_name '(' exp_list ')' */
+#line 173 "pars0grm.y"
+ { yyval = pars_func(yyvsp[-3], yyvsp[-1]); }
+#line 1563 "pars0grm.cc"
break;
- case 26:
+ case 26: /* exp: PARS_INT_LIT */
#line 174 "pars0grm.y"
- { yyval = pars_func(yyvsp[-3], yyvsp[-1]); }
-#line 1634 "pars0grm.cc"
+ { yyval = yyvsp[0];}
+#line 1569 "pars0grm.cc"
break;
- case 27:
+ case 27: /* exp: PARS_FLOAT_LIT */
#line 175 "pars0grm.y"
- { yyval = yyvsp[0];}
-#line 1640 "pars0grm.cc"
+ { yyval = yyvsp[0];}
+#line 1575 "pars0grm.cc"
break;
- case 28:
+ case 28: /* exp: PARS_STR_LIT */
#line 176 "pars0grm.y"
- { yyval = yyvsp[0];}
-#line 1646 "pars0grm.cc"
+ { yyval = yyvsp[0];}
+#line 1581 "pars0grm.cc"
break;
- case 29:
+ case 29: /* exp: PARS_NULL_LIT */
#line 177 "pars0grm.y"
- { yyval = yyvsp[0];}
-#line 1652 "pars0grm.cc"
+ { yyval = yyvsp[0];}
+#line 1587 "pars0grm.cc"
break;
- case 30:
+ case 30: /* exp: PARS_SQL_TOKEN */
#line 178 "pars0grm.y"
- { yyval = yyvsp[0];}
-#line 1658 "pars0grm.cc"
+ { yyval = yyvsp[0];}
+#line 1593 "pars0grm.cc"
break;
- case 31:
+ case 31: /* exp: exp '+' exp */
#line 179 "pars0grm.y"
- { yyval = yyvsp[0];}
-#line 1664 "pars0grm.cc"
+ { yyval = pars_op('+', yyvsp[-2], yyvsp[0]); }
+#line 1599 "pars0grm.cc"
break;
- case 32:
+ case 32: /* exp: exp '-' exp */
#line 180 "pars0grm.y"
- { yyval = pars_op('+', yyvsp[-2], yyvsp[0]); }
-#line 1670 "pars0grm.cc"
+ { yyval = pars_op('-', yyvsp[-2], yyvsp[0]); }
+#line 1605 "pars0grm.cc"
break;
- case 33:
+ case 33: /* exp: exp '*' exp */
#line 181 "pars0grm.y"
- { yyval = pars_op('-', yyvsp[-2], yyvsp[0]); }
-#line 1676 "pars0grm.cc"
+ { yyval = pars_op('*', yyvsp[-2], yyvsp[0]); }
+#line 1611 "pars0grm.cc"
break;
- case 34:
+ case 34: /* exp: exp '/' exp */
#line 182 "pars0grm.y"
- { yyval = pars_op('*', yyvsp[-2], yyvsp[0]); }
-#line 1682 "pars0grm.cc"
+ { yyval = pars_op('/', yyvsp[-2], yyvsp[0]); }
+#line 1617 "pars0grm.cc"
break;
- case 35:
+ case 35: /* exp: '-' exp */
#line 183 "pars0grm.y"
- { yyval = pars_op('/', yyvsp[-2], yyvsp[0]); }
-#line 1688 "pars0grm.cc"
+ { yyval = pars_op('-', yyvsp[0], NULL); }
+#line 1623 "pars0grm.cc"
break;
- case 36:
+ case 36: /* exp: '(' exp ')' */
#line 184 "pars0grm.y"
- { yyval = pars_op('-', yyvsp[0], NULL); }
-#line 1694 "pars0grm.cc"
+ { yyval = yyvsp[-1]; }
+#line 1629 "pars0grm.cc"
break;
- case 37:
+ case 37: /* exp: exp '=' exp */
#line 185 "pars0grm.y"
- { yyval = yyvsp[-1]; }
-#line 1700 "pars0grm.cc"
+ { yyval = pars_op('=', yyvsp[-2], yyvsp[0]); }
+#line 1635 "pars0grm.cc"
break;
- case 38:
-#line 186 "pars0grm.y"
- { yyval = pars_op('=', yyvsp[-2], yyvsp[0]); }
-#line 1706 "pars0grm.cc"
+ case 38: /* exp: exp PARS_LIKE_TOKEN PARS_STR_LIT */
+#line 187 "pars0grm.y"
+ { yyval = pars_op(PARS_LIKE_TOKEN, yyvsp[-2], yyvsp[0]); }
+#line 1641 "pars0grm.cc"
break;
- case 39:
+ case 39: /* exp: exp '<' exp */
#line 188 "pars0grm.y"
- { yyval = pars_op(PARS_LIKE_TOKEN, yyvsp[-2], yyvsp[0]); }
-#line 1712 "pars0grm.cc"
+ { yyval = pars_op('<', yyvsp[-2], yyvsp[0]); }
+#line 1647 "pars0grm.cc"
break;
- case 40:
+ case 40: /* exp: exp '>' exp */
#line 189 "pars0grm.y"
- { yyval = pars_op('<', yyvsp[-2], yyvsp[0]); }
-#line 1718 "pars0grm.cc"
+ { yyval = pars_op('>', yyvsp[-2], yyvsp[0]); }
+#line 1653 "pars0grm.cc"
break;
- case 41:
+ case 41: /* exp: exp PARS_GE_TOKEN exp */
#line 190 "pars0grm.y"
- { yyval = pars_op('>', yyvsp[-2], yyvsp[0]); }
-#line 1724 "pars0grm.cc"
+ { yyval = pars_op(PARS_GE_TOKEN, yyvsp[-2], yyvsp[0]); }
+#line 1659 "pars0grm.cc"
break;
- case 42:
+ case 42: /* exp: exp PARS_LE_TOKEN exp */
#line 191 "pars0grm.y"
- { yyval = pars_op(PARS_GE_TOKEN, yyvsp[-2], yyvsp[0]); }
-#line 1730 "pars0grm.cc"
+ { yyval = pars_op(PARS_LE_TOKEN, yyvsp[-2], yyvsp[0]); }
+#line 1665 "pars0grm.cc"
break;
- case 43:
+ case 43: /* exp: exp PARS_NE_TOKEN exp */
#line 192 "pars0grm.y"
- { yyval = pars_op(PARS_LE_TOKEN, yyvsp[-2], yyvsp[0]); }
-#line 1736 "pars0grm.cc"
+ { yyval = pars_op(PARS_NE_TOKEN, yyvsp[-2], yyvsp[0]); }
+#line 1671 "pars0grm.cc"
break;
- case 44:
+ case 44: /* exp: exp PARS_AND_TOKEN exp */
#line 193 "pars0grm.y"
- { yyval = pars_op(PARS_NE_TOKEN, yyvsp[-2], yyvsp[0]); }
-#line 1742 "pars0grm.cc"
+ { yyval = pars_op(PARS_AND_TOKEN, yyvsp[-2], yyvsp[0]); }
+#line 1677 "pars0grm.cc"
break;
- case 45:
+ case 45: /* exp: exp PARS_OR_TOKEN exp */
#line 194 "pars0grm.y"
- { yyval = pars_op(PARS_AND_TOKEN, yyvsp[-2], yyvsp[0]); }
-#line 1748 "pars0grm.cc"
+ { yyval = pars_op(PARS_OR_TOKEN, yyvsp[-2], yyvsp[0]); }
+#line 1683 "pars0grm.cc"
break;
- case 46:
+ case 46: /* exp: PARS_NOT_TOKEN exp */
#line 195 "pars0grm.y"
- { yyval = pars_op(PARS_OR_TOKEN, yyvsp[-2], yyvsp[0]); }
-#line 1754 "pars0grm.cc"
+ { yyval = pars_op(PARS_NOT_TOKEN, yyvsp[0], NULL); }
+#line 1689 "pars0grm.cc"
break;
- case 47:
-#line 196 "pars0grm.y"
- { yyval = pars_op(PARS_NOT_TOKEN, yyvsp[0], NULL); }
-#line 1760 "pars0grm.cc"
+ case 47: /* exp: PARS_ID_TOKEN '%' PARS_NOTFOUND_TOKEN */
+#line 197 "pars0grm.y"
+ { yyval = pars_op(PARS_NOTFOUND_TOKEN, yyvsp[-2], NULL); }
+#line 1695 "pars0grm.cc"
break;
- case 48:
-#line 198 "pars0grm.y"
- { yyval = pars_op(PARS_NOTFOUND_TOKEN, yyvsp[-2], NULL); }
-#line 1766 "pars0grm.cc"
+ case 48: /* exp: PARS_SQL_TOKEN '%' PARS_NOTFOUND_TOKEN */
+#line 199 "pars0grm.y"
+ { yyval = pars_op(PARS_NOTFOUND_TOKEN, yyvsp[-2], NULL); }
+#line 1701 "pars0grm.cc"
break;
- case 49:
-#line 200 "pars0grm.y"
- { yyval = pars_op(PARS_NOTFOUND_TOKEN, yyvsp[-2], NULL); }
-#line 1772 "pars0grm.cc"
+ case 49: /* function_name: PARS_TO_BINARY_TOKEN */
+#line 203 "pars0grm.y"
+ { yyval = &pars_to_binary_token; }
+#line 1707 "pars0grm.cc"
break;
- case 50:
+ case 50: /* function_name: PARS_SUBSTR_TOKEN */
#line 204 "pars0grm.y"
- { yyval = &pars_to_binary_token; }
-#line 1778 "pars0grm.cc"
+ { yyval = &pars_substr_token; }
+#line 1713 "pars0grm.cc"
break;
- case 51:
+ case 51: /* function_name: PARS_CONCAT_TOKEN */
#line 205 "pars0grm.y"
- { yyval = &pars_substr_token; }
-#line 1784 "pars0grm.cc"
+ { yyval = &pars_concat_token; }
+#line 1719 "pars0grm.cc"
break;
- case 52:
+ case 52: /* function_name: PARS_INSTR_TOKEN */
#line 206 "pars0grm.y"
- { yyval = &pars_concat_token; }
-#line 1790 "pars0grm.cc"
+ { yyval = &pars_instr_token; }
+#line 1725 "pars0grm.cc"
break;
- case 53:
+ case 53: /* function_name: PARS_LENGTH_TOKEN */
#line 207 "pars0grm.y"
- { yyval = &pars_instr_token; }
-#line 1796 "pars0grm.cc"
- break;
-
- case 54:
-#line 208 "pars0grm.y"
- { yyval = &pars_length_token; }
-#line 1802 "pars0grm.cc"
- break;
-
- case 58:
-#line 219 "pars0grm.y"
- { yyval = pars_stored_procedure_call(
- static_cast<sym_node_t*>(yyvsp[-4])); }
-#line 1809 "pars0grm.cc"
+ { yyval = &pars_length_token; }
+#line 1731 "pars0grm.cc"
break;
- case 59:
-#line 224 "pars0grm.y"
- { yyval = yyvsp[-2]; }
-#line 1815 "pars0grm.cc"
+ case 54: /* user_function_call: PARS_ID_TOKEN '(' ')' */
+#line 211 "pars0grm.y"
+ { yyval = yyvsp[-2]; }
+#line 1737 "pars0grm.cc"
break;
- case 60:
-#line 228 "pars0grm.y"
- { yyval = que_node_list_add_last(NULL, yyvsp[0]); }
-#line 1821 "pars0grm.cc"
+ case 55: /* table_list: table_name */
+#line 215 "pars0grm.y"
+ { yyval = que_node_list_add_last(NULL, yyvsp[0]); }
+#line 1743 "pars0grm.cc"
break;
- case 61:
-#line 230 "pars0grm.y"
- { yyval = que_node_list_add_last(yyvsp[-2], yyvsp[0]); }
-#line 1827 "pars0grm.cc"
+ case 56: /* table_list: table_list ',' table_name */
+#line 217 "pars0grm.y"
+ { yyval = que_node_list_add_last(yyvsp[-2], yyvsp[0]); }
+#line 1749 "pars0grm.cc"
break;
- case 62:
-#line 234 "pars0grm.y"
- { yyval = NULL; }
-#line 1833 "pars0grm.cc"
+ case 57: /* variable_list: %empty */
+#line 221 "pars0grm.y"
+ { yyval = NULL; }
+#line 1755 "pars0grm.cc"
break;
- case 63:
-#line 235 "pars0grm.y"
- { yyval = que_node_list_add_last(NULL, yyvsp[0]); }
-#line 1839 "pars0grm.cc"
+ case 58: /* variable_list: PARS_ID_TOKEN */
+#line 222 "pars0grm.y"
+ { yyval = que_node_list_add_last(NULL, yyvsp[0]); }
+#line 1761 "pars0grm.cc"
break;
- case 64:
-#line 237 "pars0grm.y"
- { yyval = que_node_list_add_last(yyvsp[-2], yyvsp[0]); }
-#line 1845 "pars0grm.cc"
+ case 59: /* variable_list: variable_list ',' PARS_ID_TOKEN */
+#line 224 "pars0grm.y"
+ { yyval = que_node_list_add_last(yyvsp[-2], yyvsp[0]); }
+#line 1767 "pars0grm.cc"
break;
- case 65:
-#line 241 "pars0grm.y"
- { yyval = NULL; }
-#line 1851 "pars0grm.cc"
+ case 60: /* exp_list: %empty */
+#line 228 "pars0grm.y"
+ { yyval = NULL; }
+#line 1773 "pars0grm.cc"
break;
- case 66:
-#line 242 "pars0grm.y"
- { yyval = que_node_list_add_last(NULL, yyvsp[0]);}
-#line 1857 "pars0grm.cc"
+ case 61: /* exp_list: exp */
+#line 229 "pars0grm.y"
+ { yyval = que_node_list_add_last(NULL, yyvsp[0]);}
+#line 1779 "pars0grm.cc"
break;
- case 67:
-#line 243 "pars0grm.y"
- { yyval = que_node_list_add_last(yyvsp[-2], yyvsp[0]); }
-#line 1863 "pars0grm.cc"
+ case 62: /* exp_list: exp_list ',' exp */
+#line 230 "pars0grm.y"
+ { yyval = que_node_list_add_last(yyvsp[-2], yyvsp[0]); }
+#line 1785 "pars0grm.cc"
break;
- case 68:
-#line 247 "pars0grm.y"
- { yyval = yyvsp[0]; }
-#line 1869 "pars0grm.cc"
+ case 63: /* select_item: exp */
+#line 234 "pars0grm.y"
+ { yyval = yyvsp[0]; }
+#line 1791 "pars0grm.cc"
break;
- case 69:
-#line 249 "pars0grm.y"
- { yyval = pars_func(&pars_count_token,
+ case 64: /* select_item: PARS_COUNT_TOKEN '(' '*' ')' */
+#line 236 "pars0grm.y"
+ { yyval = pars_func(&pars_count_token,
que_node_list_add_last(NULL,
sym_tab_add_int_lit(
pars_sym_tab_global, 1))); }
-#line 1878 "pars0grm.cc"
+#line 1800 "pars0grm.cc"
break;
- case 70:
-#line 256 "pars0grm.y"
- { yyval = NULL; }
-#line 1884 "pars0grm.cc"
+ case 65: /* select_item_list: %empty */
+#line 243 "pars0grm.y"
+ { yyval = NULL; }
+#line 1806 "pars0grm.cc"
break;
- case 71:
-#line 257 "pars0grm.y"
- { yyval = que_node_list_add_last(NULL, yyvsp[0]); }
-#line 1890 "pars0grm.cc"
+ case 66: /* select_item_list: select_item */
+#line 244 "pars0grm.y"
+ { yyval = que_node_list_add_last(NULL, yyvsp[0]); }
+#line 1812 "pars0grm.cc"
break;
- case 72:
-#line 259 "pars0grm.y"
- { yyval = que_node_list_add_last(yyvsp[-2], yyvsp[0]); }
-#line 1896 "pars0grm.cc"
+ case 67: /* select_item_list: select_item_list ',' select_item */
+#line 246 "pars0grm.y"
+ { yyval = que_node_list_add_last(yyvsp[-2], yyvsp[0]); }
+#line 1818 "pars0grm.cc"
break;
- case 73:
-#line 263 "pars0grm.y"
- { yyval = pars_select_list(&pars_star_denoter,
+ case 68: /* select_list: '*' */
+#line 250 "pars0grm.y"
+ { yyval = pars_select_list(&pars_star_denoter,
NULL); }
-#line 1903 "pars0grm.cc"
+#line 1825 "pars0grm.cc"
break;
- case 74:
-#line 266 "pars0grm.y"
- { yyval = pars_select_list(
+ case 69: /* select_list: select_item_list PARS_INTO_TOKEN variable_list */
+#line 253 "pars0grm.y"
+ { yyval = pars_select_list(
yyvsp[-2], static_cast<sym_node_t*>(yyvsp[0])); }
-#line 1910 "pars0grm.cc"
+#line 1832 "pars0grm.cc"
break;
- case 75:
-#line 268 "pars0grm.y"
- { yyval = pars_select_list(yyvsp[0], NULL); }
-#line 1916 "pars0grm.cc"
+ case 70: /* select_list: select_item_list */
+#line 255 "pars0grm.y"
+ { yyval = pars_select_list(yyvsp[0], NULL); }
+#line 1838 "pars0grm.cc"
break;
- case 76:
-#line 272 "pars0grm.y"
- { yyval = NULL; }
-#line 1922 "pars0grm.cc"
+ case 71: /* search_condition: %empty */
+#line 259 "pars0grm.y"
+ { yyval = NULL; }
+#line 1844 "pars0grm.cc"
break;
- case 77:
-#line 273 "pars0grm.y"
- { yyval = yyvsp[0]; }
-#line 1928 "pars0grm.cc"
+ case 72: /* search_condition: PARS_WHERE_TOKEN exp */
+#line 260 "pars0grm.y"
+ { yyval = yyvsp[0]; }
+#line 1850 "pars0grm.cc"
break;
- case 78:
-#line 277 "pars0grm.y"
- { yyval = NULL; }
-#line 1934 "pars0grm.cc"
+ case 73: /* for_update_clause: %empty */
+#line 264 "pars0grm.y"
+ { yyval = NULL; }
+#line 1856 "pars0grm.cc"
break;
- case 79:
-#line 279 "pars0grm.y"
- { yyval = &pars_update_token; }
-#line 1940 "pars0grm.cc"
+ case 74: /* for_update_clause: PARS_FOR_TOKEN PARS_UPDATE_TOKEN */
+#line 266 "pars0grm.y"
+ { yyval = &pars_update_token; }
+#line 1862 "pars0grm.cc"
break;
- case 80:
-#line 283 "pars0grm.y"
- { yyval = NULL; }
-#line 1946 "pars0grm.cc"
+ case 75: /* lock_shared_clause: %empty */
+#line 270 "pars0grm.y"
+ { yyval = NULL; }
+#line 1868 "pars0grm.cc"
break;
- case 81:
-#line 285 "pars0grm.y"
- { yyval = &pars_share_token; }
-#line 1952 "pars0grm.cc"
+ case 76: /* lock_shared_clause: PARS_LOCK_TOKEN PARS_IN_TOKEN PARS_SHARE_TOKEN PARS_MODE_TOKEN */
+#line 272 "pars0grm.y"
+ { yyval = &pars_share_token; }
+#line 1874 "pars0grm.cc"
break;
- case 82:
-#line 289 "pars0grm.y"
- { yyval = &pars_asc_token; }
-#line 1958 "pars0grm.cc"
+ case 77: /* order_direction: %empty */
+#line 276 "pars0grm.y"
+ { yyval = &pars_asc_token; }
+#line 1880 "pars0grm.cc"
break;
- case 83:
-#line 290 "pars0grm.y"
- { yyval = &pars_asc_token; }
-#line 1964 "pars0grm.cc"
+ case 78: /* order_direction: PARS_ASC_TOKEN */
+#line 277 "pars0grm.y"
+ { yyval = &pars_asc_token; }
+#line 1886 "pars0grm.cc"
break;
- case 84:
-#line 291 "pars0grm.y"
- { yyval = &pars_desc_token; }
-#line 1970 "pars0grm.cc"
+ case 79: /* order_direction: PARS_DESC_TOKEN */
+#line 278 "pars0grm.y"
+ { yyval = &pars_desc_token; }
+#line 1892 "pars0grm.cc"
break;
- case 85:
-#line 295 "pars0grm.y"
- { yyval = NULL; }
-#line 1976 "pars0grm.cc"
+ case 80: /* order_by_clause: %empty */
+#line 282 "pars0grm.y"
+ { yyval = NULL; }
+#line 1898 "pars0grm.cc"
break;
- case 86:
-#line 297 "pars0grm.y"
- { yyval = pars_order_by(
+ case 81: /* order_by_clause: PARS_ORDER_TOKEN PARS_BY_TOKEN PARS_ID_TOKEN order_direction */
+#line 284 "pars0grm.y"
+ { yyval = pars_order_by(
static_cast<sym_node_t*>(yyvsp[-1]),
static_cast<pars_res_word_t*>(yyvsp[0])); }
-#line 1984 "pars0grm.cc"
+#line 1906 "pars0grm.cc"
break;
- case 87:
-#line 308 "pars0grm.y"
- { yyval = pars_select_statement(
+ case 82: /* select_statement: PARS_SELECT_TOKEN select_list PARS_FROM_TOKEN table_list search_condition for_update_clause lock_shared_clause order_by_clause */
+#line 295 "pars0grm.y"
+ { yyval = pars_select_statement(
static_cast<sel_node_t*>(yyvsp[-6]),
static_cast<sym_node_t*>(yyvsp[-4]),
static_cast<que_node_t*>(yyvsp[-3]),
static_cast<pars_res_word_t*>(yyvsp[-2]),
static_cast<pars_res_word_t*>(yyvsp[-1]),
static_cast<order_node_t*>(yyvsp[0])); }
-#line 1996 "pars0grm.cc"
+#line 1918 "pars0grm.cc"
break;
- case 88:
-#line 319 "pars0grm.y"
- { yyval = yyvsp[0]; }
-#line 2002 "pars0grm.cc"
+ case 83: /* insert_statement_start: PARS_INSERT_TOKEN PARS_INTO_TOKEN table_name */
+#line 306 "pars0grm.y"
+ { yyval = yyvsp[0]; }
+#line 1924 "pars0grm.cc"
break;
- case 89:
-#line 324 "pars0grm.y"
- { yyval = pars_insert_statement(
+ case 84: /* insert_statement: insert_statement_start PARS_VALUES_TOKEN '(' exp_list ')' */
+#line 311 "pars0grm.y"
+ { yyval = pars_insert_statement(
static_cast<sym_node_t*>(yyvsp[-4]), yyvsp[-1], NULL); }
-#line 2009 "pars0grm.cc"
+#line 1931 "pars0grm.cc"
break;
- case 90:
-#line 327 "pars0grm.y"
- { yyval = pars_insert_statement(
+ case 85: /* insert_statement: insert_statement_start select_statement */
+#line 314 "pars0grm.y"
+ { yyval = pars_insert_statement(
static_cast<sym_node_t*>(yyvsp[-1]),
NULL,
static_cast<sel_node_t*>(yyvsp[0])); }
-#line 2018 "pars0grm.cc"
+#line 1940 "pars0grm.cc"
break;
- case 91:
-#line 334 "pars0grm.y"
- { yyval = pars_column_assignment(
+ case 86: /* column_assignment: PARS_ID_TOKEN '=' exp */
+#line 321 "pars0grm.y"
+ { yyval = pars_column_assignment(
static_cast<sym_node_t*>(yyvsp[-2]),
static_cast<que_node_t*>(yyvsp[0])); }
-#line 2026 "pars0grm.cc"
+#line 1948 "pars0grm.cc"
break;
- case 92:
-#line 340 "pars0grm.y"
- { yyval = que_node_list_add_last(NULL, yyvsp[0]); }
-#line 2032 "pars0grm.cc"
+ case 87: /* column_assignment_list: column_assignment */
+#line 327 "pars0grm.y"
+ { yyval = que_node_list_add_last(NULL, yyvsp[0]); }
+#line 1954 "pars0grm.cc"
break;
- case 93:
-#line 342 "pars0grm.y"
- { yyval = que_node_list_add_last(yyvsp[-2], yyvsp[0]); }
-#line 2038 "pars0grm.cc"
+ case 88: /* column_assignment_list: column_assignment_list ',' column_assignment */
+#line 329 "pars0grm.y"
+ { yyval = que_node_list_add_last(yyvsp[-2], yyvsp[0]); }
+#line 1960 "pars0grm.cc"
break;
- case 94:
-#line 348 "pars0grm.y"
- { yyval = yyvsp[0]; }
-#line 2044 "pars0grm.cc"
+ case 89: /* cursor_positioned: PARS_WHERE_TOKEN PARS_CURRENT_TOKEN PARS_OF_TOKEN PARS_ID_TOKEN */
+#line 335 "pars0grm.y"
+ { yyval = yyvsp[0]; }
+#line 1966 "pars0grm.cc"
break;
- case 95:
-#line 354 "pars0grm.y"
- { yyval = pars_update_statement_start(
+ case 90: /* update_statement_start: PARS_UPDATE_TOKEN table_name PARS_SET_TOKEN column_assignment_list */
+#line 341 "pars0grm.y"
+ { yyval = pars_update_statement_start(
FALSE,
static_cast<sym_node_t*>(yyvsp[-2]),
static_cast<col_assign_node_t*>(yyvsp[0])); }
-#line 2053 "pars0grm.cc"
+#line 1975 "pars0grm.cc"
break;
- case 96:
-#line 362 "pars0grm.y"
- { yyval = pars_update_statement(
+ case 91: /* update_statement_searched: update_statement_start search_condition */
+#line 349 "pars0grm.y"
+ { yyval = pars_update_statement(
static_cast<upd_node_t*>(yyvsp[-1]),
NULL,
static_cast<que_node_t*>(yyvsp[0])); }
-#line 2062 "pars0grm.cc"
+#line 1984 "pars0grm.cc"
break;
- case 97:
-#line 370 "pars0grm.y"
- { yyval = pars_update_statement(
+ case 92: /* update_statement_positioned: update_statement_start cursor_positioned */
+#line 357 "pars0grm.y"
+ { yyval = pars_update_statement(
static_cast<upd_node_t*>(yyvsp[-1]),
static_cast<sym_node_t*>(yyvsp[0]),
NULL); }
-#line 2071 "pars0grm.cc"
+#line 1993 "pars0grm.cc"
break;
- case 98:
-#line 378 "pars0grm.y"
- { yyval = pars_update_statement_start(
+ case 93: /* delete_statement_start: PARS_DELETE_TOKEN PARS_FROM_TOKEN table_name */
+#line 365 "pars0grm.y"
+ { yyval = pars_update_statement_start(
TRUE,
static_cast<sym_node_t*>(yyvsp[0]), NULL); }
-#line 2079 "pars0grm.cc"
+#line 2001 "pars0grm.cc"
break;
- case 99:
-#line 385 "pars0grm.y"
- { yyval = pars_update_statement(
+ case 94: /* delete_statement_searched: delete_statement_start search_condition */
+#line 372 "pars0grm.y"
+ { yyval = pars_update_statement(
static_cast<upd_node_t*>(yyvsp[-1]),
NULL,
static_cast<que_node_t*>(yyvsp[0])); }
-#line 2088 "pars0grm.cc"
+#line 2010 "pars0grm.cc"
break;
- case 100:
-#line 393 "pars0grm.y"
- { yyval = pars_update_statement(
+ case 95: /* delete_statement_positioned: delete_statement_start cursor_positioned */
+#line 380 "pars0grm.y"
+ { yyval = pars_update_statement(
static_cast<upd_node_t*>(yyvsp[-1]),
static_cast<sym_node_t*>(yyvsp[0]),
NULL); }
-#line 2097 "pars0grm.cc"
+#line 2019 "pars0grm.cc"
break;
- case 101:
-#line 401 "pars0grm.y"
- { yyval = pars_assignment_statement(
+ case 96: /* assignment_statement: PARS_ID_TOKEN PARS_ASSIGN_TOKEN exp */
+#line 388 "pars0grm.y"
+ { yyval = pars_assignment_statement(
static_cast<sym_node_t*>(yyvsp[-2]),
static_cast<que_node_t*>(yyvsp[0])); }
-#line 2105 "pars0grm.cc"
+#line 2027 "pars0grm.cc"
break;
- case 102:
-#line 409 "pars0grm.y"
- { yyval = pars_elsif_element(yyvsp[-2], yyvsp[0]); }
-#line 2111 "pars0grm.cc"
+ case 97: /* elsif_element: PARS_ELSIF_TOKEN exp PARS_THEN_TOKEN statement_list */
+#line 396 "pars0grm.y"
+ { yyval = pars_elsif_element(yyvsp[-2], yyvsp[0]); }
+#line 2033 "pars0grm.cc"
break;
- case 103:
-#line 413 "pars0grm.y"
- { yyval = que_node_list_add_last(NULL, yyvsp[0]); }
-#line 2117 "pars0grm.cc"
+ case 98: /* elsif_list: elsif_element */
+#line 400 "pars0grm.y"
+ { yyval = que_node_list_add_last(NULL, yyvsp[0]); }
+#line 2039 "pars0grm.cc"
break;
- case 104:
-#line 415 "pars0grm.y"
- { yyval = que_node_list_add_last(yyvsp[-1], yyvsp[0]); }
-#line 2123 "pars0grm.cc"
+ case 99: /* elsif_list: elsif_list elsif_element */
+#line 402 "pars0grm.y"
+ { yyval = que_node_list_add_last(yyvsp[-1], yyvsp[0]); }
+#line 2045 "pars0grm.cc"
break;
- case 105:
-#line 419 "pars0grm.y"
- { yyval = NULL; }
-#line 2129 "pars0grm.cc"
+ case 100: /* else_part: %empty */
+#line 406 "pars0grm.y"
+ { yyval = NULL; }
+#line 2051 "pars0grm.cc"
break;
- case 106:
-#line 421 "pars0grm.y"
- { yyval = yyvsp[0]; }
-#line 2135 "pars0grm.cc"
+ case 101: /* else_part: PARS_ELSE_TOKEN statement_list */
+#line 408 "pars0grm.y"
+ { yyval = yyvsp[0]; }
+#line 2057 "pars0grm.cc"
break;
- case 107:
-#line 422 "pars0grm.y"
- { yyval = yyvsp[0]; }
-#line 2141 "pars0grm.cc"
+ case 102: /* else_part: elsif_list */
+#line 409 "pars0grm.y"
+ { yyval = yyvsp[0]; }
+#line 2063 "pars0grm.cc"
break;
- case 108:
-#line 429 "pars0grm.y"
- { yyval = pars_if_statement(yyvsp[-5], yyvsp[-3], yyvsp[-2]); }
-#line 2147 "pars0grm.cc"
+ case 103: /* if_statement: PARS_IF_TOKEN exp PARS_THEN_TOKEN statement_list else_part PARS_END_TOKEN PARS_IF_TOKEN */
+#line 416 "pars0grm.y"
+ { yyval = pars_if_statement(yyvsp[-5], yyvsp[-3], yyvsp[-2]); }
+#line 2069 "pars0grm.cc"
break;
- case 109:
-#line 435 "pars0grm.y"
- { yyval = pars_while_statement(yyvsp[-4], yyvsp[-2]); }
-#line 2153 "pars0grm.cc"
+ case 104: /* while_statement: PARS_WHILE_TOKEN exp PARS_LOOP_TOKEN statement_list PARS_END_TOKEN PARS_LOOP_TOKEN */
+#line 422 "pars0grm.y"
+ { yyval = pars_while_statement(yyvsp[-4], yyvsp[-2]); }
+#line 2075 "pars0grm.cc"
break;
- case 110:
-#line 443 "pars0grm.y"
- { yyval = pars_for_statement(
+ case 105: /* for_statement: PARS_FOR_TOKEN PARS_ID_TOKEN PARS_IN_TOKEN exp PARS_DDOT_TOKEN exp PARS_LOOP_TOKEN statement_list PARS_END_TOKEN PARS_LOOP_TOKEN */
+#line 430 "pars0grm.y"
+ { yyval = pars_for_statement(
static_cast<sym_node_t*>(yyvsp[-8]),
yyvsp[-6], yyvsp[-4], yyvsp[-2]); }
-#line 2161 "pars0grm.cc"
+#line 2083 "pars0grm.cc"
break;
- case 111:
-#line 449 "pars0grm.y"
- { yyval = pars_exit_statement(); }
-#line 2167 "pars0grm.cc"
+ case 106: /* exit_statement: PARS_EXIT_TOKEN */
+#line 436 "pars0grm.y"
+ { yyval = pars_exit_statement(); }
+#line 2089 "pars0grm.cc"
break;
- case 112:
-#line 453 "pars0grm.y"
- { yyval = pars_return_statement(); }
-#line 2173 "pars0grm.cc"
+ case 107: /* return_statement: PARS_RETURN_TOKEN */
+#line 440 "pars0grm.y"
+ { yyval = pars_return_statement(); }
+#line 2095 "pars0grm.cc"
break;
- case 113:
-#line 458 "pars0grm.y"
- { yyval = pars_open_statement(
+ case 108: /* open_cursor_statement: PARS_OPEN_TOKEN PARS_ID_TOKEN */
+#line 445 "pars0grm.y"
+ { yyval = pars_open_statement(
ROW_SEL_OPEN_CURSOR,
static_cast<sym_node_t*>(yyvsp[0])); }
-#line 2181 "pars0grm.cc"
+#line 2103 "pars0grm.cc"
break;
- case 114:
-#line 465 "pars0grm.y"
- { yyval = pars_open_statement(
+ case 109: /* close_cursor_statement: PARS_CLOSE_TOKEN PARS_ID_TOKEN */
+#line 452 "pars0grm.y"
+ { yyval = pars_open_statement(
ROW_SEL_CLOSE_CURSOR,
static_cast<sym_node_t*>(yyvsp[0])); }
-#line 2189 "pars0grm.cc"
+#line 2111 "pars0grm.cc"
break;
- case 115:
-#line 472 "pars0grm.y"
- { yyval = pars_fetch_statement(
+ case 110: /* fetch_statement: PARS_FETCH_TOKEN PARS_ID_TOKEN PARS_INTO_TOKEN variable_list */
+#line 459 "pars0grm.y"
+ { yyval = pars_fetch_statement(
static_cast<sym_node_t*>(yyvsp[-2]),
static_cast<sym_node_t*>(yyvsp[0]), NULL); }
-#line 2197 "pars0grm.cc"
+#line 2119 "pars0grm.cc"
break;
- case 116:
-#line 476 "pars0grm.y"
- { yyval = pars_fetch_statement(
+ case 111: /* fetch_statement: PARS_FETCH_TOKEN PARS_ID_TOKEN PARS_INTO_TOKEN user_function_call */
+#line 463 "pars0grm.y"
+ { yyval = pars_fetch_statement(
static_cast<sym_node_t*>(yyvsp[-2]),
NULL,
static_cast<sym_node_t*>(yyvsp[0])); }
-#line 2206 "pars0grm.cc"
+#line 2128 "pars0grm.cc"
break;
- case 117:
-#line 484 "pars0grm.y"
- { yyval = pars_column_def(
+ case 112: /* column_def: PARS_ID_TOKEN type_name opt_column_len opt_not_null */
+#line 471 "pars0grm.y"
+ { yyval = pars_column_def(
static_cast<sym_node_t*>(yyvsp[-3]),
static_cast<pars_res_word_t*>(yyvsp[-2]),
static_cast<sym_node_t*>(yyvsp[-1]),
yyvsp[0]); }
-#line 2216 "pars0grm.cc"
+#line 2138 "pars0grm.cc"
break;
- case 118:
-#line 492 "pars0grm.y"
- { yyval = que_node_list_add_last(NULL, yyvsp[0]); }
-#line 2222 "pars0grm.cc"
+ case 113: /* column_def_list: column_def */
+#line 479 "pars0grm.y"
+ { yyval = que_node_list_add_last(NULL, yyvsp[0]); }
+#line 2144 "pars0grm.cc"
break;
- case 119:
-#line 494 "pars0grm.y"
- { yyval = que_node_list_add_last(yyvsp[-2], yyvsp[0]); }
-#line 2228 "pars0grm.cc"
+ case 114: /* column_def_list: column_def_list ',' column_def */
+#line 481 "pars0grm.y"
+ { yyval = que_node_list_add_last(yyvsp[-2], yyvsp[0]); }
+#line 2150 "pars0grm.cc"
break;
- case 120:
-#line 498 "pars0grm.y"
- { yyval = NULL; }
-#line 2234 "pars0grm.cc"
+ case 115: /* opt_column_len: %empty */
+#line 485 "pars0grm.y"
+ { yyval = NULL; }
+#line 2156 "pars0grm.cc"
break;
- case 121:
-#line 500 "pars0grm.y"
- { yyval = yyvsp[-1]; }
-#line 2240 "pars0grm.cc"
+ case 116: /* opt_column_len: '(' PARS_INT_LIT ')' */
+#line 487 "pars0grm.y"
+ { yyval = yyvsp[-1]; }
+#line 2162 "pars0grm.cc"
break;
- case 122:
-#line 504 "pars0grm.y"
- { yyval = NULL; }
-#line 2246 "pars0grm.cc"
+ case 117: /* opt_not_null: %empty */
+#line 491 "pars0grm.y"
+ { yyval = NULL; }
+#line 2168 "pars0grm.cc"
break;
- case 123:
-#line 506 "pars0grm.y"
- { yyval = &pars_int_token;
+ case 118: /* opt_not_null: PARS_NOT_TOKEN PARS_NULL_LIT */
+#line 493 "pars0grm.y"
+ { yyval = &pars_int_token;
/* pass any non-NULL pointer */ }
-#line 2253 "pars0grm.cc"
+#line 2175 "pars0grm.cc"
break;
- case 124:
-#line 513 "pars0grm.y"
- { yyval = pars_create_table(
+ case 119: /* create_table: PARS_CREATE_TOKEN PARS_TABLE_TOKEN table_name '(' column_def_list ')' */
+#line 500 "pars0grm.y"
+ { yyval = pars_create_table(
static_cast<sym_node_t*>(yyvsp[-3]),
static_cast<sym_node_t*>(yyvsp[-1])); }
-#line 2261 "pars0grm.cc"
+#line 2183 "pars0grm.cc"
break;
- case 125:
-#line 519 "pars0grm.y"
- { yyval = que_node_list_add_last(NULL, yyvsp[0]); }
-#line 2267 "pars0grm.cc"
+ case 120: /* column_list: PARS_ID_TOKEN */
+#line 506 "pars0grm.y"
+ { yyval = que_node_list_add_last(NULL, yyvsp[0]); }
+#line 2189 "pars0grm.cc"
break;
- case 126:
-#line 521 "pars0grm.y"
- { yyval = que_node_list_add_last(yyvsp[-2], yyvsp[0]); }
-#line 2273 "pars0grm.cc"
+ case 121: /* column_list: column_list ',' PARS_ID_TOKEN */
+#line 508 "pars0grm.y"
+ { yyval = que_node_list_add_last(yyvsp[-2], yyvsp[0]); }
+#line 2195 "pars0grm.cc"
break;
- case 127:
-#line 525 "pars0grm.y"
- { yyval = NULL; }
-#line 2279 "pars0grm.cc"
+ case 122: /* unique_def: %empty */
+#line 512 "pars0grm.y"
+ { yyval = NULL; }
+#line 2201 "pars0grm.cc"
break;
- case 128:
-#line 526 "pars0grm.y"
- { yyval = &pars_unique_token; }
-#line 2285 "pars0grm.cc"
+ case 123: /* unique_def: PARS_UNIQUE_TOKEN */
+#line 513 "pars0grm.y"
+ { yyval = &pars_unique_token; }
+#line 2207 "pars0grm.cc"
break;
- case 129:
-#line 530 "pars0grm.y"
- { yyval = NULL; }
-#line 2291 "pars0grm.cc"
+ case 124: /* clustered_def: %empty */
+#line 517 "pars0grm.y"
+ { yyval = NULL; }
+#line 2213 "pars0grm.cc"
break;
- case 130:
-#line 531 "pars0grm.y"
- { yyval = &pars_clustered_token; }
-#line 2297 "pars0grm.cc"
+ case 125: /* clustered_def: PARS_CLUSTERED_TOKEN */
+#line 518 "pars0grm.y"
+ { yyval = &pars_clustered_token; }
+#line 2219 "pars0grm.cc"
break;
- case 131:
-#line 540 "pars0grm.y"
- { yyval = pars_create_index(
+ case 126: /* create_index: PARS_CREATE_TOKEN unique_def clustered_def PARS_INDEX_TOKEN PARS_ID_TOKEN PARS_ON_TOKEN table_name '(' column_list ')' */
+#line 527 "pars0grm.y"
+ { yyval = pars_create_index(
static_cast<pars_res_word_t*>(yyvsp[-8]),
static_cast<pars_res_word_t*>(yyvsp[-7]),
static_cast<sym_node_t*>(yyvsp[-5]),
static_cast<sym_node_t*>(yyvsp[-3]),
static_cast<sym_node_t*>(yyvsp[-1])); }
-#line 2308 "pars0grm.cc"
+#line 2230 "pars0grm.cc"
break;
- case 132:
-#line 549 "pars0grm.y"
- { yyval = yyvsp[0]; }
-#line 2314 "pars0grm.cc"
+ case 127: /* table_name: PARS_ID_TOKEN */
+#line 536 "pars0grm.y"
+ { yyval = yyvsp[0]; }
+#line 2236 "pars0grm.cc"
break;
- case 133:
-#line 550 "pars0grm.y"
- { yyval = yyvsp[0]; }
-#line 2320 "pars0grm.cc"
+ case 128: /* table_name: PARS_TABLE_NAME_TOKEN */
+#line 537 "pars0grm.y"
+ { yyval = yyvsp[0]; }
+#line 2242 "pars0grm.cc"
break;
- case 134:
-#line 555 "pars0grm.y"
- { yyval = pars_commit_statement(); }
-#line 2326 "pars0grm.cc"
+ case 129: /* commit_statement: PARS_COMMIT_TOKEN PARS_WORK_TOKEN */
+#line 542 "pars0grm.y"
+ { yyval = pars_commit_statement(); }
+#line 2248 "pars0grm.cc"
break;
- case 135:
-#line 560 "pars0grm.y"
- { yyval = pars_rollback_statement(); }
-#line 2332 "pars0grm.cc"
+ case 130: /* rollback_statement: PARS_ROLLBACK_TOKEN PARS_WORK_TOKEN */
+#line 547 "pars0grm.y"
+ { yyval = pars_rollback_statement(); }
+#line 2254 "pars0grm.cc"
break;
- case 136:
-#line 564 "pars0grm.y"
- { yyval = &pars_int_token; }
-#line 2338 "pars0grm.cc"
+ case 131: /* type_name: PARS_INT_TOKEN */
+#line 551 "pars0grm.y"
+ { yyval = &pars_int_token; }
+#line 2260 "pars0grm.cc"
break;
- case 137:
-#line 565 "pars0grm.y"
- { yyval = &pars_bigint_token; }
-#line 2344 "pars0grm.cc"
+ case 132: /* type_name: PARS_BIGINT_TOKEN */
+#line 552 "pars0grm.y"
+ { yyval = &pars_bigint_token; }
+#line 2266 "pars0grm.cc"
break;
- case 138:
-#line 566 "pars0grm.y"
- { yyval = &pars_char_token; }
-#line 2350 "pars0grm.cc"
+ case 133: /* type_name: PARS_CHAR_TOKEN */
+#line 553 "pars0grm.y"
+ { yyval = &pars_char_token; }
+#line 2272 "pars0grm.cc"
break;
- case 139:
-#line 571 "pars0grm.y"
- { yyval = pars_variable_declaration(
+ case 134: /* variable_declaration: PARS_ID_TOKEN type_name ';' */
+#line 558 "pars0grm.y"
+ { yyval = pars_variable_declaration(
static_cast<sym_node_t*>(yyvsp[-2]),
static_cast<pars_res_word_t*>(yyvsp[-1])); }
-#line 2358 "pars0grm.cc"
+#line 2280 "pars0grm.cc"
break;
- case 143:
-#line 585 "pars0grm.y"
- { yyval = pars_cursor_declaration(
+ case 138: /* cursor_declaration: PARS_DECLARE_TOKEN PARS_CURSOR_TOKEN PARS_ID_TOKEN PARS_IS_TOKEN select_statement ';' */
+#line 572 "pars0grm.y"
+ { yyval = pars_cursor_declaration(
static_cast<sym_node_t*>(yyvsp[-3]),
static_cast<sel_node_t*>(yyvsp[-1])); }
-#line 2366 "pars0grm.cc"
+#line 2288 "pars0grm.cc"
break;
- case 144:
-#line 592 "pars0grm.y"
- { yyval = pars_function_declaration(
+ case 139: /* function_declaration: PARS_DECLARE_TOKEN PARS_FUNCTION_TOKEN PARS_ID_TOKEN ';' */
+#line 579 "pars0grm.y"
+ { yyval = pars_function_declaration(
static_cast<sym_node_t*>(yyvsp[-1])); }
-#line 2373 "pars0grm.cc"
+#line 2295 "pars0grm.cc"
break;
- case 150:
-#line 614 "pars0grm.y"
- { yyval = pars_procedure_definition(
+ case 145: /* procedure_definition: PARS_PROCEDURE_TOKEN PARS_ID_TOKEN '(' ')' PARS_IS_TOKEN variable_declaration_list declaration_list PARS_BEGIN_TOKEN statement_list PARS_END_TOKEN */
+#line 601 "pars0grm.y"
+ { yyval = pars_procedure_definition(
static_cast<sym_node_t*>(yyvsp[-8]), yyvsp[-1]); }
-#line 2380 "pars0grm.cc"
+#line 2302 "pars0grm.cc"
break;
-#line 2384 "pars0grm.cc"
+#line 2306 "pars0grm.cc"
default: break;
}
@@ -2399,11 +2321,10 @@ yyreduce:
case of YYERROR or YYBACKUP, subsequent parser actions might lead
to an incorrect destructor call or verbose syntax error message
before the lookahead is translated. */
- YY_SYMBOL_PRINT ("-> $$ =", yyr1[yyn], &yyval, &yyloc);
+ YY_SYMBOL_PRINT ("-> $$ =", YY_CAST (yysymbol_kind_t, yyr1[yyn]), &yyval, &yyloc);
YYPOPSTACK (yylen);
yylen = 0;
- YY_STACK_PRINT (yyss, yyssp);
*++yyvsp = yyval;
@@ -2427,50 +2348,14 @@ yyreduce:
yyerrlab:
/* Make sure we have latest lookahead translation. See comments at
user semantic actions for why this is necessary. */
- yytoken = yychar == YYEMPTY ? YYEMPTY : YYTRANSLATE (yychar);
-
+ yytoken = yychar == YYEMPTY ? YYSYMBOL_YYEMPTY : YYTRANSLATE (yychar);
/* If not already recovering from an error, report this error. */
if (!yyerrstatus)
{
++yynerrs;
-#if ! YYERROR_VERBOSE
yyerror (YY_("syntax error"));
-#else
-# define YYSYNTAX_ERROR yysyntax_error (&yymsg_alloc, &yymsg, \
- yyssp, yytoken)
- {
- char const *yymsgp = YY_("syntax error");
- int yysyntax_error_status;
- yysyntax_error_status = YYSYNTAX_ERROR;
- if (yysyntax_error_status == 0)
- yymsgp = yymsg;
- else if (yysyntax_error_status == 1)
- {
- if (yymsg != yymsgbuf)
- YYSTACK_FREE (yymsg);
- yymsg = (char *) YYSTACK_ALLOC (yymsg_alloc);
- if (!yymsg)
- {
- yymsg = yymsgbuf;
- yymsg_alloc = sizeof yymsgbuf;
- yysyntax_error_status = 2;
- }
- else
- {
- yysyntax_error_status = YYSYNTAX_ERROR;
- yymsgp = yymsg;
- }
- }
- yyerror (yymsgp);
- if (yysyntax_error_status == 2)
- goto yyexhaustedlab;
- }
-# undef YYSYNTAX_ERROR
-#endif
}
-
-
if (yyerrstatus == 3)
{
/* If just tried and failed to reuse lookahead token after an
@@ -2519,13 +2404,14 @@ yyerrorlab:
yyerrlab1:
yyerrstatus = 3; /* Each real token shifted decrements this. */
+ /* Pop stack until we find a state that shifts the error token. */
for (;;)
{
yyn = yypact[yystate];
if (!yypact_value_is_default (yyn))
{
- yyn += YYTERROR;
- if (0 <= yyn && yyn <= YYLAST && yycheck[yyn] == YYTERROR)
+ yyn += YYSYMBOL_YYerror;
+ if (0 <= yyn && yyn <= YYLAST && yycheck[yyn] == YYSYMBOL_YYerror)
{
yyn = yytable[yyn];
if (0 < yyn)
@@ -2539,7 +2425,7 @@ yyerrlab1:
yydestruct ("Error: popping",
- yystos[yystate], yyvsp);
+ YY_ACCESSING_SYMBOL (yystate), yyvsp);
YYPOPSTACK (1);
yystate = *yyssp;
YY_STACK_PRINT (yyss, yyssp);
@@ -2551,7 +2437,7 @@ yyerrlab1:
/* Shift the error token. */
- YY_SYMBOL_PRINT ("Shifting", yystos[yyn], yyvsp, yylsp);
+ YY_SYMBOL_PRINT ("Shifting", YY_ACCESSING_SYMBOL (yyn), yyvsp, yylsp);
yystate = yyn;
goto yynewstate;
@@ -2573,20 +2459,20 @@ yyabortlab:
goto yyreturn;
-#if !defined yyoverflow || YYERROR_VERBOSE
+#if !defined yyoverflow
/*-------------------------------------------------.
| yyexhaustedlab -- memory exhaustion comes here. |
`-------------------------------------------------*/
yyexhaustedlab:
yyerror (YY_("memory exhausted"));
yyresult = 2;
- /* Fall through. */
+ goto yyreturn;
#endif
-/*-----------------------------------------------------.
-| yyreturn -- parsing is finished, return the result. |
-`-----------------------------------------------------*/
+/*-------------------------------------------------------.
+| yyreturn -- parsing is finished, clean up and return. |
+`-------------------------------------------------------*/
yyreturn:
if (yychar != YYEMPTY)
{
@@ -2603,18 +2489,16 @@ yyreturn:
while (yyssp != yyss)
{
yydestruct ("Cleanup: popping",
- yystos[*yyssp], yyvsp);
+ YY_ACCESSING_SYMBOL (+*yyssp), yyvsp);
YYPOPSTACK (1);
}
#ifndef yyoverflow
if (yyss != yyssa)
YYSTACK_FREE (yyss);
#endif
-#if YYERROR_VERBOSE
- if (yymsg != yymsgbuf)
- YYSTACK_FREE (yymsg);
-#endif
+
return yyresult;
}
-#line 618 "pars0grm.y"
+
+#line 605 "pars0grm.y"
diff --git a/storage/innobase/pars/pars0grm.y b/storage/innobase/pars/pars0grm.y
index ed2b9bc09b0..baa7100cfde 100644
--- a/storage/innobase/pars/pars0grm.y
+++ b/storage/innobase/pars/pars0grm.y
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 1997, 2014, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2017, 2019, MariaDB Corporation.
+Copyright (c) 2017, 2021, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -144,8 +144,7 @@ top_statement:
procedure_definition ';'
statement:
- stored_procedure_call
- | while_statement ';'
+ while_statement ';'
| for_statement ';'
| exit_statement ';'
| if_statement ';'
@@ -212,18 +211,6 @@ function_name:
| PARS_LENGTH_TOKEN { $$ = &pars_length_token; }
;
-question_mark_list:
- /* Nothing */
- | '?'
- | question_mark_list ',' '?'
-;
-
-stored_procedure_call:
- '{' PARS_ID_TOKEN '(' question_mark_list ')' '}'
- { $$ = pars_stored_procedure_call(
- static_cast<sym_node_t*>($2)); }
-;
-
user_function_call:
PARS_ID_TOKEN '(' ')' { $$ = $1; }
;
diff --git a/storage/innobase/pars/pars0opt.cc b/storage/innobase/pars/pars0opt.cc
index e1a913b0179..f3b71132998 100644
--- a/storage/innobase/pars/pars0opt.cc
+++ b/storage/innobase/pars/pars0opt.cc
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 1997, 2016, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2019, MariaDB Corporation.
+Copyright (c) 2019, 2022, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -552,13 +552,8 @@ opt_search_plan_for_table(
{
plan_t* plan;
dict_index_t* index;
- dict_index_t* best_index;
ulint n_fields;
- ulint goodness;
- ulint last_op = 75946965; /* Eliminate a Purify
- warning */
- ulint best_goodness;
- ulint best_last_op = 0; /* remove warning */
+ ulint best_last_op;
que_node_t* index_plan[256];
que_node_t* best_index_plan[256];
@@ -571,30 +566,28 @@ opt_search_plan_for_table(
/* Calculate goodness for each index of the table */
- index = dict_table_get_first_index(table);
- best_index = index; /* Eliminate compiler warning */
- best_goodness = 0;
+ plan->index = index = dict_table_get_first_index(table);
+ ulint best_goodness = opt_calc_index_goodness(
+ index, sel_node, i, best_index_plan, &best_last_op);
- /* should be do ... until ? comment by Jani */
- while (index) {
- goodness = opt_calc_index_goodness(index, sel_node, i,
- index_plan, &last_op);
+ while ((index = dict_table_get_next_index(index))) {
+ if (!index->is_btree()) {
+ continue;
+ }
+ ulint last_op;
+ ulint goodness = opt_calc_index_goodness(index, sel_node, i,
+ index_plan, &last_op);
if (goodness > best_goodness) {
-
- best_index = index;
best_goodness = goodness;
+ plan->index = index;
n_fields = opt_calc_n_fields_from_goodness(goodness);
memcpy(best_index_plan, index_plan,
n_fields * sizeof *index_plan);
best_last_op = last_op;
}
-
- dict_table_next_uncorrupted_index(index);
}
- plan->index = best_index;
-
n_fields = opt_calc_n_fields_from_goodness(best_goodness);
if (n_fields == 0) {
@@ -612,27 +605,25 @@ opt_search_plan_for_table(
memcpy(plan->tuple_exps, best_index_plan,
n_fields * sizeof *best_index_plan);
- if (best_last_op == '='
- || best_last_op == PARS_LIKE_TOKEN_EXACT
- || best_last_op == PARS_LIKE_TOKEN_PREFIX
- || best_last_op == PARS_LIKE_TOKEN_SUFFIX
- || best_last_op == PARS_LIKE_TOKEN_SUBSTR) {
- plan->n_exact_match = n_fields;
- } else {
- plan->n_exact_match = n_fields - 1;
+
+ switch (best_last_op) {
+ case '=':
+ case PARS_LIKE_TOKEN_EXACT:
+ case PARS_LIKE_TOKEN_PREFIX:
+ case PARS_LIKE_TOKEN_SUFFIX:
+ case PARS_LIKE_TOKEN_SUBSTR:
+ break;
+ default:
+ n_fields--;
}
+ plan->n_exact_match = n_fields;
plan->mode = opt_op_to_search_mode(sel_node->asc,
best_last_op);
}
- if (dict_index_is_clust(best_index)
- && (plan->n_exact_match >= dict_index_get_n_unique(best_index))) {
-
- plan->unique_search = TRUE;
- } else {
- plan->unique_search = FALSE;
- }
+ plan->unique_search = plan->index->is_clust()
+ && plan->n_exact_match >= plan->index->n_uniq;
plan->old_vers_heap = NULL;
diff --git a/storage/innobase/pars/pars0pars.cc b/storage/innobase/pars/pars0pars.cc
index 2981e31c05f..61614007bd4 100644
--- a/storage/innobase/pars/pars0pars.cc
+++ b/storage/innobase/pars/pars0pars.cc
@@ -766,7 +766,7 @@ pars_retrieve_table_def(
sym_node->token_type = SYM_TABLE_REF_COUNTED;
sym_node->table = dict_table_open_on_name(
- sym_node->name, TRUE, FALSE, DICT_ERR_IGNORE_NONE);
+ sym_node->name, true, DICT_ERR_IGNORE_NONE);
ut_a(sym_node->table != NULL);
}
@@ -1783,8 +1783,9 @@ pars_create_table(
n_cols = que_node_list_get_len(column_defs);
- table = dict_mem_table_create(
- table_sym->name, NULL, n_cols, 0, flags, flags2);
+ table = dict_table_t::create(
+ {table_sym->name, strlen(table_sym->name)},
+ nullptr, n_cols, 0, flags, flags2);
mem_heap_t* heap = pars_sym_tab_global->heap;
column = column_defs;
@@ -1802,9 +1803,7 @@ pars_create_table(
}
dict_table_add_system_columns(table, heap);
- node = tab_create_graph_create(table, heap,
- FIL_ENCRYPTION_DEFAULT,
- FIL_DEFAULT_ENCRYPTION_KEY);
+ node = tab_create_graph_create(table, heap);
table_sym->resolved = TRUE;
table_sym->token_type = SYM_TABLE;
@@ -1858,7 +1857,9 @@ pars_create_index(
}
node = ind_create_graph_create(index, table_sym->name,
- pars_sym_tab_global->heap);
+ pars_sym_tab_global->heap,
+ FIL_ENCRYPTION_DEFAULT,
+ FIL_DEFAULT_ENCRYPTION_KEY);
table_sym->resolved = TRUE;
table_sym->token_type = SYM_TABLE;
@@ -1886,7 +1887,7 @@ pars_procedure_definition(
heap = pars_sym_tab_global->heap;
- fork = que_fork_create(NULL, NULL, QUE_FORK_PROCEDURE, heap);
+ fork = que_fork_create(heap);
fork->trx = NULL;
thr = que_thr_create(fork, heap, NULL);
@@ -1915,22 +1916,6 @@ pars_procedure_definition(
}
/*************************************************************//**
-Parses a stored procedure call, when this is not within another stored
-procedure, that is, the client issues a procedure call directly.
-In MySQL/InnoDB, stored InnoDB procedures are invoked via the
-parsed procedure tree, not via InnoDB SQL, so this function is not used.
-@return query graph */
-que_fork_t*
-pars_stored_procedure_call(
-/*=======================*/
- sym_node_t* sym_node MY_ATTRIBUTE((unused)))
- /*!< in: stored procedure name */
-{
- ut_error;
- return(NULL);
-}
-
-/*************************************************************//**
Retrieves characters to the lexical analyzer. */
int
pars_get_lex_chars(
@@ -1988,7 +1973,7 @@ pars_sql(
heap = mem_heap_create(16000);
/* Currently, the parser is not reentrant: */
- ut_ad(mutex_own(&dict_sys.mutex));
+ ut_ad(dict_sys.locked());
pars_sym_tab_global = sym_tab_create(heap);
@@ -2021,8 +2006,7 @@ pars_sql(
}
/** Completes a query graph by adding query thread and fork nodes
-above it and prepares the graph for running. The fork created is of
-type QUE_FORK_MYSQL_INTERFACE.
+above it and prepares the graph for running.
@param[in] node root node for an incomplete query
graph, or NULL for dummy graph
@param[in] trx transaction handle
@@ -2039,7 +2023,7 @@ pars_complete_graph_for_exec(
que_fork_t* fork;
que_thr_t* thr;
- fork = que_fork_create(NULL, NULL, QUE_FORK_MYSQL_INTERFACE, heap);
+ fork = que_fork_create(heap);
fork->trx = trx;
thr = que_thr_create(fork, heap, prebuilt);
@@ -2067,28 +2051,14 @@ pars_info_create(void)
heap = mem_heap_create(512);
- info = static_cast<pars_info_t*>(mem_heap_alloc(heap, sizeof(*info)));
+ info = static_cast<pars_info_t*>(mem_heap_zalloc(heap, sizeof(*info)));
info->heap = heap;
- info->funcs = NULL;
- info->bound_lits = NULL;
- info->bound_ids = NULL;
- info->graph_owns_us = TRUE;
return(info);
}
/****************************************************************//**
-Free info struct and everything it contains. */
-void
-pars_info_free(
-/*===========*/
- pars_info_t* info) /*!< in, own: info struct */
-{
- mem_heap_free(info->heap);
-}
-
-/****************************************************************//**
Add bound literal. */
void
pars_info_add_literal(
diff --git a/storage/innobase/pars/pars0sym.cc b/storage/innobase/pars/pars0sym.cc
index 5e4c0e0f6e0..035415849a7 100644
--- a/storage/innobase/pars/pars0sym.cc
+++ b/storage/innobase/pars/pars0sym.cc
@@ -67,8 +67,6 @@ sym_tab_free_private(
sym_node_t* sym;
func_node_t* func;
- ut_ad(mutex_own(&dict_sys.mutex));
-
for (sym = UT_LIST_GET_FIRST(sym_tab->sym_list);
sym != NULL;
sym = UT_LIST_GET_NEXT(sym_list, sym)) {
@@ -76,8 +74,7 @@ sym_tab_free_private(
/* Close the tables opened in pars_retrieve_table_def(). */
if (sym->token_type == SYM_TABLE_REF_COUNTED) {
-
- dict_table_close(sym->table, TRUE, FALSE);
+ sym->table->release();
sym->table = NULL;
sym->resolved = FALSE;
diff --git a/storage/innobase/que/que0que.cc b/storage/innobase/que/que0que.cc
index 759c2cb95e2..d910ee2a881 100644
--- a/storage/innobase/que/que0que.cc
+++ b/storage/innobase/que/que0que.cc
@@ -90,7 +90,7 @@ The commit or rollback can be seen as a subprocedure call.
When the transaction starts to handle a rollback or commit.
It builds a query graph which, when executed, will roll back
or commit the incomplete transaction. The transaction
-is moved to the TRX_QUE_ROLLING_BACK or TRX_QUE_COMMITTING state.
+may be moved to the TRX_QUE_ROLLING_BACK state.
If specified, the SQL cursors opened by the transaction are closed.
When the execution of the graph completes, it is like returning
from a subprocedure: the query thread which requested the operation
@@ -100,14 +100,7 @@ starts running again. */
Creates a query graph fork node.
@return own: fork node */
que_fork_t*
-que_fork_create(
-/*============*/
- que_t* graph, /*!< in: graph, if NULL then this
- fork node is assumed to be the
- graph root */
- que_node_t* parent, /*!< in: parent node */
- ulint fork_type, /*!< in: fork type */
- mem_heap_t* heap) /*!< in: memory heap where created */
+que_fork_create(mem_heap_t *heap)
{
que_fork_t* fork;
@@ -117,15 +110,11 @@ que_fork_create(
fork->heap = heap;
- fork->fork_type = fork_type;
-
- fork->common.parent = parent;
-
fork->common.type = QUE_NODE_FORK;
fork->state = QUE_FORK_COMMAND_WAIT;
- fork->graph = (graph != NULL) ? graph : fork;
+ fork->graph = fork;
UT_LIST_INIT(fork->thrs, &que_thr_t::thrs);
@@ -157,10 +146,6 @@ que_thr_create(
thr->common.type = QUE_NODE_THR;
- thr->state = QUE_THR_COMMAND_WAIT;
-
- thr->lock_state = QUE_THR_LOCK_NOLOCK;
-
thr->prebuilt = prebuilt;
UT_LIST_ADD_LAST(parent->thrs, thr);
@@ -169,45 +154,6 @@ que_thr_create(
}
/**********************************************************************//**
-Moves a suspended query thread to the QUE_THR_RUNNING state and may release
-a worker thread to execute it. This function should be used to end
-the wait state of a query thread waiting for a lock or a stored procedure
-completion.
-@return the query thread that needs to be released. */
-que_thr_t*
-que_thr_end_lock_wait(
-/*==================*/
- trx_t* trx) /*!< in: transaction with que_state in
- QUE_THR_LOCK_WAIT */
-{
- que_thr_t* thr;
-
- ut_ad(lock_mutex_own());
- ut_ad(trx_mutex_own(trx));
-
- thr = trx->lock.wait_thr;
-
- ut_ad(thr != NULL);
-
- ut_ad(trx->lock.que_state == TRX_QUE_LOCK_WAIT);
- /* In MySQL this is the only possible state here */
- ut_a(thr->state == QUE_THR_LOCK_WAIT);
-
- bool was_active = thr->is_active;
-
- thr->start_running();
-
- trx->lock.que_state = TRX_QUE_RUNNING;
-
- trx->lock.wait_thr = NULL;
-
- /* In MySQL we let the OS thread (not just the query thread) to wait
- for the lock to be released: */
-
- return((!was_active && thr != NULL) ? thr : NULL);
-}
-
-/**********************************************************************//**
Inits a query thread for a command. */
UNIV_INLINE
void
@@ -217,7 +163,7 @@ que_thr_init_command(
{
thr->run_node = thr;
thr->prev_node = thr->common.parent;
- thr->start_running();
+ thr->state = QUE_THR_RUNNING;
}
/**********************************************************************//**
@@ -231,7 +177,7 @@ que_fork_scheduler_round_robin(
que_fork_t* fork, /*!< in: a query fork */
que_thr_t* thr) /*!< in: current pos */
{
- trx_mutex_enter(fork->trx);
+ fork->trx->mutex_lock();
/* If no current, start first available. */
if (thr == NULL) {
@@ -245,23 +191,11 @@ que_fork_scheduler_round_robin(
fork->state = QUE_FORK_ACTIVE;
fork->last_sel_node = NULL;
-
- switch (thr->state) {
- case QUE_THR_COMMAND_WAIT:
- case QUE_THR_COMPLETED:
- ut_a(!thr->is_active);
- que_thr_init_command(thr);
- break;
-
- case QUE_THR_SUSPENDED:
- case QUE_THR_LOCK_WAIT:
- default:
- ut_error;
-
- }
+ ut_ad(thr->state == QUE_THR_COMPLETED);
+ que_thr_init_command(thr);
}
- trx_mutex_exit(fork->trx);
+ fork->trx->mutex_unlock();
return(thr);
}
@@ -279,73 +213,15 @@ que_fork_start_command(
/*===================*/
que_fork_t* fork) /*!< in: a query fork */
{
- que_thr_t* thr;
- que_thr_t* suspended_thr = NULL;
- que_thr_t* completed_thr = NULL;
-
fork->state = QUE_FORK_ACTIVE;
fork->last_sel_node = NULL;
- suspended_thr = NULL;
- completed_thr = NULL;
-
- /* Choose the query thread to run: usually there is just one thread,
- but in a parallelized select, which necessarily is non-scrollable,
- there may be several to choose from */
-
- /* First we try to find a query thread in the QUE_THR_COMMAND_WAIT
- state. Then we try to find a query thread in the QUE_THR_SUSPENDED
- state, finally we try to find a query thread in the QUE_THR_COMPLETED
- state */
-
- /* We make a single pass over the thr list within which we note which
- threads are ready to run. */
- for (thr = UT_LIST_GET_FIRST(fork->thrs);
- thr != NULL;
- thr = UT_LIST_GET_NEXT(thrs, thr)) {
-
- switch (thr->state) {
- case QUE_THR_COMMAND_WAIT:
-
- /* We have to send the initial message to query thread
- to start it */
-
- que_thr_init_command(thr);
+ que_thr_t* thr = UT_LIST_GET_FIRST(fork->thrs);
- return(thr);
-
- case QUE_THR_SUSPENDED:
- /* In this case the execution of the thread was
- suspended: no initial message is needed because
- execution can continue from where it was left */
- if (!suspended_thr) {
- suspended_thr = thr;
- }
-
- break;
-
- case QUE_THR_COMPLETED:
- if (!completed_thr) {
- completed_thr = thr;
- }
-
- break;
-
- case QUE_THR_RUNNING:
- case QUE_THR_LOCK_WAIT:
- ut_error;
- }
- }
-
- if (suspended_thr) {
- thr = suspended_thr;
- thr->start_running();
- } else if (completed_thr) {
- thr = completed_thr;
+ if (thr) {
+ ut_ad(thr->state == QUE_THR_COMPLETED);
que_thr_init_command(thr);
- } else {
- ut_error;
}
return(thr);
@@ -437,13 +313,9 @@ que_graph_free_recursive(
case QUE_NODE_UPDATE:
upd = static_cast<upd_node_t*>(node);
- if (upd->in_mysql_interface) {
-
- btr_pcur_free_for_mysql(upd->pcur);
- upd->in_mysql_interface = false;
- }
-
que_graph_free_recursive(upd->cascade_node);
+ ut_free(upd->pcur->old_rec_buf);
+ upd->pcur->old_rec_buf = NULL;
if (upd->cascade_heap) {
mem_heap_free(upd->cascade_heap);
@@ -541,7 +413,7 @@ que_graph_free(
sym_tab_free_private(graph->sym_tab);
}
- if (graph->info && graph->info->graph_owns_us) {
+ if (graph->info) {
pars_info_free(graph->info);
}
@@ -571,186 +443,17 @@ que_thr_node_step(
return(thr);
}
- trx_mutex_enter(thr_get_trx(thr));
-
- if (que_thr_peek_stop(thr)) {
-
- trx_mutex_exit(thr_get_trx(thr));
-
- return(thr);
- }
-
- /* Thread execution completed */
-
- thr->state = QUE_THR_COMPLETED;
-
- trx_mutex_exit(thr_get_trx(thr));
-
- return(NULL);
-}
-
-/**********************************************************************//**
-Stops a query thread if graph or trx is in a state requiring it. The
-conditions are tested in the order (1) graph, (2) trx.
-@return TRUE if stopped */
-ibool
-que_thr_stop(
-/*=========*/
- que_thr_t* thr) /*!< in: query thread */
-{
- que_t* graph;
- trx_t* trx = thr_get_trx(thr);
-
- graph = thr->graph;
+ trx_t *trx= thr->graph->trx;
+ trx->mutex_lock();
- ut_ad(trx_mutex_own(trx));
-
- if (graph->state == QUE_FORK_COMMAND_WAIT) {
-
- thr->state = QUE_THR_SUSPENDED;
-
- } else if (trx->lock.que_state == TRX_QUE_LOCK_WAIT) {
-
- trx->lock.wait_thr = thr;
- thr->state = QUE_THR_LOCK_WAIT;
-
- } else if (trx->error_state != DB_SUCCESS
- && trx->error_state != DB_LOCK_WAIT) {
-
- /* Error handling built for the MySQL interface */
+ if (!trx->lock.wait_thr && thr->graph->state == QUE_FORK_ACTIVE) {
thr->state = QUE_THR_COMPLETED;
-
- } else if (graph->fork_type == QUE_FORK_ROLLBACK) {
-
- thr->state = QUE_THR_SUSPENDED;
- } else {
- ut_ad(graph->state == QUE_FORK_ACTIVE);
-
- return(FALSE);
- }
-
- return(TRUE);
-}
-
-/**********************************************************************//**
-Decrements the query thread reference counts in the query graph and the
-transaction.
-*** NOTE ***:
-This and que_thr_stop_for_mysql are the only functions where the reference
-count can be decremented and this function may only be called from inside
-que_run_threads! These restrictions exist to make the rollback code easier
-to maintain. */
-static
-void
-que_thr_dec_refer_count(
-/*====================*/
- que_thr_t* thr, /*!< in: query thread */
- que_thr_t** next_thr) /*!< in/out: next query thread to run;
- if the value which is passed in is
- a pointer to a NULL pointer, then the
- calling function can start running
- a new query thread */
-{
- trx_t* trx;
-
- trx = thr_get_trx(thr);
-
- ut_a(thr->is_active);
- ut_ad(trx_mutex_own(trx));
-
- if (thr->state == QUE_THR_RUNNING) {
-
- if (!que_thr_stop(thr)) {
-
- ut_a(next_thr != NULL && *next_thr == NULL);
-
- /* The reason for the thr suspension or wait was
- already canceled before we came here: continue
- running the thread.
-
- This is also possible because in trx_commit_step() we
- assume a single query thread. We set the query thread
- state to QUE_THR_RUNNING. */
-
- /* fprintf(stderr,
- "Wait already ended: trx: %p\n", trx); */
-
- /* Normally srv_suspend_mysql_thread resets
- the state to DB_SUCCESS before waiting, but
- in this case we have to do it here,
- otherwise nobody does it. */
-
- trx->error_state = DB_SUCCESS;
-
- *next_thr = thr;
-
- return;
- }
- }
-
- ut_d(static_cast<que_fork_t*>(thr->common.parent)->set_active(false));
- thr->is_active = false;
-}
-
-/**********************************************************************//**
-A patch for MySQL used to 'stop' a dummy query thread used in MySQL. The
-query thread is stopped and made inactive, except in the case where
-it was put to the lock wait state in lock0lock.cc, but the lock has already
-been granted or the transaction chosen as a victim in deadlock resolution. */
-void
-que_thr_stop_for_mysql(
-/*===================*/
- que_thr_t* thr) /*!< in: query thread */
-{
- trx_t* trx;
-
- trx = thr_get_trx(thr);
-
- trx_mutex_enter(trx);
-
- if (thr->state == QUE_THR_RUNNING) {
-
- if (trx->error_state != DB_SUCCESS
- && trx->error_state != DB_LOCK_WAIT) {
-
- /* Error handling built for the MySQL interface */
- thr->state = QUE_THR_COMPLETED;
- } else {
- /* It must have been a lock wait but the lock was
- already released, or this transaction was chosen
- as a victim in selective deadlock resolution */
-
- trx_mutex_exit(trx);
-
- return;
- }
+ thr = NULL;
}
- ut_ad(thr->is_active);
- ut_d(thr->set_active(false));
- thr->is_active= false;
-
- trx_mutex_exit(trx);
-}
-
-#ifdef UNIV_DEBUG
-/** Change the 'active' status */
-void que_fork_t::set_active(bool active)
-{
- if (active)
- {
- n_active_thrs++;
- trx->lock.n_active_thrs++;
- }
- else
- {
- ut_ad(n_active_thrs);
- ut_ad(trx->lock.n_active_thrs);
- n_active_thrs--;
- trx->lock.n_active_thrs--;
- }
+ trx->mutex_unlock();
+ return(thr);
}
-#endif
/****************************************************************//**
Get the first containing loop node (e.g. while_node_t or for_node_t) for the
@@ -783,6 +486,26 @@ que_node_get_containing_loop_node(
}
/**********************************************************************//**
+Performs an execution step of an open or close cursor statement node.
+@param thr query thread */
+static void open_step(que_thr_t *thr)
+{
+ open_node_t *node= static_cast<open_node_t*>(thr->run_node);
+ ut_ad(que_node_get_type(node) == QUE_NODE_OPEN);
+ sel_node_t *sel_node= node->cursor_def;
+
+ if (node->op_type == ROW_SEL_OPEN_CURSOR)
+ sel_node->state= SEL_NODE_OPEN;
+ else
+ {
+ ut_ad(sel_node->state != SEL_NODE_CLOSED);
+ sel_node->state= SEL_NODE_CLOSED;
+ }
+
+ thr->run_node= que_node_get_parent(node);
+}
+
+/**********************************************************************//**
Performs an execution step on a query thread.
@return query thread to run next: it may differ from the input
parameter if, e.g., a subprocedure call is made */
@@ -848,7 +571,7 @@ que_thr_step(
} else if (type == QUE_NODE_FETCH) {
thr = fetch_step(thr);
} else if (type == QUE_NODE_OPEN) {
- thr = open_step(thr);
+ open_step(thr);
} else if (type == QUE_NODE_FUNC) {
proc_eval_step(thr);
@@ -900,19 +623,14 @@ que_run_threads_low(
/*================*/
que_thr_t* thr) /*!< in: query thread */
{
- trx_t* trx;
- que_thr_t* next_thr;
-
ut_ad(thr->state == QUE_THR_RUNNING);
- ut_a(thr_get_trx(thr)->error_state == DB_SUCCESS);
- ut_ad(!trx_mutex_own(thr_get_trx(thr)));
/* cumul_resource counts how much resources the OS thread (NOT the
query thread) has spent in this function */
- trx = thr_get_trx(thr);
-
- do {
+ for (trx_t* trx = thr_get_trx(thr);;) {
+ ut_ad(!trx->mutex_is_owner());
+ ut_a(trx->error_state == DB_SUCCESS);
/* Check that there is enough space in the log to accommodate
possible log entries by this query step; if the operation can
touch more than about 4 pages, checks must be made also within
@@ -923,33 +641,14 @@ que_run_threads_low(
/* Perform the actual query step: note that the query thread
may change if, e.g., a subprocedure call is made */
- /*-------------------------*/
- next_thr = que_thr_step(thr);
- /*-------------------------*/
-
- trx_mutex_enter(trx);
-
- ut_a(next_thr == NULL || trx->error_state == DB_SUCCESS);
-
- if (next_thr != thr) {
- ut_a(next_thr == NULL);
-
- /* This can change next_thr to a non-NULL value
- if there was a lock wait that already completed. */
-
- que_thr_dec_refer_count(thr, &next_thr);
-
- if (next_thr != NULL) {
-
- thr = next_thr;
- }
- }
-
+ que_thr_t* next_thr = que_thr_step(thr);
ut_ad(trx == thr_get_trx(thr));
+ if (!next_thr) {
+ return;
+ }
- trx_mutex_exit(trx);
-
- } while (next_thr != NULL);
+ ut_a(next_thr == thr);
+ }
}
/**********************************************************************//**
@@ -959,47 +658,20 @@ que_run_threads(
/*============*/
que_thr_t* thr) /*!< in: query thread */
{
- ut_ad(!trx_mutex_own(thr_get_trx(thr)));
-
+ trx_t* trx = thr->graph->trx;
loop:
- ut_a(thr_get_trx(thr)->error_state == DB_SUCCESS);
-
+ ut_a(trx->error_state == DB_SUCCESS);
que_run_threads_low(thr);
- switch (thr->state) {
-
- case QUE_THR_RUNNING:
- /* There probably was a lock wait, but it already ended
- before we came here: continue running thr */
-
- goto loop;
-
- case QUE_THR_LOCK_WAIT:
- lock_wait_suspend_thread(thr);
-
- trx_mutex_enter(thr_get_trx(thr));
-
- ut_a(thr_get_trx(thr)->id != 0);
-
- if (thr_get_trx(thr)->error_state != DB_SUCCESS) {
- /* thr was chosen as a deadlock victim or there was
- a lock wait timeout */
-
- que_thr_dec_refer_count(thr, NULL);
- trx_mutex_exit(thr_get_trx(thr));
- break;
+ if (thr->state != QUE_THR_COMPLETED) {
+ if (trx->lock.wait_thr) {
+ ut_ad(trx->id);
+ if (lock_wait(thr) == DB_SUCCESS) {
+ goto loop;
+ }
+ } else if (trx->error_state == DB_SUCCESS) {
+ goto loop;
}
-
- trx_mutex_exit(thr_get_trx(thr));
- goto loop;
-
- case QUE_THR_COMPLETED:
- case QUE_THR_COMMAND_WAIT:
- /* Do nothing */
- break;
-
- default:
- ut_error;
}
}
@@ -1011,9 +683,6 @@ que_eval_sql(
/*=========*/
pars_info_t* info, /*!< in: info struct, or NULL */
const char* sql, /*!< in: SQL string */
- bool reserve_dict_mutex,
- /*!< in: whether to acquire/release
- dict_sys.mutex around call to pars_sql. */
trx_t* trx) /*!< in: trx */
{
que_thr_t* thr;
@@ -1024,34 +693,16 @@ que_eval_sql(
ut_a(trx->error_state == DB_SUCCESS);
- if (reserve_dict_mutex) {
- mutex_enter(&dict_sys.mutex);
- }
-
graph = pars_sql(info, sql);
- if (reserve_dict_mutex) {
- mutex_exit(&dict_sys.mutex);
- }
-
graph->trx = trx;
trx->graph = NULL;
- graph->fork_type = QUE_FORK_MYSQL_INTERFACE;
-
ut_a(thr = que_fork_start_command(graph));
que_run_threads(thr);
- if (reserve_dict_mutex) {
- mutex_enter(&dict_sys.mutex);
- }
-
que_graph_free(graph);
- if (reserve_dict_mutex) {
- mutex_exit(&dict_sys.mutex);
- }
-
DBUG_RETURN(trx->error_state);
}
diff --git a/storage/innobase/read/read0read.cc b/storage/innobase/read/read0read.cc
index 9047618d01c..97eda7dba32 100644
--- a/storage/innobase/read/read0read.cc
+++ b/storage/innobase/read/read0read.cc
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 1996, 2015, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2018, 2021, MariaDB Corporation.
+Copyright (c) 2018, 2022, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -173,9 +173,22 @@ For details see: row_vers_old_has_index_entry() and row_purge_poss_sec()
inline void ReadViewBase::snapshot(trx_t *trx)
{
trx_sys.snapshot_ids(trx, &m_ids, &m_low_limit_id, &m_low_limit_no);
+ if (m_ids.empty())
+ {
+ m_up_limit_id= m_low_limit_id;
+ return;
+ }
+
std::sort(m_ids.begin(), m_ids.end());
- m_up_limit_id= m_ids.empty() ? m_low_limit_id : m_ids.front();
+ m_up_limit_id= m_ids.front();
ut_ad(m_up_limit_id <= m_low_limit_id);
+
+ if (m_low_limit_no == m_low_limit_id &&
+ m_low_limit_id == m_up_limit_id + m_ids.size())
+ {
+ m_ids.clear();
+ m_low_limit_id= m_low_limit_no= m_up_limit_id;
+ }
}
@@ -226,10 +239,10 @@ void ReadView::open(trx_t *trx)
m_open.store(true, std::memory_order_relaxed);
else
{
- mutex_enter(&m_mutex);
+ m_mutex.wr_lock();
snapshot(trx);
m_open.store(true, std::memory_order_relaxed);
- mutex_exit(&m_mutex);
+ m_mutex.wr_unlock();
}
}
}
diff --git a/storage/innobase/rem/rem0cmp.cc b/storage/innobase/rem/rem0cmp.cc
index 89eea764e06..536ea224d18 100644
--- a/storage/innobase/rem/rem0cmp.cc
+++ b/storage/innobase/rem/rem0cmp.cc
@@ -281,15 +281,13 @@ static int cmp_data(ulint mtype, ulint prtype, const byte *data1, ulint len1,
/* fall through */
case DATA_VARMYSQL:
DBUG_ASSERT(is_strnncoll_compatible(prtype & DATA_MYSQL_TYPE_MASK));
- if (CHARSET_INFO *cs= get_charset(dtype_get_charset_coll(prtype),
- MYF(MY_WME)))
+ if (CHARSET_INFO *cs= all_charsets[dtype_get_charset_coll(prtype)])
return cs->coll->strnncollsp(cs, data1, len1, data2, len2);
no_collation:
ib::fatal() << "Unable to find charset-collation for " << prtype;
case DATA_MYSQL:
DBUG_ASSERT(is_strnncoll_compatible(prtype & DATA_MYSQL_TYPE_MASK));
- if (CHARSET_INFO *cs= get_charset(dtype_get_charset_coll(prtype),
- MYF(MY_WME)))
+ if (CHARSET_INFO *cs= all_charsets[dtype_get_charset_coll(prtype)])
return cs->coll->strnncollsp_nchars(cs, data1, len1, data2, len2,
std::max(len1, len2),
MY_STRNNCOLLSP_NCHARS_EMULATE_TRIMMED_TRAILING_SPACES);
diff --git a/storage/innobase/rem/rem0rec.cc b/storage/innobase/rem/rem0rec.cc
index d202afa9e20..766f5000a0d 100644
--- a/storage/innobase/rem/rem0rec.cc
+++ b/storage/innobase/rem/rem0rec.cc
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2017, 2021, MariaDB Corporation.
+Copyright (c) 2017, 2022, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
diff --git a/storage/innobase/row/row0ftsort.cc b/storage/innobase/row/row0ftsort.cc
index bc93ca25195..cc8844c3bd4 100644
--- a/storage/innobase/row/row0ftsort.cc
+++ b/storage/innobase/row/row0ftsort.cc
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 2010, 2016, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2015, 2021, MariaDB Corporation.
+Copyright (c) 2015, 2022, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -216,7 +216,7 @@ row_fts_psort_info_init(
common_info->old_zip_size = old_zip_size;
common_info->trx = trx;
common_info->all_info = psort_info;
- common_info->sort_event = os_event_create(0);
+ pthread_cond_init(&common_info->sort_cond, nullptr);
common_info->opt_doc_id_size = opt_doc_id_size;
if (log_tmp_is_encrypted()) {
@@ -285,7 +285,7 @@ row_fts_psort_info_init(
psort_info[j].psort_common = common_info;
psort_info[j].error = DB_SUCCESS;
psort_info[j].memory_used = 0;
- mutex_create(LATCH_ID_FTS_PLL_TOKENIZE, &psort_info[j].mutex);
+ mysql_mutex_init(0, &psort_info[j].mutex, nullptr);
}
/* Initialize merge_info structures parallel merge and insert
@@ -332,10 +332,10 @@ row_fts_psort_info_destroy(
aligned_free(psort_info[j].crypt_block[i]);
}
- mutex_free(&psort_info[j].mutex);
+ mysql_mutex_destroy(&psort_info[j].mutex);
}
- os_event_destroy(merge_info[0].psort_common->sort_event);
+ pthread_cond_destroy(&merge_info[0].psort_common->sort_cond);
ut_free(merge_info[0].psort_common->dup);
ut_free(merge_info[0].psort_common);
ut_free(psort_info);
@@ -721,7 +721,7 @@ row_merge_fts_get_next_doc_item(
ut_free(*doc_item);
}
- mutex_enter(&psort_info->mutex);
+ mysql_mutex_lock(&psort_info->mutex);
*doc_item = UT_LIST_GET_FIRST(psort_info->fts_doc_list);
if (*doc_item != NULL) {
@@ -733,7 +733,7 @@ row_merge_fts_get_next_doc_item(
+ (*doc_item)->field->len;
}
- mutex_exit(&psort_info->mutex);
+ mysql_mutex_unlock(&psort_info->mutex);
}
/*********************************************************************//**
@@ -917,7 +917,7 @@ loop:
}
if (doc_item == NULL) {
- os_thread_yield();
+ std::this_thread::yield();
}
row_merge_fts_get_next_doc_item(psort_info, &doc_item);
@@ -1032,9 +1032,9 @@ func_exit:
mem_heap_free(blob_heap);
- mutex_enter(&psort_info->mutex);
+ mysql_mutex_lock(&psort_info->mutex);
psort_info->error = error;
- mutex_exit(&psort_info->mutex);
+ mysql_mutex_unlock(&psort_info->mutex);
if (UT_LIST_GET_LEN(psort_info->fts_doc_list) > 0) {
/* child can exit either with error or told by parent. */
@@ -1047,9 +1047,10 @@ func_exit:
row_merge_fts_get_next_doc_item(psort_info, &doc_item);
} while (doc_item != NULL);
+ mysql_mutex_lock(&psort_info->mutex);
psort_info->child_status = FTS_CHILD_COMPLETE;
- os_event_set(psort_info->psort_common->sort_event);
- psort_info->child_status = FTS_CHILD_EXITING;
+ pthread_cond_signal(&psort_info->psort_common->sort_cond);
+ mysql_mutex_unlock(&psort_info->mutex);
}
/*********************************************************************//**
@@ -1632,7 +1633,7 @@ row_fts_merge_insert(
/* Get aux index */
fts_get_table_name(&fts_table, aux_table_name);
- aux_table = dict_table_open_on_name(aux_table_name, FALSE, FALSE,
+ aux_table = dict_table_open_on_name(aux_table_name, false,
DICT_ERR_IGNORE_NONE);
ut_ad(aux_table != NULL);
aux_index = dict_table_get_first_index(aux_table);
@@ -1768,7 +1769,7 @@ exit:
error = ins_ctx.btr_bulk->finish(error);
UT_DELETE(ins_ctx.btr_bulk);
- dict_table_close(aux_table, FALSE, FALSE);
+ aux_table->release();
trx->free();
diff --git a/storage/innobase/row/row0import.cc b/storage/innobase/row/row0import.cc
index 3c29461c19d..4afe9e874bb 100644
--- a/storage/innobase/row/row0import.cc
+++ b/storage/innobase/row/row0import.cc
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 2012, 2016, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2015, 2022, MariaDB Corporation.
+Copyright (c) 2015, 2023, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -32,15 +32,15 @@ Created 2012-02-08 by Sunny Bains.
#include "que0que.h"
#include "dict0boot.h"
#include "dict0load.h"
-#include "ibuf0ibuf.h"
#include "pars0pars.h"
+#include "row0row.h"
#include "row0sel.h"
#include "row0mysql.h"
#include "srv0start.h"
#include "row0quiesce.h"
#include "fil0pagecompress.h"
#include "trx0undo.h"
-#include "row0row.h"
+#include "lock0lock.h"
#ifdef HAVE_LZO
#include "lzo/lzo1x.h"
#endif
@@ -258,19 +258,18 @@ public:
}
/** Position the cursor on the first user record. */
- void open(buf_block_t* block) UNIV_NOTHROW
+ rec_t* open(buf_block_t* block, const dict_index_t* index) noexcept
+ MY_ATTRIBUTE((warn_unused_result))
{
+ m_cur.index = const_cast<dict_index_t*>(index);
page_cur_set_before_first(block, &m_cur);
-
- if (!end()) {
- next();
- }
+ return next();
}
/** Move to the next record. */
- void next() UNIV_NOTHROW
+ rec_t* next() noexcept MY_ATTRIBUTE((warn_unused_result))
{
- page_cur_move_to_next(&m_cur);
+ return page_cur_move_to_next(&m_cur);
}
/**
@@ -292,37 +291,36 @@ public:
/** Remove the current record
@return true on success */
- bool remove(
- const dict_index_t* index,
- rec_offs* offsets) UNIV_NOTHROW
+ bool remove(rec_offs* offsets) UNIV_NOTHROW
{
- ut_ad(page_is_leaf(m_cur.block->frame));
+ const dict_index_t* const index = m_cur.index;
+ ut_ad(page_is_leaf(m_cur.block->page.frame));
/* We can't end up with an empty page unless it is root. */
- if (page_get_n_recs(m_cur.block->frame) <= 1) {
+ if (page_get_n_recs(m_cur.block->page.frame) <= 1) {
return(false);
}
if (!rec_offs_any_extern(offsets)
&& m_cur.block->page.id().page_no() != index->page
- && ((page_get_data_size(m_cur.block->frame)
+ && ((page_get_data_size(m_cur.block->page.frame)
- rec_offs_size(offsets)
< BTR_CUR_PAGE_COMPRESS_LIMIT(index))
- || !page_has_siblings(m_cur.block->frame)
- || (page_get_n_recs(m_cur.block->frame) < 2))) {
+ || !page_has_siblings(m_cur.block->page.frame)
+ || (page_get_n_recs(m_cur.block->page.frame) < 2))) {
return false;
}
#ifdef UNIV_ZIP_DEBUG
page_zip_des_t* page_zip = buf_block_get_page_zip(m_cur.block);
ut_a(!page_zip || page_zip_validate(
- page_zip, m_cur.block->frame, index));
+ page_zip, m_cur.block->page.frame, index));
#endif /* UNIV_ZIP_DEBUG */
- page_cur_delete_rec(&m_cur, index, offsets, &m_mtr);
+ page_cur_delete_rec(&m_cur, offsets, &m_mtr);
#ifdef UNIV_ZIP_DEBUG
ut_a(!page_zip || page_zip_validate(
- page_zip, m_cur.block->frame, index));
+ page_zip, m_cur.block->page.frame, index));
#endif /* UNIV_ZIP_DEBUG */
return true;
@@ -370,24 +368,23 @@ public:
}
private:
- /** Begin import, position the cursor on the first record. */
- void open() UNIV_NOTHROW;
+ /** Begin import, position the cursor on the first record. */
+ inline bool open() noexcept;
- /** Close the persistent curosr and commit the mini-transaction. */
- void close() UNIV_NOTHROW;
+ /** Close the persistent cursor and commit the mini-transaction. */
+ void close() noexcept { m_mtr.commit(); btr_pcur_close(&m_pcur); }
- /** Position the cursor on the next record.
- @return DB_SUCCESS or error code */
- dberr_t next() UNIV_NOTHROW;
+ /** Position the cursor on the next record.
+ @return DB_SUCCESS or error code */
+ dberr_t next() noexcept;
- /** Store the persistent cursor position and reopen the
- B-tree cursor in BTR_MODIFY_TREE mode, because the
- tree structure may be changed during a pessimistic delete. */
- void purge_pessimistic_delete() UNIV_NOTHROW;
+ /** Store the persistent cursor position and reopen the
+ B-tree cursor in BTR_MODIFY_TREE mode, because the
+ tree structure may be changed during a pessimistic delete. */
+ inline dberr_t purge_pessimistic_delete() noexcept;
- /** Purge delete-marked records.
- @param offsets current row offsets. */
- void purge() UNIV_NOTHROW;
+ /** Purge a delete-marked record. */
+ dberr_t purge() noexcept;
protected:
// Disable copying
@@ -468,7 +465,7 @@ public:
Called for every page in the tablespace. If the page was not
updated then its state must be set to BUF_PAGE_NOT_USED. For
compressed tables the page descriptor memory will be at offset:
- block->frame + srv_page_size;
+ block->page.frame + srv_page_size;
@param block block read from file, note it is not from the buffer pool
@retval DB_SUCCESS or error code. */
virtual dberr_t operator()(buf_block_t* block) UNIV_NOTHROW = 0;
@@ -485,7 +482,7 @@ public:
static byte* get_frame(const buf_block_t* block)
{
return block->page.zip.data
- ? block->page.zip.data : block->frame;
+ ? block->page.zip.data : block->page.frame;
}
/** Invoke the functionality for the callback */
@@ -618,7 +615,7 @@ AbstractCallback::init(
os_offset_t file_size,
const buf_block_t* block) UNIV_NOTHROW
{
- const page_t* page = block->frame;
+ const page_t* page = block->page.frame;
m_space_flags = fsp_header_get_flags(page);
if (!fil_space_t::is_valid_flags(m_space_flags, true)) {
@@ -757,7 +754,7 @@ dberr_t FetchIndexRootPages::operator()(buf_block_t* block) UNIV_NOTHROW
return(DB_CORRUPTION);
}
- if (!page_is_comp(block->frame) !=
+ if (!page_is_comp(block->page.frame) !=
!dict_table_is_comp(m_table)) {
ib_errf(m_trx->mysql_thd, IB_LOG_LEVEL_ERROR,
ER_TABLE_SCHEMA_MISMATCH,
@@ -1458,8 +1455,6 @@ row_import::set_root_by_heuristic() UNIV_NOTHROW
" the tablespace has " << m_n_indexes << " indexes";
}
- dict_mutex_enter_for_mysql();
-
ulint i = 0;
dberr_t err = DB_SUCCESS;
@@ -1499,8 +1494,6 @@ row_import::set_root_by_heuristic() UNIV_NOTHROW
}
}
- dict_mutex_exit_for_mysql();
-
return(err);
}
@@ -1510,14 +1503,13 @@ Purge delete marked records.
dberr_t
IndexPurge::garbage_collect() UNIV_NOTHROW
{
- dberr_t err;
ibool comp = dict_table_is_comp(m_index->table);
/* Open the persistent cursor and start the mini-transaction. */
- open();
+ dberr_t err = open() ? next() : DB_CORRUPTION;
- while ((err = next()) == DB_SUCCESS) {
+ for (; err == DB_SUCCESS; err = next()) {
rec_t* rec = btr_pcur_get_rec(&m_pcur);
ibool deleted = rec_get_deleted_flag(rec, comp);
@@ -1525,7 +1517,10 @@ IndexPurge::garbage_collect() UNIV_NOTHROW
if (!deleted) {
++m_n_rows;
} else {
- purge();
+ err = purge();
+ if (err != DB_SUCCESS) {
+ break;
+ }
}
}
@@ -1538,40 +1533,33 @@ IndexPurge::garbage_collect() UNIV_NOTHROW
/**
Begin import, position the cursor on the first record. */
-void
-IndexPurge::open() UNIV_NOTHROW
+inline bool IndexPurge::open() noexcept
{
- mtr_start(&m_mtr);
+ m_mtr.start();
+ m_mtr.set_log_mode(MTR_LOG_NO_REDO);
- mtr_set_log_mode(&m_mtr, MTR_LOG_NO_REDO);
+ btr_pcur_init(&m_pcur);
- btr_pcur_open_at_index_side(
- true, m_index, BTR_MODIFY_LEAF, &m_pcur, true, 0, &m_mtr);
- btr_pcur_move_to_next_user_rec(&m_pcur, &m_mtr);
- if (rec_is_metadata(btr_pcur_get_rec(&m_pcur), *m_index)) {
- ut_ad(btr_pcur_is_on_user_rec(&m_pcur));
- /* Skip the metadata pseudo-record. */
- } else {
- btr_pcur_move_to_prev_on_page(&m_pcur);
- }
-}
+ if (m_pcur.open_leaf(true, m_index, BTR_MODIFY_LEAF, &m_mtr) != DB_SUCCESS)
+ return false;
-/**
-Close the persistent curosr and commit the mini-transaction. */
-void
-IndexPurge::close() UNIV_NOTHROW
-{
- btr_pcur_close(&m_pcur);
- mtr_commit(&m_mtr);
+ rec_t *rec= page_rec_get_next(btr_pcur_get_rec(&m_pcur));
+ if (!rec)
+ return false;
+ if (rec_is_metadata(rec, *m_index))
+ /* Skip the metadata pseudo-record. */
+ btr_pcur_get_page_cur(&m_pcur)->rec= rec;
+ return true;
}
/**
Position the cursor on the next record.
@return DB_SUCCESS or error code */
-dberr_t
-IndexPurge::next() UNIV_NOTHROW
+dberr_t IndexPurge::next() noexcept
{
- btr_pcur_move_to_next_on_page(&m_pcur);
+ if (UNIV_UNLIKELY(!btr_pcur_move_to_next_on_page(&m_pcur))) {
+ return DB_CORRUPTION;
+ }
/* When switching pages, commit the mini-transaction
in order to release the latch on the old page. */
@@ -1592,9 +1580,12 @@ IndexPurge::next() UNIV_NOTHROW
mtr_set_log_mode(&m_mtr, MTR_LOG_NO_REDO);
- btr_pcur_restore_position(BTR_MODIFY_LEAF, &m_pcur, &m_mtr);
+ if (m_pcur.restore_position(BTR_MODIFY_LEAF, &m_mtr)
+ == btr_pcur_t::CORRUPTED) {
+ return DB_CORRUPTION;
+ }
/* The following is based on btr_pcur_move_to_next_user_rec(). */
- m_pcur.old_stored = false;
+ m_pcur.old_rec = nullptr;
ut_ad(m_pcur.latch_mode == BTR_MODIFY_LEAF);
do {
if (btr_pcur_is_after_last_on_page(&m_pcur)) {
@@ -1602,56 +1593,12 @@ IndexPurge::next() UNIV_NOTHROW
return DB_END_OF_INDEX;
}
- buf_block_t* block = btr_pcur_get_block(&m_pcur);
- uint32_t next_page = btr_page_get_next(block->frame);
-
- /* MDEV-13542 FIXME: Make these checks part of
- btr_pcur_move_to_next_page(), and introduce a
- return status that will be checked in all callers! */
- switch (next_page) {
- default:
- if (next_page != block->page.id().page_no()) {
- break;
- }
- /* MDEV-20931 FIXME: Check that
- next_page is within the tablespace
- bounds! Also check that it is not a
- change buffer bitmap page. */
- /* fall through */
- case 0:
- case 1:
- case FIL_NULL:
- return DB_CORRUPTION;
+ if (dberr_t err = btr_pcur_move_to_next_page(&m_pcur,
+ &m_mtr)) {
+ return err;
}
-
- dict_index_t* index = m_pcur.btr_cur.index;
- buf_block_t* next_block = btr_block_get(
- *index, next_page, BTR_MODIFY_LEAF, false,
- &m_mtr);
-
- if (UNIV_UNLIKELY(!next_block
- || !fil_page_index_page_check(
- next_block->frame)
- || !!dict_index_is_spatial(index)
- != (fil_page_get_type(
- next_block->frame)
- == FIL_PAGE_RTREE)
- || page_is_comp(next_block->frame)
- != page_is_comp(block->frame)
- || btr_page_get_prev(
- next_block->frame)
- != block->page.id().page_no())) {
- return DB_CORRUPTION;
- }
-
- btr_leaf_page_release(block, BTR_MODIFY_LEAF, &m_mtr);
-
- page_cur_set_before_first(next_block,
- &m_pcur.btr_cur.page_cur);
-
- ut_d(page_check_dir(next_block->frame));
- } else {
- btr_pcur_move_to_next_on_page(&m_pcur);
+ } else if (!btr_pcur_move_to_next_on_page(&m_pcur)) {
+ return DB_CORRUPTION;
}
} while (!btr_pcur_is_on_user_rec(&m_pcur));
@@ -1662,41 +1609,38 @@ IndexPurge::next() UNIV_NOTHROW
Store the persistent cursor position and reopen the
B-tree cursor in BTR_MODIFY_TREE mode, because the
tree structure may be changed during a pessimistic delete. */
-void
-IndexPurge::purge_pessimistic_delete() UNIV_NOTHROW
+inline dberr_t IndexPurge::purge_pessimistic_delete() noexcept
{
- dberr_t err;
-
- btr_pcur_restore_position(BTR_MODIFY_TREE | BTR_LATCH_FOR_DELETE,
- &m_pcur, &m_mtr);
-
- ut_ad(rec_get_deleted_flag(
- btr_pcur_get_rec(&m_pcur),
- dict_table_is_comp(m_index->table)));
-
- btr_cur_pessimistic_delete(
- &err, FALSE, btr_pcur_get_btr_cur(&m_pcur), 0, false, &m_mtr);
-
- ut_a(err == DB_SUCCESS);
+ dberr_t err;
+ if (m_pcur.restore_position(BTR_PURGE_TREE, &m_mtr) != btr_pcur_t::CORRUPTED)
+ {
+ ut_ad(rec_get_deleted_flag(btr_pcur_get_rec(&m_pcur),
+ m_index->table->not_redundant()));
+ btr_cur_pessimistic_delete(&err, FALSE, btr_pcur_get_btr_cur(&m_pcur), 0,
+ false, &m_mtr);
+ }
+ else
+ err= DB_CORRUPTION;
- /* Reopen the B-tree cursor in BTR_MODIFY_LEAF mode */
- mtr_commit(&m_mtr);
+ m_mtr.commit();
+ return err;
}
-/**
-Purge delete-marked records. */
-void
-IndexPurge::purge() UNIV_NOTHROW
+dberr_t IndexPurge::purge() noexcept
{
- btr_pcur_store_position(&m_pcur, &m_mtr);
-
- purge_pessimistic_delete();
-
- mtr_start(&m_mtr);
-
- mtr_set_log_mode(&m_mtr, MTR_LOG_NO_REDO);
-
- btr_pcur_restore_position(BTR_MODIFY_LEAF, &m_pcur, &m_mtr);
+ btr_pcur_store_position(&m_pcur, &m_mtr);
+ m_mtr.commit();
+ m_mtr.start();
+ m_mtr.set_log_mode(MTR_LOG_NO_REDO);
+ dberr_t err= purge_pessimistic_delete();
+
+ m_mtr.start();
+ m_mtr.set_log_mode(MTR_LOG_NO_REDO);
+ if (err == DB_SUCCESS)
+ err= (m_pcur.restore_position(BTR_MODIFY_LEAF, &m_mtr) ==
+ btr_pcur_t::CORRUPTED)
+ ? DB_CORRUPTION : DB_SUCCESS;
+ return err;
}
/** Adjust the BLOB reference for a single column that is externally stored
@@ -1806,10 +1750,8 @@ re-organising the B+tree.
@return true if purge succeeded */
inline bool PageConverter::purge() UNIV_NOTHROW
{
- const dict_index_t* index = m_index->m_srv_index;
-
/* We can't have a page that is empty and not root. */
- if (m_rec_iter.remove(index, m_offsets)) {
+ if (m_rec_iter.remove(m_offsets)) {
++m_index->m_stats.m_n_purged;
@@ -1873,7 +1815,9 @@ PageConverter::update_records(
/* This will also position the cursor on the first user record. */
- m_rec_iter.open(block);
+ if (!m_rec_iter.open(block, m_index->m_srv_index)) {
+ return DB_CORRUPTION;
+ }
while (!m_rec_iter.end()) {
rec_t* rec = m_rec_iter.current();
@@ -1904,17 +1848,19 @@ PageConverter::update_records(
optimistic delete. */
if (deleted) {
+ ++m_index->m_stats.m_n_deleted;
/* A successful purge will move the cursor to the
next record. */
- if (!purge()) {
- m_rec_iter.next();
+ if (purge()) {
+ continue;
}
-
- ++m_index->m_stats.m_n_deleted;
} else {
++m_index->m_stats.m_n_rows;
- m_rec_iter.next();
+ }
+
+ if (!m_rec_iter.next()) {
+ return DB_CORRUPTION;
}
}
@@ -1934,7 +1880,7 @@ PageConverter::update_index_page(
return(DB_SUCCESS);
}
- buf_frame_t* page = block->frame;
+ buf_frame_t* page = block->page.frame;
const index_id_t id = btr_page_get_index_id(page);
if (id != m_index->m_id) {
@@ -1985,7 +1931,7 @@ PageConverter::update_index_page(
m_index->m_srv_index->id);
if (UNIV_LIKELY_NULL(block->page.zip.data)) {
memcpy(&block->page.zip.data[PAGE_HEADER + PAGE_INDEX_ID],
- &block->frame[PAGE_HEADER + PAGE_INDEX_ID], 8);
+ &block->page.frame[PAGE_HEADER + PAGE_INDEX_ID], 8);
}
if (m_index->m_srv_index->is_clust()) {
@@ -1994,12 +1940,12 @@ PageConverter::update_index_page(
}
} else if (page_is_leaf(page)) {
/* Set PAGE_MAX_TRX_ID on secondary index leaf pages. */
- mach_write_to_8(&block->frame[PAGE_HEADER + PAGE_MAX_TRX_ID],
- m_trx->id);
+ mach_write_to_8(&block->page.frame
+ [PAGE_HEADER + PAGE_MAX_TRX_ID], m_trx->id);
if (UNIV_LIKELY_NULL(block->page.zip.data)) {
memcpy_aligned<8>(&block->page.zip.data
[PAGE_HEADER + PAGE_MAX_TRX_ID],
- &block->frame
+ &block->page.frame
[PAGE_HEADER + PAGE_MAX_TRX_ID], 8);
}
} else {
@@ -2009,7 +1955,8 @@ clear_page_max_trx_id:
in MySQL 5.6, 5.7 and MariaDB 10.0 and 10.1
would set the field to the transaction ID even
on clustered index pages. */
- memset_aligned<8>(&block->frame[PAGE_HEADER + PAGE_MAX_TRX_ID],
+ memset_aligned<8>(&block->page.frame
+ [PAGE_HEADER + PAGE_MAX_TRX_ID],
0, 8);
if (UNIV_LIKELY_NULL(block->page.zip.data)) {
memset_aligned<8>(&block->page.zip.data
@@ -2031,7 +1978,9 @@ clear_page_max_trx_id:
return(DB_SUCCESS);
}
- return page_is_leaf(block->frame) ? update_records(block) : DB_SUCCESS;
+ return page_is_leaf(block->page.frame)
+ ? update_records(block)
+ : DB_SUCCESS;
}
/** Validate the space flags and update tablespace header page.
@@ -2078,8 +2027,8 @@ PageConverter::update_page(buf_block_t* block, uint16_t& page_type)
case FIL_PAGE_INDEX:
case FIL_PAGE_RTREE:
- /* We need to decompress the contents into block->frame
- before we can do any thing with Btree pages. */
+ /* We need to decompress the contents
+ before we can do anything. */
if (is_compressed_table() && !buf_zip_decompress(block, TRUE)) {
return(DB_CORRUPTION);
@@ -2135,9 +2084,9 @@ dberr_t PageConverter::operator()(buf_block_t* block) UNIV_NOTHROW
/* If we already had an old page with matching number
in the buffer pool, evict it now, because
we no longer evict the pages on DISCARD TABLESPACE. */
- buf_page_get_gen(block->page.id(), get_zip_size(),
- RW_NO_LATCH, NULL, BUF_EVICT_IF_IN_POOL,
- __FILE__, __LINE__, NULL, NULL);
+ buf_page_get_low(block->page.id(), get_zip_size(), RW_NO_LATCH,
+ nullptr, BUF_PEEK_IF_IN_POOL,
+ nullptr, nullptr, false);
uint16_t page_type;
@@ -2151,7 +2100,7 @@ dberr_t PageConverter::operator()(buf_block_t* block) UNIV_NOTHROW
if (!block->page.zip.data) {
buf_flush_init_for_writing(
- NULL, block->frame, NULL, full_crc32);
+ NULL, block->page.frame, NULL, full_crc32);
} else if (fil_page_type_is_index(page_type)) {
buf_flush_init_for_writing(
NULL, block->page.zip.data, &block->page.zip,
@@ -2173,11 +2122,8 @@ dberr_t
row_import_cleanup(
/*===============*/
row_prebuilt_t* prebuilt, /*!< in/out: prebuilt from handler */
- trx_t* trx, /*!< in/out: transaction for import */
dberr_t err) /*!< in: error code */
{
- ut_a(prebuilt->trx != trx);
-
if (err != DB_SUCCESS) {
dict_table_t* table = prebuilt->table;
table->file_unreadable = true;
@@ -2191,10 +2137,6 @@ row_import_cleanup(
ib::info() << "Discarding tablespace of table "
<< table->name << ": " << err;
- if (!trx->dict_operation_lock_mode) {
- row_mysql_lock_data_dictionary(trx);
- }
-
for (dict_index_t* index = UT_LIST_GET_FIRST(table->indexes);
index;
index = UT_LIST_GET_NEXT(indexes, index)) {
@@ -2202,15 +2144,13 @@ row_import_cleanup(
}
}
- ut_a(trx->dict_operation_lock_mode == RW_X_LATCH);
-
DBUG_EXECUTE_IF("ib_import_before_commit_crash", DBUG_SUICIDE(););
- trx_commit_for_mysql(trx);
-
- row_mysql_unlock_data_dictionary(trx);
+ prebuilt->trx->commit();
- trx->free();
+ if (prebuilt->trx->dict_operation_lock_mode) {
+ row_mysql_unlock_data_dictionary(prebuilt->trx);
+ }
prebuilt->trx->op_info = "";
@@ -2226,10 +2166,9 @@ dberr_t
row_import_error(
/*=============*/
row_prebuilt_t* prebuilt, /*!< in/out: prebuilt from handler */
- trx_t* trx, /*!< in/out: transaction for import */
dberr_t err) /*!< in: error code */
{
- if (!trx_is_interrupted(trx)) {
+ if (!trx_is_interrupted(prebuilt->trx)) {
char table_name[MAX_FULL_NAME_LEN + 1];
innobase_format_name(
@@ -2237,12 +2176,12 @@ row_import_error(
prebuilt->table->name.m_name);
ib_senderrf(
- trx->mysql_thd, IB_LOG_LEVEL_WARN,
+ prebuilt->trx->mysql_thd, IB_LOG_LEVEL_WARN,
ER_INNODB_IMPORT_ERROR,
table_name, (ulong) err, ut_strerr(err));
}
- return(row_import_cleanup(prebuilt, trx, err));
+ return row_import_cleanup(prebuilt, err);
}
/*****************************************************************//**
@@ -2376,43 +2315,28 @@ row_import_set_sys_max_row_id(
mtr_set_log_mode(&mtr, MTR_LOG_NO_REDO);
- btr_pcur_open_at_index_side(
- false, // High end
- index,
- BTR_SEARCH_LEAF,
- &pcur,
- true, // Init cursor
- 0, // Leaf level
- &mtr);
-
- btr_pcur_move_to_prev_on_page(&pcur);
- rec = btr_pcur_get_rec(&pcur);
-
- /* Check for empty table. */
- if (page_rec_is_infimum(rec)) {
- /* The table is empty. */
- } else if (rec_is_metadata(rec, *index)) {
- /* The clustered index contains the metadata record only,
- that is, the table is empty. */
- } else {
- row_id = mach_read_from_6(rec);
+ if (pcur.open_leaf(false, index, BTR_SEARCH_LEAF, &mtr)
+ == DB_SUCCESS) {
+ rec = btr_pcur_move_to_prev_on_page(&pcur);
+
+ if (!rec) {
+ /* The table is corrupted. */
+ } else if (page_rec_is_infimum(rec)) {
+ /* The table is empty. */
+ } else if (rec_is_metadata(rec, *index)) {
+ /* The clustered index contains the metadata
+ record only, that is, the table is empty. */
+ } else {
+ row_id = mach_read_from_6(rec);
+ }
}
- btr_pcur_close(&pcur);
mtr_commit(&mtr);
if (row_id) {
/* Update the system row id if the imported index row id is
greater than the max system row id. */
-
- mutex_enter(&dict_sys.mutex);
-
- if (row_id >= dict_sys.row_id) {
- dict_sys.row_id = row_id + 1;
- dict_hdr_flush_row_id();
- }
-
- mutex_exit(&dict_sys.mutex);
+ dict_sys.update_row_id(row_id);
}
}
@@ -3161,18 +3085,16 @@ and apply it to dict_table_t
static dberr_t handle_instant_metadata(dict_table_t *table,
const row_import &cfg)
{
- dict_get_and_save_data_dir_path(table, false);
+ dict_get_and_save_data_dir_path(table);
char *filepath;
if (DICT_TF_HAS_DATA_DIR(table->flags))
{
ut_a(table->data_dir_path);
-
- filepath=
- fil_make_filepath(table->data_dir_path, table->name.m_name, IBD, true);
+ filepath= fil_make_filepath(table->data_dir_path, table->name, IBD, true);
}
else
- filepath= fil_make_filepath(nullptr, table->name.m_name, IBD, false);
+ filepath= fil_make_filepath(nullptr, table->name, IBD, false);
if (!filepath)
return DB_OUT_OF_MEMORY;
@@ -3195,9 +3117,8 @@ static dberr_t handle_instant_metadata(dict_table_t *table,
static_cast<byte *>(aligned_malloc(srv_page_size, srv_page_size)),
&aligned_free);
- if (dberr_t err= os_file_read_no_error_handling(IORequestReadPartial,
- file, first_page.get(), 0,
- srv_page_size, nullptr))
+ if (dberr_t err= os_file_read(IORequestReadPartial, file, first_page.get(),
+ 0, srv_page_size, nullptr))
return err;
auto space_flags= fsp_header_get_flags(first_page.get());
@@ -3232,7 +3153,7 @@ static dberr_t handle_instant_metadata(dict_table_t *table,
aligned_malloc(UNIV_PAGE_SIZE_MAX, UNIV_PAGE_SIZE_MAX)),
&aligned_free);
- if (dberr_t err= os_file_read_no_error_handling(
+ if (dberr_t err= os_file_read(
IORequestReadPartial, file, page.get(), 3 * physical_size,
physical_size, nullptr))
return err;
@@ -3249,14 +3170,6 @@ static dberr_t handle_instant_metadata(dict_table_t *table,
{
dict_index_t *index= dict_table_get_first_index(table);
- auto tmp1= table->space_id;
- table->space_id= page_get_space_id(page.get());
- SCOPE_EXIT([tmp1, table]() { table->space_id= tmp1; });
-
- auto tmp2= index->page;
- index->page= page_get_page_no(page.get());
- SCOPE_EXIT([tmp2, index]() { index->page= tmp2; });
-
if (!page_is_comp(page.get()) != !dict_table_is_comp(table))
{
ib_errf(current_thd, IB_LOG_LEVEL_ERROR, ER_TABLE_SCHEMA_MISMATCH,
@@ -3265,7 +3178,7 @@ static dberr_t handle_instant_metadata(dict_table_t *table,
}
if (btr_cur_instant_root_init(index, page.get()))
- return DB_ERROR;
+ return DB_CORRUPTION;
ut_ad(index->n_core_null_bytes != dict_index_t::NO_CORE_NULL_BYTES);
@@ -3284,6 +3197,8 @@ static dberr_t handle_instant_metadata(dict_table_t *table,
while (btr_page_get_level(page.get()) != 0)
{
const rec_t *rec= page_rec_get_next(page_get_infimum_rec(page.get()));
+ if (!rec)
+ return DB_CORRUPTION;
/* Relax the assertion in rec_init_offsets(). */
ut_ad(!index->in_instant_init);
@@ -3295,10 +3210,8 @@ static dberr_t handle_instant_metadata(dict_table_t *table,
uint64_t child_page_no= btr_node_ptr_get_child_page_no(rec, offsets);
if (dberr_t err=
- os_file_read_no_error_handling(IORequestReadPartial, file,
- page.get(),
- child_page_no * physical_size,
- physical_size, nullptr))
+ os_file_read(IORequestReadPartial, file, page.get(),
+ child_page_no * physical_size, physical_size, nullptr))
return err;
if (dberr_t err= decrypt_decompress(space_crypt, space_flags,
@@ -3308,18 +3221,22 @@ static dberr_t handle_instant_metadata(dict_table_t *table,
return err;
}
- const auto *rec= page_rec_get_next(page_get_infimum_rec(page.get()));
+ const auto *rec= page_rec_get_next_const(page_get_infimum_rec(page.get()));
const auto comp= dict_table_is_comp(index->table);
- const auto info_bits= rec_get_info_bits(rec, comp);
- if (page_rec_is_supremum(rec) || !(info_bits & REC_INFO_MIN_REC_FLAG))
+ if (!rec || page_rec_is_supremum(rec))
{
+ corrupted_metadata:
ib::error() << "Table " << index->table->name
<< " is missing instant ALTER metadata";
index->table->corrupted= true;
return DB_CORRUPTION;
}
+ const auto info_bits= rec_get_info_bits(rec, comp);
+ if (!(info_bits & REC_INFO_MIN_REC_FLAG))
+ goto corrupted_metadata;
+
if ((info_bits & ~REC_INFO_DELETED_FLAG) != REC_INFO_MIN_REC_FLAG ||
(comp && rec_get_status(rec) != REC_STATUS_INSTANT))
{
@@ -3373,11 +3290,10 @@ static dberr_t handle_instant_metadata(dict_table_t *table,
&aligned_free);
if (dberr_t err=
- os_file_read_no_error_handling(IORequestReadPartial, file,
- second_page.get(), physical_size *
- mach_read_from_4(ptr +
- BTR_EXTERN_PAGE_NO),
- srv_page_size, nullptr))
+ os_file_read(IORequestReadPartial, file, second_page.get(),
+ physical_size *
+ mach_read_from_4(ptr + BTR_EXTERN_PAGE_NO),
+ physical_size, nullptr))
return err;
if (dberr_t err= decrypt_decompress(space_crypt, space_flags,
@@ -3585,8 +3501,6 @@ row_import_update_index_root(trx_t* trx, dict_table_t* table, bool reset)
que_thr_t* thr;
- graph->fork_type = QUE_FORK_MYSQL_INTERFACE;
-
ut_a(thr = que_fork_start_command(graph));
que_run_threads(thr);
@@ -3703,7 +3617,7 @@ dberr_t row_import_update_discarded_flag(trx_t* trx, table_id_t table_id,
pars_info_bind_function(
info, "my_func", row_import_set_discarded, &discard);
- dberr_t err = que_eval_sql(info, sql, false, trx);
+ dberr_t err = que_eval_sql(info, sql, trx);
ut_a(discard.n_recs == 1);
ut_a(discard.flags2 != ULINT32_UNDEFINED);
@@ -3784,15 +3698,15 @@ dberr_t FetchIndexRootPages::run(const fil_iterator_t& iter,
const bool encrypted= iter.crypt_data != NULL &&
iter.crypt_data->should_encrypt();
byte* const readptr= iter.io_buffer;
- block->frame= readptr;
+ block->page.frame= readptr;
if (block->page.zip.data)
block->page.zip.data= readptr;
bool page_compressed= false;
- dberr_t err= os_file_read_no_error_handling(
- IORequestReadPartial, iter.file, readptr, 3 * size, size, 0);
+ dberr_t err= os_file_read(IORequestReadPartial, iter.file, readptr,
+ 3 * size, size, nullptr);
if (err != DB_SUCCESS)
{
ib::error() << iter.filepath << ": os_file_read() failed";
@@ -3884,7 +3798,7 @@ static dberr_t fil_iterate(
required by buf_zip_decompress() */
dberr_t err = DB_SUCCESS;
bool page_compressed = false;
- bool punch_hole = true;
+ bool punch_hole = !my_test_if_thinly_provisioned(iter.file);
for (offset = iter.start; offset < iter.end; offset += n_bytes) {
if (callback.is_interrupted()) {
@@ -3893,7 +3807,7 @@ static dberr_t fil_iterate(
}
byte* io_buffer = iter.io_buffer;
- block->frame = io_buffer;
+ block->page.frame = io_buffer;
if (block->page.zip.data) {
/* Zip IO is done in the compressed page buffer. */
@@ -3916,9 +3830,8 @@ static dberr_t fil_iterate(
? iter.crypt_io_buffer : io_buffer;
byte* const writeptr = readptr;
- err = os_file_read_no_error_handling(
- IORequestReadPartial,
- iter.file, readptr, offset, n_bytes, 0);
+ err = os_file_read(IORequestReadPartial, iter.file, readptr,
+ offset, n_bytes, nullptr);
if (err != DB_SUCCESS) {
ib::error() << iter.filepath
<< ": os_file_read() failed";
@@ -3933,7 +3846,7 @@ static dberr_t fil_iterate(
for (ulint i = 0; i < n_pages_read;
++block->page.id_,
- ++i, page_off += size, block->frame += size) {
+ ++i, page_off += size, block->page.frame += size) {
byte* src = readptr + i * size;
const ulint page_no = page_get_page_no(src);
if (!page_no && block->page.id().page_no()) {
@@ -3990,7 +3903,7 @@ page_corrupted:
} else if (!page_compressed
&& type != FIL_PAGE_TYPE_XDES
&& !block->page.zip.data) {
- block->frame = src;
+ block->page.frame = src;
frame_changed = true;
} else {
ut_ad(dst != src);
@@ -4042,8 +3955,7 @@ page_corrupted:
if ((err = callback(block)) != DB_SUCCESS) {
goto func_exit;
} else if (!updated) {
- updated = block->page.state()
- == BUF_BLOCK_FILE_PAGE;
+ updated = !!block->page.frame;
}
/* If tablespace is encrypted we use additional
@@ -4051,10 +3963,10 @@ page_corrupted:
for decrypting readptr == crypt_io_buffer != io_buffer.
Destination for decryption is a buffer pool block
- block->frame == dst == io_buffer that is updated.
+ block->page.frame == dst == io_buffer that is updated.
Pages that did not require decryption even when
tablespace is marked as encrypted are not copied
- instead block->frame is set to src == readptr.
+ instead block->page.frame is set to src == readptr.
For encryption we again use temporary scratch area
writeptr != io_buffer == dst
@@ -4087,7 +3999,7 @@ page_corrupted:
if (block->page.zip.data) {
block->page.zip.data = dst;
} else {
- block->frame = dst;
+ block->page.frame = dst;
}
}
@@ -4203,18 +4115,17 @@ fil_tablespace_iterate(
return(DB_CORRUPTION););
/* Make sure the data_dir_path is set. */
- dict_get_and_save_data_dir_path(table, false);
+ dict_get_and_save_data_dir_path(table);
- if (DICT_TF_HAS_DATA_DIR(table->flags)) {
- ut_a(table->data_dir_path);
+ ut_ad(!DICT_TF_HAS_DATA_DIR(table->flags) || table->data_dir_path);
- filepath = fil_make_filepath(
- table->data_dir_path, table->name.m_name, IBD, true);
- } else {
- filepath = fil_make_filepath(
- NULL, table->name.m_name, IBD, false);
- }
+ const char *data_dir_path = DICT_TF_HAS_DATA_DIR(table->flags)
+ ? table->data_dir_path : nullptr;
+ filepath = fil_make_filepath(data_dir_path,
+ {table->name.m_name,
+ strlen(table->name.m_name)},
+ IBD, data_dir_path != nullptr);
if (!filepath) {
return(DB_OUT_OF_MEMORY);
} else {
@@ -4251,13 +4162,13 @@ fil_tablespace_iterate(
buf_block_t* block = reinterpret_cast<buf_block_t*>
(ut_zalloc_nokey(sizeof *block));
- block->frame = page;
- block->page.init(BUF_BLOCK_FILE_PAGE, page_id_t(~0ULL), 1);
+ block->page.frame = page;
+ block->page.init(buf_page_t::UNFIXED + 1, page_id_t{~0ULL});
- /* Read the first page and determine the page and zip size. */
+ /* Read the first page and determine the page size. */
- err = os_file_read_no_error_handling(IORequestReadPartial,
- file, page, 0, srv_page_size, 0);
+ err = os_file_read(IORequestReadPartial, file, page, 0, srv_page_size,
+ nullptr);
if (err == DB_SUCCESS) {
err = callback.init(file_size, block);
@@ -4306,8 +4217,9 @@ fil_tablespace_iterate(
if (block->page.zip.ssize) {
ut_ad(iter.n_io_buffers == 1);
- block->frame = iter.io_buffer;
- block->page.zip.data = block->frame + srv_page_size;
+ block->page.frame = iter.io_buffer;
+ block->page.zip.data = block->page.frame
+ + srv_page_size;
}
err = callback.run(iter, block);
@@ -4351,9 +4263,9 @@ row_import_for_mysql(
row_prebuilt_t* prebuilt) /*!< in: prebuilt struct in MySQL */
{
dberr_t err;
- trx_t* trx;
ib_uint64_t autoinc = 0;
char* filepath = NULL;
+ trx_t* trx = prebuilt->trx;
/* The caller assured that this is not read_only_mode and that no
temorary tablespace is being imported. */
@@ -4362,28 +4274,12 @@ row_import_for_mysql(
ut_ad(table->space_id);
ut_ad(table->space_id < SRV_SPACE_ID_UPPER_BOUND);
- ut_ad(prebuilt->trx);
+ ut_ad(trx);
+ ut_ad(trx->state == TRX_STATE_ACTIVE);
ut_ad(!table->is_readable());
ibuf_delete_for_discarded_space(table->space_id);
- trx_start_if_not_started(prebuilt->trx, true);
-
- trx = trx_create();
-
- /* So that the table is not DROPped during recovery. */
- trx_set_dict_operation(trx, TRX_DICT_OP_INDEX);
-
- trx_start_if_not_started(trx, true);
-
- /* So that we can send error messages to the user. */
- trx->mysql_thd = prebuilt->trx->mysql_thd;
-
- /* Ensure that the table will be dropped by trx_rollback_active()
- in case of a crash. */
-
- trx->table_id = table->id;
-
/* Assign an undo segment for the transaction, so that the
transaction will be recovered after a crash. */
@@ -4398,25 +4294,19 @@ row_import_for_mysql(
DBUG_EXECUTE_IF("ib_import_undo_assign_failure",
err = DB_TOO_MANY_CONCURRENT_TRXS;);
- if (err != DB_SUCCESS) {
-
- return(row_import_cleanup(prebuilt, trx, err));
-
- } else if (trx->rsegs.m_redo.undo == 0) {
-
+ if (err == DB_SUCCESS && !trx->has_logged_persistent()) {
err = DB_TOO_MANY_CONCURRENT_TRXS;
- return(row_import_cleanup(prebuilt, trx, err));
+ }
+ if (err != DB_SUCCESS) {
+ return row_import_cleanup(prebuilt, err);
}
- prebuilt->trx->op_info = "read meta-data file";
-
- /* Prevent DDL operations while we are checking. */
-
- rw_lock_s_lock(&dict_sys.latch);
+ trx->op_info = "read meta-data file";
row_import cfg;
+ THD* thd = trx->mysql_thd;
- err = row_import_read_cfg(table, trx->mysql_thd, cfg);
+ err = row_import_read_cfg(table, thd, cfg);
/* Check if the table column definitions match the contents
of the config file. */
@@ -4424,14 +4314,13 @@ row_import_for_mysql(
if (err == DB_SUCCESS) {
if (dberr_t err = handle_instant_metadata(table, cfg)) {
- rw_lock_s_unlock(&dict_sys.latch);
- return row_import_error(prebuilt, trx, err);
+ return row_import_error(prebuilt, err);
}
/* We have a schema file, try and match it with our
data dictionary. */
- err = cfg.match_schema(trx->mysql_thd);
+ err = cfg.match_schema(thd);
/* Update index->page and SYS_INDEXES.PAGE_NO to match the
B-tree root page numbers in the tablespace. Use the index
@@ -4442,15 +4331,10 @@ row_import_for_mysql(
autoinc = cfg.m_autoinc;
}
- rw_lock_s_unlock(&dict_sys.latch);
-
DBUG_EXECUTE_IF("ib_import_set_index_root_failure",
err = DB_TOO_MANY_CONCURRENT_TRXS;);
} else if (cfg.m_missing) {
-
- rw_lock_s_unlock(&dict_sys.latch);
-
/* We don't have a schema file, we will have to discover
the index root pages from the .ibd file and skip the schema
matching step. */
@@ -4460,13 +4344,13 @@ row_import_for_mysql(
cfg.m_zip_size = 0;
if (UT_LIST_GET_LEN(table->indexes) > 1) {
- ib_errf(trx->mysql_thd, IB_LOG_LEVEL_ERROR,
+ ib_errf(thd, IB_LOG_LEVEL_ERROR,
ER_INTERNAL_ERROR,
"Drop all secondary indexes before importing "
"table %s when .cfg file is missing.",
table->name.m_name);
err = DB_ERROR;
- return row_import_error(prebuilt, trx, err);
+ return row_import_error(prebuilt, err);
}
FetchIndexRootPages fetchIndexRootPages(table, trx);
@@ -4487,24 +4371,18 @@ row_import_for_mysql(
err = cfg.set_root_by_heuristic();
if (err == DB_SUCCESS) {
- if (dberr_t err =
- handle_instant_metadata(table,
- cfg)) {
- return row_import_error(
- prebuilt, trx, err);
- }
+ err = handle_instant_metadata(table,
+ cfg);
}
}
}
- } else {
- rw_lock_s_unlock(&dict_sys.latch);
}
if (err != DB_SUCCESS) {
- return(row_import_error(prebuilt, trx, err));
+ return row_import_error(prebuilt, err);
}
- prebuilt->trx->op_info = "importing tablespace";
+ trx->op_info = "importing tablespace";
ib::info() << "Phase I - Update all pages";
@@ -4546,31 +4424,28 @@ row_import_for_mysql(
if (err != DB_DECRYPTION_FAILED) {
- ib_errf(trx->mysql_thd, IB_LOG_LEVEL_ERROR,
+ ib_errf(thd, IB_LOG_LEVEL_ERROR,
ER_INTERNAL_ERROR,
"Error importing tablespace for table %s : %s",
table_name, ut_strerr(err));
}
- return(row_import_cleanup(prebuilt, trx, err));
+ return row_import_cleanup(prebuilt, err);
}
- row_mysql_lock_data_dictionary(trx);
-
/* If the table is stored in a remote tablespace, we need to
determine that filepath from the link file and system tables.
Find the space ID in SYS_TABLES since this is an ALTER TABLE. */
- dict_get_and_save_data_dir_path(table, true);
+ dict_get_and_save_data_dir_path(table);
- if (DICT_TF_HAS_DATA_DIR(table->flags)) {
- ut_a(table->data_dir_path);
+ ut_ad(!DICT_TF_HAS_DATA_DIR(table->flags) || table->data_dir_path);
+ const char *data_dir_path = DICT_TF_HAS_DATA_DIR(table->flags)
+ ? table->data_dir_path : nullptr;
+ fil_space_t::name_type name{
+ table->name.m_name, strlen(table->name.m_name)};
- filepath = fil_make_filepath(
- table->data_dir_path, table->name.m_name, IBD, true);
- } else {
- filepath = fil_make_filepath(
- NULL, table->name.m_name, IBD, false);
- }
+ filepath = fil_make_filepath(data_dir_path, name, IBD,
+ data_dir_path != nullptr);
DBUG_EXECUTE_IF(
"ib_import_OOM_15",
@@ -4579,13 +4454,10 @@ row_import_for_mysql(
);
if (filepath == NULL) {
- row_mysql_unlock_data_dictionary(trx);
- return(row_import_cleanup(prebuilt, trx, DB_OUT_OF_MEMORY));
+ return row_import_cleanup(prebuilt, DB_OUT_OF_MEMORY);
}
/* Open the tablespace so that we can access via the buffer pool.
- We set the 2nd param (fix_dict = true) here because we already
- have an x-lock on dict_sys.latch and dict_sys.mutex.
The tablespace is initially opened as a temporary one, because
we will not be writing any redo log for it before we have invoked
fil_space_t::set_imported() to declare it a persistent tablespace. */
@@ -4593,35 +4465,29 @@ row_import_for_mysql(
ulint fsp_flags = dict_tf_to_fsp_flags(table->flags);
table->space = fil_ibd_open(
- true, true, FIL_TYPE_IMPORT, table->space_id,
- fsp_flags, table->name, filepath, &err);
+ 2, FIL_TYPE_IMPORT, table->space_id,
+ fsp_flags, name, filepath, &err);
ut_ad((table->space == NULL) == (err != DB_SUCCESS));
DBUG_EXECUTE_IF("ib_import_open_tablespace_failure",
err = DB_TABLESPACE_NOT_FOUND; table->space = NULL;);
if (!table->space) {
- row_mysql_unlock_data_dictionary(trx);
-
- ib_senderrf(trx->mysql_thd, IB_LOG_LEVEL_ERROR,
+ ib_senderrf(thd, IB_LOG_LEVEL_ERROR,
ER_GET_ERRMSG,
err, ut_strerr(err), filepath);
-
- ut_free(filepath);
-
- return(row_import_cleanup(prebuilt, trx, err));
}
- row_mysql_unlock_data_dictionary(trx);
-
ut_free(filepath);
- err = ibuf_check_bitmap_on_import(trx, table->space);
+ if (err == DB_SUCCESS) {
+ err = ibuf_check_bitmap_on_import(trx, table->space);
+ }
DBUG_EXECUTE_IF("ib_import_check_bitmap_failure", err = DB_CORRUPTION;);
if (err != DB_SUCCESS) {
- return(row_import_cleanup(prebuilt, trx, err));
+ return row_import_cleanup(prebuilt, err);
}
/* The first index must always be the clustered index. */
@@ -4629,7 +4495,7 @@ row_import_for_mysql(
dict_index_t* index = dict_table_get_first_index(table);
if (!dict_index_is_clust(index)) {
- return(row_import_error(prebuilt, trx, DB_CORRUPTION));
+ return row_import_error(prebuilt, DB_CORRUPTION);
}
/* Update the Btree segment headers for index node and
@@ -4641,7 +4507,7 @@ row_import_for_mysql(
err = DB_CORRUPTION;);
if (err != DB_SUCCESS) {
- return(row_import_error(prebuilt, trx, err));
+ return row_import_error(prebuilt, err);
} else if (cfg.requires_purge(index->name)) {
/* Purge any delete-marked records that couldn't be
@@ -4660,7 +4526,7 @@ row_import_for_mysql(
DBUG_EXECUTE_IF("ib_import_cluster_failure", err = DB_CORRUPTION;);
if (err != DB_SUCCESS) {
- return(row_import_error(prebuilt, trx, err));
+ return row_import_error(prebuilt, err);
}
/* For secondary indexes, purge any records that couldn't be purged
@@ -4673,7 +4539,7 @@ row_import_for_mysql(
err = DB_CORRUPTION;);
if (err != DB_SUCCESS) {
- return(row_import_error(prebuilt, trx, err));
+ return row_import_error(prebuilt, err);
}
/* Ensure that the next available DB_ROW_ID is not smaller than
@@ -4697,7 +4563,7 @@ row_import_for_mysql(
ib::warn() << "Waiting for flush to complete on "
<< prebuilt->table->name;
}
- os_thread_sleep(20000);
+ std::this_thread::sleep_for(std::chrono::milliseconds(20));
}
ib::info() << "Phase IV - Flush complete";
@@ -4712,13 +4578,13 @@ row_import_for_mysql(
err = row_import_update_index_root(trx, table, false);
if (err != DB_SUCCESS) {
- return(row_import_error(prebuilt, trx, err));
+ return row_import_error(prebuilt, err);
}
err = row_import_update_discarded_flag(trx, table->id, false);
if (err != DB_SUCCESS) {
- return(row_import_error(prebuilt, trx, err));
+ return row_import_error(prebuilt, err);
}
table->file_unreadable = false;
@@ -4734,5 +4600,5 @@ row_import_for_mysql(
btr_write_autoinc(dict_table_get_first_index(table), autoinc);
}
- return(row_import_cleanup(prebuilt, trx, err));
+ return row_import_cleanup(prebuilt, err);
}
diff --git a/storage/innobase/row/row0ins.cc b/storage/innobase/row/row0ins.cc
index 79a255dfc8f..d9bc72bee28 100644
--- a/storage/innobase/row/row0ins.cc
+++ b/storage/innobase/row/row0ins.cc
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2016, 2022, MariaDB Corporation.
+Copyright (c) 2016, 2023, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -35,7 +35,6 @@ Created 4/20/1996 Heikki Tuuri
#include "que0que.h"
#include "row0upd.h"
#include "row0sel.h"
-#include "row0log.h"
#include "rem0cmp.h"
#include "lock0lock.h"
#include "log0log.h"
@@ -44,9 +43,13 @@ Created 4/20/1996 Heikki Tuuri
#include "buf0lru.h"
#include "fts0fts.h"
#include "fts0types.h"
+#ifdef BTR_CUR_HASH_ADAPT
+# include "btr0sea.h"
+#endif
#ifdef WITH_WSREP
#include <wsrep.h>
#include <mysql/service_wsrep.h>
+#include "ha_prototypes.h"
#endif /* WITH_WSREP */
/*************************************************************************
@@ -172,7 +175,7 @@ dberr_t
row_ins_sec_index_entry_by_modify(
/*==============================*/
ulint flags, /*!< in: undo logging and locking flags */
- ulint mode, /*!< in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE,
+ ulint mode, /*!< in: BTR_MODIFY_LEAF or BTR_INSERT_TREE,
depending on whether mtr holds just a leaf
latch or also a tree latch */
btr_cur_t* cursor, /*!< in: B-tree cursor */
@@ -192,8 +195,8 @@ row_ins_sec_index_entry_by_modify(
rec = btr_cur_get_rec(cursor);
- ut_ad(!dict_index_is_clust(cursor->index));
- ut_ad(rec_offs_validate(rec, cursor->index, *offsets));
+ ut_ad(!cursor->index()->is_clust());
+ ut_ad(rec_offs_validate(rec, cursor->index(), *offsets));
ut_ad(!entry->info_bits);
/* We know that in the alphabetical ordering, entry and rec are
@@ -202,7 +205,7 @@ row_ins_sec_index_entry_by_modify(
difference. */
update = row_upd_build_sec_rec_difference_binary(
- rec, cursor->index, *offsets, entry, heap);
+ rec, cursor->index(), *offsets, entry, heap);
if (!rec_get_deleted_flag(rec, rec_offs_comp(*offsets))) {
/* We should never insert in place of a record that
@@ -216,8 +219,8 @@ row_ins_sec_index_entry_by_modify(
returns. After that point, set_committed(true)
would be invoked in commit_inplace_alter_table(). */
ut_a(update->n_fields == 0);
- ut_a(!cursor->index->is_committed());
- ut_ad(!dict_index_is_online_ddl(cursor->index));
+ ut_a(!cursor->index()->is_committed());
+ ut_ad(!dict_index_is_online_ddl(cursor->index()));
return(DB_SUCCESS);
}
@@ -239,7 +242,7 @@ row_ins_sec_index_entry_by_modify(
break;
}
} else {
- ut_a(mode == BTR_MODIFY_TREE);
+ ut_ad(mode == BTR_INSERT_TREE);
if (buf_pool.running_out()) {
return(DB_LOCK_TABLE_FULL);
@@ -286,15 +289,15 @@ row_ins_clust_index_entry_by_modify(
dberr_t err = DB_SUCCESS;
btr_cur_t* cursor = btr_pcur_get_btr_cur(pcur);
TABLE* mysql_table = NULL;
- ut_ad(dict_index_is_clust(cursor->index));
+ ut_ad(cursor->index()->is_clust());
rec = btr_cur_get_rec(cursor);
ut_ad(rec_get_deleted_flag(rec,
- dict_table_is_comp(cursor->index->table)));
+ cursor->index()->table->not_redundant()));
/* In delete-marked records, DB_TRX_ID must
always refer to an existing undo log record. */
- ut_ad(rec_get_trx_id(rec, cursor->index));
+ ut_ad(rec_get_trx_id(rec, cursor->index()));
/* Build an update vector containing all the fields to be modified;
NOTE that this vector may NOT contain system columns trx_id or
@@ -305,15 +308,17 @@ row_ins_clust_index_entry_by_modify(
}
update = row_upd_build_difference_binary(
- cursor->index, entry, rec, NULL, true, true,
+ cursor->index(), entry, rec, NULL, true, true,
thr_get_trx(thr), heap, mysql_table, &err);
if (err != DB_SUCCESS) {
return(err);
}
if (mode != BTR_MODIFY_TREE) {
- ut_ad((mode & ulint(~BTR_ALREADY_S_LATCHED))
- == BTR_MODIFY_LEAF);
+ ut_ad(mode == BTR_MODIFY_LEAF
+ || mode == BTR_MODIFY_LEAF_ALREADY_LATCHED
+ || mode == BTR_MODIFY_ROOT_AND_LEAF
+ || mode == BTR_MODIFY_ROOT_AND_LEAF_ALREADY_LATCHED);
/* Try optimistic updating of the record, keeping changes
within the page */
@@ -672,7 +677,7 @@ row_ins_set_detailed(
{
ut_ad(!srv_read_only_mode);
- mutex_enter(&srv_misc_tmpfile_mutex);
+ mysql_mutex_lock(&srv_misc_tmpfile_mutex);
rewind(srv_misc_tmpfile);
if (os_file_set_eof(srv_misc_tmpfile)) {
@@ -686,13 +691,14 @@ row_ins_set_detailed(
trx_set_detailed_error(trx, "temp file operation failed");
}
- mutex_exit(&srv_misc_tmpfile_mutex);
+ mysql_mutex_unlock(&srv_misc_tmpfile_mutex);
}
/*********************************************************************//**
Acquires dict_foreign_err_mutex, rewinds dict_foreign_err_file
and displays information about the given transaction.
The caller must release dict_foreign_err_mutex. */
+TRANSACTIONAL_TARGET
static
void
row_ins_foreign_trx_print(
@@ -705,13 +711,14 @@ row_ins_foreign_trx_print(
ut_ad(!srv_read_only_mode);
- lock_mutex_enter();
- n_rec_locks = lock_number_of_rows_locked(&trx->lock);
- n_trx_locks = UT_LIST_GET_LEN(trx->lock.trx_locks);
- heap_size = mem_heap_get_size(trx->lock.lock_heap);
- lock_mutex_exit();
+ {
+ TMLockMutexGuard g{SRW_LOCK_CALL};
+ n_rec_locks = trx->lock.n_rec_locks;
+ n_trx_locks = UT_LIST_GET_LEN(trx->lock.trx_locks);
+ heap_size = mem_heap_get_size(trx->lock.lock_heap);
+ }
- mutex_enter(&dict_foreign_err_mutex);
+ mysql_mutex_lock(&dict_foreign_err_mutex);
rewind(dict_foreign_err_file);
ut_print_timestamp(dict_foreign_err_file);
fputs(" Transaction:\n", dict_foreign_err_file);
@@ -719,7 +726,7 @@ row_ins_foreign_trx_print(
trx_print_low(dict_foreign_err_file, trx, 600,
n_rec_locks, n_trx_locks, heap_size);
- ut_ad(mutex_own(&dict_foreign_err_mutex));
+ mysql_mutex_assert_owner(&dict_foreign_err_mutex);
}
/*********************************************************************//**
@@ -777,7 +784,7 @@ row_ins_foreign_report_err(
}
putc('\n', ef);
- mutex_exit(&dict_foreign_err_mutex);
+ mysql_mutex_unlock(&dict_foreign_err_mutex);
}
/*********************************************************************//**
@@ -843,7 +850,7 @@ row_ins_foreign_report_add_err(
}
putc('\n', ef);
- mutex_exit(&dict_foreign_err_mutex);
+ mysql_mutex_unlock(&dict_foreign_err_mutex);
}
/*********************************************************************//**
@@ -993,7 +1000,8 @@ row_ins_foreign_check_on_constraint(
{
upd_node_t* node;
upd_node_t* cascade;
- dict_table_t* table = foreign->foreign_table;
+ dict_table_t*const*const fktable = &foreign->foreign_table;
+ dict_table_t* table = *fktable;
dict_index_t* index;
dict_index_t* clust_index;
dtuple_t* ref;
@@ -1013,8 +1021,8 @@ row_ins_foreign_check_on_constraint(
/* Since we are going to delete or update a row, we have to invalidate
the MySQL query cache for table. A deadlock of threads is not possible
here because the caller of this function does not hold any latches with
- the mutex rank above the lock_sys_t::mutex. The query cache mutex
- has a rank just above the lock_sys_t::mutex. */
+ the mutex rank above the lock_sys.latch. The query cache mutex
+ has a rank just above the lock_sys.latch. */
row_ins_invalidate_query_cache(thr, table->name.m_name);
@@ -1106,7 +1114,7 @@ row_ins_foreign_check_on_constraint(
goto nonstandard_exit_func;
}
- index = btr_pcur_get_btr_cur(pcur)->index;
+ index = pcur->index();
ut_a(index == foreign->foreign_index);
@@ -1129,9 +1137,14 @@ row_ins_foreign_check_on_constraint(
ref = row_build_row_ref(ROW_COPY_POINTERS, index, rec,
tmp_heap);
- btr_pcur_open_with_no_init(clust_index, ref,
- PAGE_CUR_LE, BTR_SEARCH_LEAF,
- cascade->pcur, mtr);
+ cascade->pcur->old_rec = nullptr;
+ cascade->pcur->btr_cur.page_cur.index = clust_index;
+ err = btr_pcur_open_with_no_init(ref,
+ PAGE_CUR_LE, BTR_SEARCH_LEAF,
+ cascade->pcur, mtr);
+ if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
+ goto nonstandard_exit_func;
+ }
clust_rec = btr_pcur_get_rec(cascade->pcur);
clust_block = btr_pcur_get_block(cascade->pcur);
@@ -1161,7 +1174,7 @@ row_ins_foreign_check_on_constraint(
/* Set an X-lock on the row to delete or update in the child table */
- err = lock_table(0, table, LOCK_IX, thr);
+ err = lock_table(table, fktable, LOCK_IX, thr);
if (err == DB_SUCCESS) {
/* Here it suffices to use a LOCK_REC_NOT_GAP type lock;
@@ -1338,21 +1351,14 @@ row_ins_foreign_check_on_constraint(
err = row_update_cascade_for_mysql(thr, cascade,
foreign->foreign_table);
- /* Release the data dictionary latch for a while, so that we do not
- starve other threads from doing CREATE TABLE etc. if we have a huge
- cascaded operation running. */
-
- row_mysql_unfreeze_data_dictionary(thr_get_trx(thr));
-
- DEBUG_SYNC_C("innodb_dml_cascade_dict_unfreeze");
-
- row_mysql_freeze_data_dictionary(thr_get_trx(thr));
-
mtr_start(mtr);
/* Restore pcur position */
- btr_pcur_restore_position(BTR_SEARCH_LEAF, pcur, mtr);
+ if (pcur->restore_position(BTR_SEARCH_LEAF, mtr)
+ != btr_pcur_t::SAME_ALL) {
+ err = DB_CORRUPTION;
+ }
if (tmp_heap) {
mem_heap_free(tmp_heap);
@@ -1371,7 +1377,10 @@ nonstandard_exit_func:
mtr_commit(mtr);
mtr_start(mtr);
- btr_pcur_restore_position(BTR_SEARCH_LEAF, pcur, mtr);
+ if (pcur->restore_position(BTR_SEARCH_LEAF, mtr)
+ != btr_pcur_t::SAME_ALL && err == DB_SUCCESS) {
+ err = DB_CORRUPTION;
+ }
DBUG_RETURN(err);
}
@@ -1457,10 +1466,7 @@ row_ins_check_foreign_constraint(
dtuple_t* entry, /*!< in: index entry for index */
que_thr_t* thr) /*!< in: query thread */
{
- dberr_t err;
upd_node_t* upd_node;
- dict_table_t* check_table;
- dict_index_t* check_index;
ulint n_fields_cmp;
btr_pcur_t pcur;
int cmp;
@@ -1482,14 +1488,10 @@ row_ins_check_foreign_constraint(
upd_node= NULL;
#endif /* WITH_WSREP */
- ut_ad(rw_lock_own(&dict_sys.latch, RW_LOCK_S));
-
- err = DB_SUCCESS;
-
- if (trx->check_foreigns == FALSE) {
+ if (!trx->check_foreigns) {
/* The user has suppressed foreign key checks currently for
this session */
- goto exit_func;
+ DBUG_RETURN(DB_SUCCESS);
}
/* If any of the foreign key fields in entry is SQL NULL, we
@@ -1498,12 +1500,12 @@ row_ins_check_foreign_constraint(
for (ulint i = 0; i < entry->n_fields; i++) {
dfield_t* field = dtuple_get_nth_field(entry, i);
if (i < foreign->n_fields && dfield_is_null(field)) {
- goto exit_func;
+ DBUG_RETURN(DB_SUCCESS);
}
/* System Versioning: if row_end != Inf, we
suppress the foreign key check */
if (field->type.vers_sys_end() && field->vers_history_row()) {
- goto exit_func;
+ DBUG_RETURN(DB_SUCCESS);
}
}
@@ -1528,7 +1530,7 @@ row_ins_check_foreign_constraint(
another, and the user has problems predicting in
which order they are performed. */
- goto exit_func;
+ DBUG_RETURN(DB_SUCCESS);
}
}
@@ -1540,23 +1542,32 @@ row_ins_check_foreign_constraint(
dfield_t* row_end = dtuple_get_nth_field(
insert_node->row, table->vers_end);
if (row_end->vers_history_row()) {
- goto exit_func;
+ DBUG_RETURN(DB_SUCCESS);
}
}
}
- if (check_ref) {
- check_table = foreign->referenced_table;
- check_index = foreign->referenced_index;
- } else {
- check_table = foreign->foreign_table;
- check_index = foreign->foreign_index;
+ dict_table_t *check_table;
+ dict_index_t *check_index;
+ dberr_t err = DB_SUCCESS;
+
+ {
+ dict_table_t*& fktable = check_ref
+ ? foreign->referenced_table : foreign->foreign_table;
+ check_table = fktable;
+ if (check_table) {
+ err = lock_table(check_table, &fktable, LOCK_IS, thr);
+ if (err != DB_SUCCESS) {
+ goto do_possible_lock_wait;
+ }
+ }
+ check_table = fktable;
}
- if (check_table == NULL
- || !check_table->is_readable()
- || check_index == NULL) {
+ check_index = check_ref
+ ? foreign->referenced_index : foreign->foreign_index;
+ if (!check_table || !check_table->is_readable() || !check_index) {
FILE* ef = dict_foreign_err_file;
std::string fk_str;
@@ -1601,22 +1612,10 @@ row_ins_check_foreign_constraint(
err = DB_ROW_IS_REFERENCED;
}
- mutex_exit(&dict_foreign_err_mutex);
+ mysql_mutex_unlock(&dict_foreign_err_mutex);
goto exit_func;
}
- if (check_table != table) {
- /* We already have a LOCK_IX on table, but not necessarily
- on check_table */
-
- err = lock_table(0, check_table, LOCK_IS, thr);
-
- if (err != DB_SUCCESS) {
-
- goto do_possible_lock_wait;
- }
- }
-
mtr_start(&mtr);
/* Store old value on n_fields_cmp */
@@ -1624,9 +1623,11 @@ row_ins_check_foreign_constraint(
n_fields_cmp = dtuple_get_n_fields_cmp(entry);
dtuple_set_n_fields_cmp(entry, foreign->n_fields);
-
- btr_pcur_open(check_index, entry, PAGE_CUR_GE,
- BTR_SEARCH_LEAF, &pcur, &mtr);
+ pcur.btr_cur.page_cur.index = check_index;
+ err = btr_pcur_open(entry, PAGE_CUR_GE, BTR_SEARCH_LEAF, &pcur, &mtr);
+ if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
+ goto end_scan;
+ }
/* Scan index records and check if there is a matching record */
@@ -1812,9 +1813,8 @@ row_ins_check_foreign_constraint(
}
end_scan:
- btr_pcur_close(&pcur);
-
mtr_commit(&mtr);
+ ut_free(pcur.old_rec_buf);
/* Restore old value */
dtuple_set_n_fields_cmp(entry, n_fields_cmp);
@@ -1823,29 +1823,19 @@ do_possible_lock_wait:
if (err == DB_LOCK_WAIT) {
trx->error_state = err;
- que_thr_stop_for_mysql(thr);
-
thr->lock_state = QUE_THR_LOCK_ROW;
- check_table->inc_fk_checks();
-
- lock_wait_suspend_thread(thr);
+ err = lock_wait(thr);
thr->lock_state = QUE_THR_LOCK_NOLOCK;
- err = trx->error_state;
- if (err != DB_SUCCESS) {
- } else if (check_table->to_be_dropped) {
- err = DB_LOCK_WAIT_TIMEOUT;
- } else {
+ if (err == DB_SUCCESS) {
err = DB_LOCK_WAIT;
}
-
- check_table->dec_fk_checks();
}
exit_func:
- if (heap != NULL) {
+ if (UNIV_LIKELY_NULL(heap)) {
mem_heap_free(heap);
}
@@ -1910,14 +1900,10 @@ row_ins_check_foreign_constraints(
{
dict_foreign_t* foreign;
dberr_t err = DB_SUCCESS;
- trx_t* trx;
- ibool got_s_lock = FALSE;
mem_heap_t* heap = NULL;
DBUG_ASSERT(index->is_primary() == pk);
- trx = thr_get_trx(thr);
-
DEBUG_SYNC_C_IF_THD(thr_get_trx(thr)->mysql_thd,
"foreign_constraint_check_for_ins");
@@ -1960,37 +1946,14 @@ row_ins_check_foreign_constraints(
ref_table = dict_table_open_on_name(
foreign->referenced_table_name_lookup,
- FALSE, FALSE, DICT_ERR_IGNORE_NONE);
- }
-
- if (0 == trx->dict_operation_lock_mode) {
- got_s_lock = TRUE;
-
- row_mysql_freeze_data_dictionary(trx);
+ false, DICT_ERR_IGNORE_NONE);
}
- if (referenced_table) {
- foreign->foreign_table->inc_fk_checks();
- }
-
- /* NOTE that if the thread ends up waiting for a lock
- we will release dict_sys.latch temporarily!
- But the counter on the table protects the referenced
- table from being dropped while the check is running. */
-
err = row_ins_check_foreign_constraint(
TRUE, foreign, table, ref_tuple, thr);
- if (referenced_table) {
- foreign->foreign_table->dec_fk_checks();
- }
-
- if (got_s_lock) {
- row_mysql_unfreeze_data_dictionary(trx);
- }
-
- if (ref_table != NULL) {
- dict_table_close(ref_table, FALSE, FALSE);
+ if (ref_table) {
+ dict_table_close(ref_table);
}
}
}
@@ -2118,7 +2081,6 @@ row_ins_scan_sec_index_for_duplicate(
dict_index_t* index, /*!< in: non-clustered unique index */
dtuple_t* entry, /*!< in: index entry */
que_thr_t* thr, /*!< in: query thread */
- bool s_latch,/*!< in: whether index->lock is being held */
mtr_t* mtr, /*!< in/out: mini-transaction */
mem_heap_t* offsets_heap)
/*!< in/out: memory heap that can be emptied */
@@ -2127,15 +2089,13 @@ row_ins_scan_sec_index_for_duplicate(
int cmp;
ulint n_fields_cmp;
btr_pcur_t pcur;
- dberr_t err = DB_SUCCESS;
rec_offs offsets_[REC_OFFS_SEC_INDEX_SIZE];
rec_offs* offsets = offsets_;
DBUG_ENTER("row_ins_scan_sec_index_for_duplicate");
rec_offs_init(offsets_);
- ut_ad(s_latch == rw_lock_own_flagged(
- &index->lock, RW_LOCK_FLAG_S | RW_LOCK_FLAG_SX));
+ ut_ad(!index->lock.have_any());
n_unique = dict_index_get_n_unique(index);
@@ -2158,14 +2118,13 @@ row_ins_scan_sec_index_for_duplicate(
n_fields_cmp = dtuple_get_n_fields_cmp(entry);
dtuple_set_n_fields_cmp(entry, n_unique);
-
- btr_pcur_open(index, entry, PAGE_CUR_GE,
- s_latch
- ? BTR_SEARCH_LEAF_ALREADY_S_LATCHED
- : BTR_SEARCH_LEAF,
- &pcur, mtr);
-
+ pcur.btr_cur.page_cur.index = index;
trx_t* const trx = thr_get_trx(thr);
+ dberr_t err = btr_pcur_open(entry, PAGE_CUR_GE, BTR_SEARCH_LEAF,
+ &pcur, mtr);
+ if (err != DB_SUCCESS) {
+ goto end_scan;
+ }
/* Scan index records and check if there is a duplicate */
@@ -2323,11 +2282,11 @@ row_ins_duplicate_error_in_clust_online(
dberr_t err = DB_SUCCESS;
const rec_t* rec = btr_cur_get_rec(cursor);
- ut_ad(!cursor->index->is_instant());
+ ut_ad(!cursor->index()->is_instant());
if (cursor->low_match >= n_uniq && !page_rec_is_infimum(rec)) {
- *offsets = rec_get_offsets(rec, cursor->index, *offsets,
- cursor->index->n_fields,
+ *offsets = rec_get_offsets(rec, cursor->index(), *offsets,
+ cursor->index()->n_fields,
ULINT_UNDEFINED, heap);
err = row_ins_duplicate_online(n_uniq, entry, rec, *offsets);
if (err != DB_SUCCESS) {
@@ -2335,11 +2294,13 @@ row_ins_duplicate_error_in_clust_online(
}
}
- rec = page_rec_get_next_const(btr_cur_get_rec(cursor));
+ if (!(rec = page_rec_get_next_const(btr_cur_get_rec(cursor)))) {
+ return DB_CORRUPTION;
+ }
if (cursor->up_match >= n_uniq && !page_rec_is_supremum(rec)) {
- *offsets = rec_get_offsets(rec, cursor->index, *offsets,
- cursor->index->n_fields,
+ *offsets = rec_get_offsets(rec, cursor->index(), *offsets,
+ cursor->index()->n_fields,
ULINT_UNDEFINED, heap);
err = row_ins_duplicate_online(n_uniq, entry, rec, *offsets);
}
@@ -2372,7 +2333,7 @@ row_ins_duplicate_error_in_clust(
rec_offs* offsets = offsets_;
rec_offs_init(offsets_);
- ut_ad(dict_index_is_clust(cursor->index));
+ ut_ad(cursor->index()->is_clust());
/* NOTE: For unique non-clustered indexes there may be any number
of delete marked records with the same value for the non-clustered
@@ -2387,15 +2348,17 @@ row_ins_duplicate_error_in_clust(
user records on the leaf level. So, even if low_match would suggest
that a duplicate key violation may occur, this may not be the case. */
- n_unique = dict_index_get_n_unique(cursor->index);
+ n_unique = dict_index_get_n_unique(cursor->index());
if (cursor->low_match >= n_unique) {
rec = btr_cur_get_rec(cursor);
if (!page_rec_is_infimum(rec)) {
- offsets = rec_get_offsets(rec, cursor->index, offsets,
- cursor->index->n_core_fields,
+ offsets = rec_get_offsets(rec, cursor->index(),
+ offsets,
+ cursor->index()
+ ->n_core_fields,
ULINT_UNDEFINED, &heap);
/* We set a lock on the possible duplicate: this
@@ -2416,13 +2379,13 @@ row_ins_duplicate_error_in_clust(
err = row_ins_set_exclusive_rec_lock(
LOCK_REC_NOT_GAP,
btr_cur_get_block(cursor),
- rec, cursor->index, offsets, thr);
+ rec, cursor->index(), offsets, thr);
} else {
err = row_ins_set_shared_rec_lock(
LOCK_REC_NOT_GAP,
btr_cur_get_block(cursor), rec,
- cursor->index, offsets, thr);
+ cursor->index(), offsets, thr);
}
switch (err) {
@@ -2434,11 +2397,11 @@ row_ins_duplicate_error_in_clust(
}
if (row_ins_dupl_error_with_rec(
- rec, entry, cursor->index, offsets)) {
+ rec, entry, cursor->index(), offsets)) {
duplicate:
- trx->error_info = cursor->index;
+ trx->error_info = cursor->index();
err = DB_DUPLICATE_KEY;
- if (cursor->index->table->versioned()
+ if (cursor->index()->table->versioned()
&& entry->vers_history_row())
{
ulint trx_id_len;
@@ -2455,13 +2418,17 @@ duplicate:
}
}
+ err = DB_SUCCESS;
+
if (cursor->up_match >= n_unique) {
rec = page_rec_get_next(btr_cur_get_rec(cursor));
- if (!page_rec_is_supremum(rec)) {
- offsets = rec_get_offsets(rec, cursor->index, offsets,
- cursor->index->n_core_fields,
+ if (rec && !page_rec_is_supremum(rec)) {
+ offsets = rec_get_offsets(rec, cursor->index(),
+ offsets,
+ cursor->index()
+ ->n_core_fields,
ULINT_UNDEFINED, &heap);
if (trx->duplicates) {
@@ -2474,34 +2441,33 @@ duplicate:
err = row_ins_set_exclusive_rec_lock(
LOCK_REC_NOT_GAP,
btr_cur_get_block(cursor),
- rec, cursor->index, offsets, thr);
+ rec, cursor->index(), offsets, thr);
} else {
err = row_ins_set_shared_rec_lock(
LOCK_REC_NOT_GAP,
btr_cur_get_block(cursor),
- rec, cursor->index, offsets, thr);
+ rec, cursor->index(), offsets, thr);
}
switch (err) {
+ default:
+ break;
case DB_SUCCESS_LOCKED_REC:
+ err = DB_SUCCESS;
+ /* fall through */
case DB_SUCCESS:
- break;
- default:
- goto func_exit;
- }
-
- if (row_ins_dupl_error_with_rec(
- rec, entry, cursor->index, offsets)) {
- goto duplicate;
+ if (row_ins_dupl_error_with_rec(
+ rec, entry, cursor->index(),
+ offsets)) {
+ goto duplicate;
+ }
}
}
/* This should never happen */
- ut_error;
+ err = DB_CORRUPTION;
}
-
- err = DB_SUCCESS;
func_exit:
if (UNIV_LIKELY_NULL(heap)) {
mem_heap_free(heap);
@@ -2534,7 +2500,7 @@ row_ins_must_modify_rec(
and a secondary index node pointer contains all index fields. */
return(cursor->low_match
- >= dict_index_get_n_unique_in_tree(cursor->index)
+ >= dict_index_get_n_unique_in_tree(cursor->index())
&& !page_rec_is_infimum(btr_cur_get_rec(cursor)));
}
@@ -2562,9 +2528,9 @@ row_ins_index_entry_big_rec(
mtr_t mtr;
btr_pcur_t pcur;
rec_t* rec;
- dberr_t error;
- ut_ad(dict_index_is_clust(index));
+ pcur.btr_cur.page_cur.index = index;
+ ut_ad(index->is_primary());
DEBUG_SYNC_C_IF_THD(thd, "before_row_ins_extern_latch");
@@ -2575,8 +2541,12 @@ row_ins_index_entry_big_rec(
index->set_modified(mtr);
}
- btr_pcur_open(index, entry, PAGE_CUR_LE, BTR_MODIFY_TREE,
- &pcur, &mtr);
+ dberr_t error = btr_pcur_open(entry, PAGE_CUR_LE, BTR_MODIFY_TREE,
+ &pcur, &mtr);
+ if (error != DB_SUCCESS) {
+ return error;
+ }
+
rec = btr_pcur_get_rec(&pcur);
offsets = rec_get_offsets(rec, index, offsets, index->n_core_fields,
ULINT_UNDEFINED, heap);
@@ -2586,18 +2556,25 @@ row_ins_index_entry_big_rec(
&pcur, offsets, big_rec, &mtr, BTR_STORE_INSERT);
DEBUG_SYNC_C_IF_THD(thd, "after_row_ins_extern");
- if (error == DB_SUCCESS
- && dict_index_is_online_ddl(index)) {
- row_log_table_insert(btr_pcur_get_rec(&pcur), index, offsets);
- }
-
mtr.commit();
- btr_pcur_close(&pcur);
-
+ ut_free(pcur.old_rec_buf);
return(error);
}
+#if 0
+extern "C" int thd_is_slave(const MYSQL_THD thd);
+#else
+# define thd_is_slave(thd) 0
+#endif
+
+#if defined __aarch64__&&defined __GNUC__&&__GNUC__==4&&!defined __clang__
+/* Avoid GCC 4.8.5 internal compiler error due to srw_mutex::wr_unlock().
+We would only need this for row_ins_clust_index_entry_low(),
+but GCC 4.8.5 does not support pop_options. */
+# pragma GCC optimize ("O0")
+#endif
+
/***************************************************************//**
Tries to insert an entry into a clustered index, ignoring foreign key
constraints. If a record with the same unique key is found, the other
@@ -2613,7 +2590,7 @@ dberr_t
row_ins_clust_index_entry_low(
/*==========================*/
ulint flags, /*!< in: undo logging and locking flags */
- ulint mode, /*!< in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE,
+ btr_latch_mode mode, /*!< in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE,
depending on whether we wish optimistic or
pessimistic descent down the index tree */
dict_index_t* index, /*!< in: clustered index */
@@ -2623,15 +2600,16 @@ row_ins_clust_index_entry_low(
que_thr_t* thr) /*!< in: query thread */
{
btr_pcur_t pcur;
- btr_cur_t* cursor;
dberr_t err = DB_SUCCESS;
big_rec_t* big_rec = NULL;
mtr_t mtr;
- ib_uint64_t auto_inc = 0;
+ uint64_t auto_inc = 0;
mem_heap_t* offsets_heap = NULL;
rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
rec_offs* offsets = offsets_;
rec_offs_init(offsets_);
+ trx_t* trx = thr_get_trx(thr);
+ buf_block_t* block;
DBUG_ENTER("row_ins_clust_index_entry_low");
@@ -2639,9 +2617,9 @@ row_ins_clust_index_entry_low(
ut_ad(!dict_index_is_unique(index)
|| n_uniq == dict_index_get_n_unique(index));
ut_ad(!n_uniq || n_uniq == dict_index_get_n_unique(index));
- ut_ad(!thr_get_trx(thr)->in_rollback);
+ ut_ad(!trx->in_rollback);
- mtr_start(&mtr);
+ mtr.start();
if (index->table->is_temporary()) {
/* Disable REDO logging as the lifetime of temp-tables is
@@ -2664,7 +2642,7 @@ row_ins_clust_index_entry_low(
} else {
if (mode == BTR_MODIFY_LEAF
&& dict_index_is_online_ddl(index)) {
- mode = BTR_MODIFY_LEAF_ALREADY_S_LATCHED;
+ mode = BTR_MODIFY_LEAF_ALREADY_LATCHED;
mtr_s_lock_index(index, &mtr);
}
@@ -2681,6 +2659,13 @@ row_ins_clust_index_entry_low(
dfield->type.mtype,
dfield->type.prtype
& DATA_UNSIGNED);
+ if (auto_inc
+ && mode != BTR_MODIFY_TREE) {
+ mode = btr_latch_mode(
+ BTR_MODIFY_ROOT_AND_LEAF
+ ^ BTR_MODIFY_LEAF
+ ^ mode);
+ }
}
}
}
@@ -2689,20 +2674,27 @@ row_ins_clust_index_entry_low(
/* Note that we use PAGE_CUR_LE as the search mode, because then
the function will return in both low_match and up_match of the
cursor sensible values */
- err = btr_pcur_open_low(index, 0, entry, PAGE_CUR_LE, mode, &pcur,
- __FILE__, __LINE__, auto_inc, &mtr);
+ pcur.btr_cur.page_cur.index = index;
+ err = btr_pcur_open(entry, PAGE_CUR_LE, mode, &pcur, &mtr);
if (err != DB_SUCCESS) {
index->table->file_unreadable = true;
+err_exit:
mtr.commit();
goto func_exit;
}
- cursor = btr_pcur_get_btr_cur(&pcur);
- cursor->thr = thr;
+ if (auto_inc) {
+ buf_block_t* root
+ = mtr.at_savepoint(mode != BTR_MODIFY_ROOT_AND_LEAF);
+ ut_ad(index->page == root->page.id().page_no());
+ page_set_autoinc(root, auto_inc, &mtr, false);
+ }
+
+ btr_pcur_get_btr_cur(&pcur)->thr = thr;
#ifdef UNIV_DEBUG
{
- page_t* page = btr_cur_get_page(cursor);
+ page_t* page = btr_pcur_get_page(&pcur);
rec_t* first_rec = page_rec_get_next(
page_get_infimum_rec(page));
@@ -2711,31 +2703,91 @@ row_ins_clust_index_entry_low(
}
#endif /* UNIV_DEBUG */
+ block = btr_pcur_get_block(&pcur);
+
+ DBUG_EXECUTE_IF("row_ins_row_level", goto skip_bulk_insert;);
+
+ if (!(flags & BTR_NO_UNDO_LOG_FLAG)
+ && page_is_empty(block->page.frame)
+ && !entry->is_metadata() && !trx->duplicates
+ && !trx->check_unique_secondary && !trx->check_foreigns
+ && !trx->dict_operation
+ && block->page.id().page_no() == index->page
+ && !index->table->skip_alter_undo
+ && !index->table->n_rec_locks
+ && !index->table->is_active_ddl()
+ && !index->table->versioned()
+ && !thd_is_slave(trx->mysql_thd) /* FIXME: MDEV-24622 */) {
+ DEBUG_SYNC_C("empty_root_page_insert");
+
+ if (!index->table->is_temporary()) {
+ err = lock_table(index->table, NULL, LOCK_X, thr);
+
+ if (err != DB_SUCCESS) {
+ trx->error_state = err;
+ goto err_exit;
+ }
+
+ if (index->table->n_rec_locks) {
+ goto skip_bulk_insert;
+ }
+
+#if 0
+ if (trx->is_wsrep())
+ {
+ if (!wsrep_thd_is_local_transaction(trx->mysql_thd))
+ goto skip_bulk_insert;
+ if (wsrep_append_table_key(trx->mysql_thd, *index->table))
+ {
+ trx->error_state = DB_ROLLBACK;
+ goto err_exit;
+ }
+ }
+#endif /* WITH_WSREP */
+
+#ifdef BTR_CUR_HASH_ADAPT
+ if (btr_search_enabled) {
+ btr_search_x_lock_all();
+ index->table->bulk_trx_id = trx->id;
+ btr_search_x_unlock_all();
+ } else {
+ index->table->bulk_trx_id = trx->id;
+ }
+#else /* BTR_CUR_HASH_ADAPT */
+ index->table->bulk_trx_id = trx->id;
+#endif /* BTR_CUR_HASH_ADAPT */
+ }
+
+ trx->bulk_insert = true;
+ }
+
+skip_bulk_insert:
if (UNIV_UNLIKELY(entry->info_bits != 0)) {
ut_ad(entry->is_metadata());
ut_ad(flags == BTR_NO_LOCKING_FLAG);
ut_ad(index->is_instant());
ut_ad(!dict_index_is_online_ddl(index));
- const rec_t* rec = btr_cur_get_rec(cursor);
+ const rec_t* rec = btr_pcur_get_rec(&pcur);
if (rec_get_info_bits(rec, page_rec_is_comp(rec))
& REC_INFO_MIN_REC_FLAG) {
- thr_get_trx(thr)->error_info = index;
+ trx->error_info = index;
err = DB_DUPLICATE_KEY;
goto err_exit;
}
- ut_ad(!row_ins_must_modify_rec(cursor));
+ ut_ad(!row_ins_must_modify_rec(&pcur.btr_cur));
goto do_insert;
}
- if (rec_is_metadata(btr_cur_get_rec(cursor), *index)) {
+ if (rec_is_metadata(btr_pcur_get_rec(&pcur), *index)) {
goto do_insert;
}
if (n_uniq
- && (cursor->up_match >= n_uniq || cursor->low_match >= n_uniq)) {
+ && (pcur.btr_cur.up_match >= n_uniq
+ || pcur.btr_cur.low_match >= n_uniq)) {
if (flags
== (BTR_CREATE_FLAG | BTR_NO_LOCKING_FLAG
@@ -2743,7 +2795,7 @@ row_ins_clust_index_entry_low(
/* Set no locks when applying log
in online table rebuild. Only check for duplicates. */
err = row_ins_duplicate_error_in_clust_online(
- n_uniq, entry, cursor,
+ n_uniq, entry, &pcur.btr_cur,
&offsets, &offsets_heap);
switch (err) {
@@ -2754,26 +2806,24 @@ row_ins_clust_index_entry_low(
/* fall through */
case DB_SUCCESS_LOCKED_REC:
case DB_DUPLICATE_KEY:
- thr_get_trx(thr)->error_info = cursor->index;
+ trx->error_info = index;
}
} else {
/* Note that the following may return also
DB_LOCK_WAIT */
err = row_ins_duplicate_error_in_clust(
- flags, cursor, entry, thr);
+ flags, &pcur.btr_cur, entry, thr);
}
if (err != DB_SUCCESS) {
-err_exit:
- mtr_commit(&mtr);
- goto func_exit;
+ goto err_exit;
}
}
/* Note: Allowing duplicates would qualify for modification of
an existing record as the new entry is exactly same as old entry. */
- if (row_ins_must_modify_rec(cursor)) {
+ if (row_ins_must_modify_rec(&pcur.btr_cur)) {
/* There is already an index entry with a long enough common
prefix, we must convert the insert into a modify of an
existing record */
@@ -2783,11 +2833,6 @@ err_exit:
&pcur, flags, mode, &offsets, &offsets_heap,
entry_heap, entry, thr, &mtr);
- if (err == DB_SUCCESS && dict_index_is_online_ddl(index)) {
- row_log_table_insert(btr_cur_get_rec(cursor),
- index, offsets);
- }
-
mtr_commit(&mtr);
mem_heap_free(entry_heap);
} else {
@@ -2796,10 +2841,13 @@ do_insert:
rec_t* insert_rec;
if (mode != BTR_MODIFY_TREE) {
- ut_ad((mode & ulint(~BTR_ALREADY_S_LATCHED))
- == BTR_MODIFY_LEAF);
+ ut_ad(mode == BTR_MODIFY_LEAF
+ || mode == BTR_MODIFY_LEAF_ALREADY_LATCHED
+ || mode == BTR_MODIFY_ROOT_AND_LEAF
+ || mode
+ == BTR_MODIFY_ROOT_AND_LEAF_ALREADY_LATCHED);
err = btr_cur_optimistic_insert(
- flags, cursor, &offsets, &offsets_heap,
+ flags, &pcur.btr_cur, &offsets, &offsets_heap,
entry, &insert_rec, &big_rec,
n_ext, thr, &mtr);
} else {
@@ -2808,26 +2856,24 @@ do_insert:
goto err_exit;
}
- DEBUG_SYNC_C("before_insert_pessimitic_row_ins_clust");
-
err = btr_cur_optimistic_insert(
- flags, cursor,
+ flags, &pcur.btr_cur,
&offsets, &offsets_heap,
entry, &insert_rec, &big_rec,
n_ext, thr, &mtr);
if (err == DB_FAIL) {
err = btr_cur_pessimistic_insert(
- flags, cursor,
+ flags, &pcur.btr_cur,
&offsets, &offsets_heap,
entry, &insert_rec, &big_rec,
n_ext, thr, &mtr);
}
}
- if (big_rec != NULL) {
- mtr_commit(&mtr);
+ mtr.commit();
+ if (big_rec) {
/* Online table rebuild could read (and
ignore) the incomplete record at this point.
If online rebuild is in progress, the
@@ -2838,16 +2884,8 @@ do_insert:
log_write_up_to(mtr.commit_lsn(), true););
err = row_ins_index_entry_big_rec(
entry, big_rec, offsets, &offsets_heap, index,
- thr_get_trx(thr)->mysql_thd);
+ trx->mysql_thd);
dtuple_convert_back_big_rec(index, entry, big_rec);
- } else {
- if (err == DB_SUCCESS
- && dict_index_is_online_ddl(index)) {
- row_log_table_insert(
- insert_rec, index, offsets);
- }
-
- mtr_commit(&mtr);
}
}
@@ -2856,24 +2894,14 @@ func_exit:
mem_heap_free(offsets_heap);
}
- btr_pcur_close(&pcur);
-
+ ut_free(pcur.old_rec_buf);
DBUG_RETURN(err);
}
-/** Start a mini-transaction and check if the index will be dropped.
+/** Start a mini-transaction.
@param[in,out] mtr mini-transaction
-@param[in,out] index secondary index
-@param[in] check whether to check
-@param[in] search_mode flags
-@return true if the index is to be dropped */
-static MY_ATTRIBUTE((warn_unused_result))
-bool
-row_ins_sec_mtr_start_and_check_if_aborted(
- mtr_t* mtr,
- dict_index_t* index,
- bool check,
- ulint search_mode)
+@param[in,out] index secondary index */
+static void row_ins_sec_mtr_start(mtr_t *mtr, dict_index_t *index)
{
ut_ad(!dict_index_is_clust(index));
ut_ad(mtr->is_named_space(index->table->space));
@@ -2883,30 +2911,6 @@ row_ins_sec_mtr_start_and_check_if_aborted(
mtr->start();
index->set_modified(*mtr);
mtr->set_log_mode(log_mode);
-
- if (!check) {
- return(false);
- }
-
- if (search_mode & BTR_ALREADY_S_LATCHED) {
- mtr_s_lock_index(index, mtr);
- } else {
- mtr_sx_lock_index(index, mtr);
- }
-
- switch (index->online_status) {
- case ONLINE_INDEX_ABORTED:
- case ONLINE_INDEX_ABORTED_DROPPED:
- ut_ad(!index->is_committed());
- return(true);
- case ONLINE_INDEX_COMPLETE:
- return(false);
- case ONLINE_INDEX_CREATION:
- break;
- }
-
- ut_error;
- return(true);
}
/***************************************************************//**
@@ -2915,13 +2919,13 @@ same fields is found, the other record is necessarily marked deleted.
It is then unmarked. Otherwise, the entry is just inserted to the index.
@retval DB_SUCCESS on success
@retval DB_LOCK_WAIT on lock wait when !(flags & BTR_NO_LOCKING_FLAG)
-@retval DB_FAIL if retry with BTR_MODIFY_TREE is needed
+@retval DB_FAIL if retry with BTR_INSERT_TREE is needed
@return error code */
dberr_t
row_ins_sec_index_entry_low(
/*========================*/
ulint flags, /*!< in: undo logging and locking flags */
- ulint mode, /*!< in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE,
+ btr_latch_mode mode, /*!< in: BTR_MODIFY_LEAF or BTR_INSERT_TREE,
depending on whether we wish optimistic or
pessimistic descent down the index tree */
dict_index_t* index, /*!< in: secondary index */
@@ -2936,8 +2940,8 @@ row_ins_sec_index_entry_low(
DBUG_ENTER("row_ins_sec_index_entry_low");
btr_cur_t cursor;
- ulint search_mode = mode;
- dberr_t err = DB_SUCCESS;
+ btr_latch_mode search_mode = mode;
+ dberr_t err;
ulint n_unique;
mtr_t mtr;
rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
@@ -2946,10 +2950,11 @@ row_ins_sec_index_entry_low(
rtr_info_t rtr_info;
ut_ad(!dict_index_is_clust(index));
- ut_ad(mode == BTR_MODIFY_LEAF || mode == BTR_MODIFY_TREE);
+ ut_ad(mode == BTR_MODIFY_LEAF || mode == BTR_INSERT_TREE);
cursor.thr = thr;
cursor.rtr_info = NULL;
+ cursor.page_cur.index = index;
ut_ad(thr_get_trx(thr)->id != 0);
mtr.start();
@@ -2961,53 +2966,22 @@ row_ins_sec_index_entry_low(
mtr.set_log_mode(MTR_LOG_NO_REDO);
} else {
index->set_modified(mtr);
- if (!dict_index_is_spatial(index)) {
- search_mode |= BTR_INSERT;
- }
- }
-
- /* Ensure that we acquire index->lock when inserting into an
- index with index->online_status == ONLINE_INDEX_COMPLETE, but
- could still be subject to rollback_inplace_alter_table().
- This prevents a concurrent change of index->online_status.
- The memory object cannot be freed as long as we have an open
- reference to the table, or index->table->n_ref_count > 0. */
- const bool check = !index->is_committed();
- if (check) {
- DEBUG_SYNC_C("row_ins_sec_index_enter");
- if (mode == BTR_MODIFY_LEAF) {
- search_mode |= BTR_ALREADY_S_LATCHED;
- mtr_s_lock_index(index, &mtr);
- } else {
- mtr_sx_lock_index(index, &mtr);
- }
-
- if (row_log_online_op_try(
- index, entry, thr_get_trx(thr)->id)) {
- goto func_exit;
- }
}
/* Note that we use PAGE_CUR_LE as the search mode, because then
the function will return in both low_match and up_match of the
cursor sensible values */
- if (!thr_get_trx(thr)->check_unique_secondary) {
- search_mode |= BTR_IGNORE_SEC_UNIQUE;
- }
-
- if (dict_index_is_spatial(index)) {
- cursor.index = index;
+ if (index->is_spatial()) {
rtr_init_rtr_info(&rtr_info, false, &cursor, index, false);
rtr_info_update_btr(&cursor, &rtr_info);
- err = btr_cur_search_to_nth_level(
- index, 0, entry, PAGE_CUR_RTREE_INSERT,
- search_mode,
- &cursor, __FILE__, __LINE__, &mtr);
+ err = rtr_insert_leaf(&cursor, entry, search_mode, &mtr);
- if (mode == BTR_MODIFY_LEAF && rtr_info.mbr_adj) {
+ if (err == DB_SUCCESS && search_mode == BTR_MODIFY_LEAF
+ && rtr_info.mbr_adj) {
mtr_commit(&mtr);
+ search_mode = mode = BTR_MODIFY_TREE;
rtr_clean_rtr_info(&rtr_info, true);
rtr_init_rtr_info(&rtr_info, false, &cursor,
index, false);
@@ -3018,13 +2992,8 @@ row_ins_sec_index_entry_low(
} else {
index->set_modified(mtr);
}
- search_mode &= ulint(~BTR_MODIFY_LEAF);
- search_mode |= BTR_MODIFY_TREE;
- err = btr_cur_search_to_nth_level(
- index, 0, entry, PAGE_CUR_RTREE_INSERT,
- search_mode,
- &cursor, __FILE__, __LINE__, &mtr);
- mode = BTR_MODIFY_TREE;
+ err = rtr_insert_leaf(&cursor, entry,
+ search_mode, &mtr);
}
DBUG_EXECUTE_IF(
@@ -3032,21 +3001,21 @@ row_ins_sec_index_entry_low(
goto func_exit;});
} else {
- err = btr_cur_search_to_nth_level(
- index, 0, entry, PAGE_CUR_LE,
- search_mode,
- &cursor, __FILE__, __LINE__, &mtr);
+ if (!index->table->is_temporary()) {
+ search_mode = btr_latch_mode(
+ search_mode
+ | (thr_get_trx(thr)->check_unique_secondary
+ ? BTR_INSERT | BTR_IGNORE_SEC_UNIQUE
+ : BTR_INSERT));
+ }
+
+ err = cursor.search_leaf(entry, PAGE_CUR_LE, search_mode,
+ &mtr);
}
if (err != DB_SUCCESS) {
if (err == DB_DECRYPTION_FAILED) {
- ib_push_warning(thr_get_trx(thr)->mysql_thd,
- DB_DECRYPTION_FAILED,
- "Table %s is encrypted but encryption service or"
- " used key_id is not available. "
- " Can't continue reading table.",
- index->table->name.m_name);
- index->table->file_unreadable = true;
+ btr_decryption_failed(*index);
}
goto func_exit;
}
@@ -3076,13 +3045,10 @@ row_ins_sec_index_entry_low(
DEBUG_SYNC_C("row_ins_sec_index_unique");
- if (row_ins_sec_mtr_start_and_check_if_aborted(
- &mtr, index, check, search_mode)) {
- goto func_exit;
- }
+ row_ins_sec_mtr_start(&mtr, index);
err = row_ins_scan_sec_index_for_duplicate(
- flags, index, entry, thr, check, &mtr, offsets_heap);
+ flags, index, entry, thr, &mtr, offsets_heap);
mtr_commit(&mtr);
@@ -3093,9 +3059,7 @@ row_ins_sec_index_entry_low(
if (!index->is_committed()) {
ut_ad(!thr_get_trx(thr)
->dict_operation_lock_mode);
- mutex_enter(&dict_sys.mutex);
- dict_set_corrupted_index_cache_only(index);
- mutex_exit(&dict_sys.mutex);
+ index->type |= DICT_CORRUPT;
/* Do not return any error to the
caller. The duplicate will be reported
by ALTER TABLE or CREATE UNIQUE INDEX.
@@ -3113,10 +3077,7 @@ row_ins_sec_index_entry_low(
DBUG_RETURN(err);
}
- if (row_ins_sec_mtr_start_and_check_if_aborted(
- &mtr, index, check, search_mode)) {
- goto func_exit;
- }
+ row_ins_sec_mtr_start(&mtr, index);
DEBUG_SYNC_C("row_ins_sec_index_entry_dup_locks_created");
@@ -3124,12 +3085,16 @@ row_ins_sec_index_entry_low(
locked with s-locks the necessary records to
prevent any insertion of a duplicate by another
transaction. Let us now reposition the cursor and
- continue the insertion. */
- btr_cur_search_to_nth_level(
- index, 0, entry, PAGE_CUR_LE,
- (search_mode
- & ~(BTR_INSERT | BTR_IGNORE_SEC_UNIQUE)),
- &cursor, __FILE__, __LINE__, &mtr);
+ continue the insertion (bypassing the change buffer). */
+ err = cursor.search_leaf(
+ entry, PAGE_CUR_LE,
+ btr_latch_mode(search_mode
+ & ~(BTR_INSERT
+ | BTR_IGNORE_SEC_UNIQUE)),
+ &mtr);
+ if (err != DB_SUCCESS) {
+ goto func_exit;
+ }
}
if (row_ins_must_modify_rec(&cursor)) {
@@ -3164,7 +3129,6 @@ row_ins_sec_index_entry_low(
err = rtr_ins_enlarge_mbr(&cursor, &mtr);
}
} else {
- ut_ad(mode == BTR_MODIFY_TREE);
if (buf_pool.running_out()) {
err = DB_LOCK_TABLE_FULL;
goto func_exit;
@@ -3367,7 +3331,7 @@ row_ins_sec_index_entry(
log_free_check();
err = row_ins_sec_index_entry_low(
- flags, BTR_MODIFY_TREE, index,
+ flags, BTR_INSERT_TREE, index,
offsets_heap, heap, entry, 0, thr);
}
@@ -3582,22 +3546,9 @@ row_ins_alloc_row_id_step(
/*======================*/
ins_node_t* node) /*!< in: row insert node */
{
- row_id_t row_id;
-
- ut_ad(node->state == INS_NODE_ALLOC_ROW_ID);
-
- if (dict_index_is_unique(dict_table_get_first_index(node->table))) {
-
- /* No row id is stored if the clustered index is unique */
-
- return;
- }
-
- /* Fill in row id value to row */
-
- row_id = dict_sys_get_new_row_id();
-
- dict_sys_write_row_id(node->sys_buf, row_id);
+ ut_ad(node->state == INS_NODE_ALLOC_ROW_ID);
+ if (dict_table_get_first_index(node->table)->is_gen_clust())
+ dict_sys_write_row_id(node->sys_buf, dict_sys.get_new_row_id());
}
/***********************************************************//**
@@ -3701,23 +3652,14 @@ row_ins(
ut_ad(node->state == INS_NODE_INSERT_ENTRIES);
- while (node->index != NULL) {
- if (!(node->index->type & DICT_FTS)) {
- dberr_t err = row_ins_index_entry_step(node, thr);
-
- if (err != DB_SUCCESS) {
- DBUG_RETURN(err);
- }
+ while (dict_index_t *index = node->index) {
+ if (index->type & (DICT_FTS | DICT_CORRUPT)
+ || !index->is_committed()) {
+ } else if (dberr_t err = row_ins_index_entry_step(node, thr)) {
+ DBUG_RETURN(err);
}
-
- node->index = dict_table_get_next_index(node->index);
+ node->index = dict_table_get_next_index(index);
++node->entry;
-
- /* Skip corrupted secondary index and its entry */
- while (node->index && node->index->is_corrupted()) {
- node->index = dict_table_get_next_index(node->index);
- ++node->entry;
- }
}
ut_ad(node->entry == node->entry_list.end());
@@ -3788,10 +3730,6 @@ row_ins_step(
goto do_insert;
}
- if (UNIV_LIKELY(!node->table->skip_alter_undo)) {
- trx_write_trx_id(&node->sys_buf[DATA_TRX_ID_LEN], trx->id);
- }
-
if (node->state == INS_NODE_SET_IX_LOCK) {
node->state = INS_NODE_ALLOC_ROW_ID;
@@ -3809,13 +3747,13 @@ row_ins_step(
goto same_trx;
}
- err = lock_table(0, node->table, LOCK_IX, thr);
+ err = lock_table(node->table, NULL, LOCK_IX, thr);
DBUG_EXECUTE_IF("ib_row_ins_ix_lock_wait",
err = DB_LOCK_WAIT;);
if (err != DB_SUCCESS) {
-
+ node->state = INS_NODE_SET_IX_LOCK;
goto error_handling;
}
diff --git a/storage/innobase/row/row0log.cc b/storage/innobase/row/row0log.cc
index b8b4bd56239..b21ff2b9f86 100644
--- a/storage/innobase/row/row0log.cc
+++ b/storage/innobase/row/row0log.cc
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 2011, 2018, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2017, 2022, MariaDB Corporation.
+Copyright (c) 2017, 2023, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -85,79 +85,6 @@ struct row_log_buf_t {
row_log_apply(). */
};
-/** Tracks BLOB allocation during online ALTER TABLE */
-class row_log_table_blob_t {
-public:
- /** Constructor (declaring a BLOB freed)
- @param offset_arg row_log_t::tail::total */
-#ifdef UNIV_DEBUG
- row_log_table_blob_t(ulonglong offset_arg) :
- old_offset (0), free_offset (offset_arg),
- offset (BLOB_FREED) {}
-#else /* UNIV_DEBUG */
- row_log_table_blob_t() :
- offset (BLOB_FREED) {}
-#endif /* UNIV_DEBUG */
-
- /** Declare a BLOB freed again.
- @param offset_arg row_log_t::tail::total */
-#ifdef UNIV_DEBUG
- void blob_free(ulonglong offset_arg)
-#else /* UNIV_DEBUG */
- void blob_free()
-#endif /* UNIV_DEBUG */
- {
- ut_ad(offset < offset_arg);
- ut_ad(offset != BLOB_FREED);
- ut_d(old_offset = offset);
- ut_d(free_offset = offset_arg);
- offset = BLOB_FREED;
- }
- /** Declare a freed BLOB reused.
- @param offset_arg row_log_t::tail::total */
- void blob_alloc(ulonglong offset_arg) {
- ut_ad(free_offset <= offset_arg);
- ut_d(old_offset = offset);
- offset = offset_arg;
- }
- /** Determine if a BLOB was freed at a given log position
- @param offset_arg row_log_t::head::total after the log record
- @return true if freed */
- bool is_freed(ulonglong offset_arg) const {
- /* This is supposed to be the offset at the end of the
- current log record. */
- ut_ad(offset_arg > 0);
- /* We should never get anywhere close the magic value. */
- ut_ad(offset_arg < BLOB_FREED);
- return(offset_arg < offset);
- }
-private:
- /** Magic value for a freed BLOB */
- static const ulonglong BLOB_FREED = ~0ULL;
-#ifdef UNIV_DEBUG
- /** Old offset, in case a page was freed, reused, freed, ... */
- ulonglong old_offset;
- /** Offset of last blob_free() */
- ulonglong free_offset;
-#endif /* UNIV_DEBUG */
- /** Byte offset to the log file */
- ulonglong offset;
-};
-
-/** @brief Map of off-page column page numbers to 0 or log byte offsets.
-
-If there is no mapping for a page number, it is safe to access.
-If a page number maps to 0, it is an off-page column that has been freed.
-If a page number maps to a nonzero number, the number is a byte offset
-into the index->online_log, indicating that the page is safe to access
-when applying log records starting from that offset. */
-typedef std::map<
- ulint,
- row_log_table_blob_t,
- std::less<ulint>,
- ut_allocator<std::pair<const ulint, row_log_table_blob_t> > >
- page_no_map;
-
/** @brief Buffer for logging modifications during online index creation
All modifications to an index that is being created will be logged by
@@ -172,12 +99,8 @@ directly. When also head.bytes == tail.bytes, both counts will be
reset to 0 and the file will be truncated. */
struct row_log_t {
pfs_os_file_t fd; /*!< file descriptor */
- ib_mutex_t mutex; /*!< mutex protecting error,
+ mysql_mutex_t mutex; /*!< mutex protecting error,
max_trx and tail */
- page_no_map* blobs; /*!< map of page numbers of off-page columns
- that have been freed during table-rebuilding
- ALTER TABLE (row_log_table_*); protected by
- index->lock X-latch only */
dict_table_t* table; /*!< table that is being rebuilt,
or NULL when this is a secondary
index that is being created online */
@@ -237,6 +160,11 @@ struct row_log_t {
const TABLE* old_table; /*< Use old table in case of error. */
uint64_t n_rows; /*< Number of rows read from the table */
+
+ /** Alter table transaction. It can be used to apply the DML logs
+ into the table */
+ const trx_t* alter_trx;
+
/** Determine whether the log should be in the 'instant ADD' format
@param[in] index the clustered index of the source table
@return whether to use the 'instant ADD COLUMN' format */
@@ -322,15 +250,14 @@ row_log_block_free(
DBUG_VOID_RETURN;
}
-/******************************************************//**
-Logs an operation to a secondary index that is (or was) being created. */
-void
-row_log_online_op(
-/*==============*/
- dict_index_t* index, /*!< in/out: index, S or X latched */
- const dtuple_t* tuple, /*!< in: index tuple */
- trx_id_t trx_id) /*!< in: transaction ID for insert,
- or 0 for delete */
+/** Logs an operation to a secondary index that is (or was) being created.
+@param index index, S or X latched
+@param tuple index tuple
+@param trx_id transaction ID for insert, or 0 for delete
+@retval false if row_log_apply() failure happens
+or true otherwise */
+bool row_log_online_op(dict_index_t *index, const dtuple_t *tuple,
+ trx_id_t trx_id)
{
byte* b;
ulint extra_size;
@@ -338,17 +265,19 @@ row_log_online_op(
ulint mrec_size;
ulint avail_size;
row_log_t* log;
+ bool success= true;
ut_ad(dtuple_validate(tuple));
ut_ad(dtuple_get_n_fields(tuple) == dict_index_get_n_fields(index));
- ut_ad(rw_lock_own_flagged(&index->lock,
- RW_LOCK_FLAG_X | RW_LOCK_FLAG_S));
+ ut_ad(index->lock.have_x() || index->lock.have_s());
if (index->is_corrupted()) {
- return;
+ return success;
}
- ut_ad(dict_index_is_online_ddl(index));
+ ut_ad(dict_index_is_online_ddl(index)
+ || (index->online_log
+ && index->online_status == ONLINE_INDEX_COMPLETE));
/* Compute the size of the record. This differs from
row_merge_buf_encode(), because here we do not encode
@@ -364,8 +293,9 @@ row_log_online_op(
+ (trx_id ? DATA_TRX_ID_LEN : 0);
log = index->online_log;
- mutex_enter(&log->mutex);
+ mysql_mutex_lock(&log->mutex);
+start_log:
if (trx_id > log->max_trx) {
log->max_trx = trx_id;
}
@@ -404,6 +334,7 @@ row_log_online_op(
rec_convert_dtuple_to_temp<false>(
b + extra_size, index, tuple->fields, tuple->n_fields);
+
b += size;
if (mrec_size >= avail_size) {
@@ -413,7 +344,28 @@ row_log_online_op(
byte* buf = log->tail.block;
if (byte_offset + srv_sort_buf_size >= srv_online_max_size) {
- goto write_failed;
+ if (index->online_status != ONLINE_INDEX_COMPLETE)
+ goto write_failed;
+ /* About to run out of log, InnoDB has to
+ apply the online log for the completed index */
+ index->lock.s_unlock();
+ dberr_t error= row_log_apply(
+ log->alter_trx, index, nullptr, nullptr);
+ index->lock.s_lock(SRW_LOCK_CALL);
+ if (error != DB_SUCCESS) {
+ /* Mark all newly added indexes
+ as corrupted */
+ log->error = error;
+ success = false;
+ goto err_exit;
+ }
+
+ /* Recheck whether the index online log still exists */
+ if (!index->online_log) {
+ goto err_exit;
+ }
+
+ goto start_log;
}
if (mrec_size == avail_size) {
@@ -453,9 +405,6 @@ row_log_online_op(
buf, byte_offset, srv_sort_buf_size)
!= DB_SUCCESS) {
write_failed:
- /* We set the flag directly instead of invoking
- dict_set_corrupted_index_cache_only(index) here,
- because the index is not "public" yet. */
index->type |= DICT_CORRUPT;
}
@@ -472,7 +421,8 @@ write_failed:
MEM_UNDEFINED(log->tail.buf, sizeof log->tail.buf);
err_exit:
- mutex_exit(&log->mutex);
+ mysql_mutex_unlock(&log->mutex);
+ return success;
}
/******************************************************//**
@@ -500,13 +450,13 @@ row_log_table_open(
ulint size, /*!< in: size of log record */
ulint* avail) /*!< out: available size for log record */
{
- mutex_enter(&log->mutex);
+ mysql_mutex_lock(&log->mutex);
MEM_UNDEFINED(log->tail.buf, sizeof log->tail.buf);
if (log->error != DB_SUCCESS) {
err_exit:
- mutex_exit(&log->mutex);
+ mysql_mutex_unlock(&log->mutex);
return(NULL);
}
@@ -542,7 +492,7 @@ row_log_table_close_func(
{
row_log_t* log = index->online_log;
- ut_ad(mutex_own(&log->mutex));
+ mysql_mutex_assert_owner(&log->mutex);
if (size >= avail) {
const os_offset_t byte_offset
@@ -606,7 +556,7 @@ write_failed:
log->tail.total += size;
MEM_UNDEFINED(log->tail.buf, sizeof log->tail.buf);
err_exit:
- mutex_exit(&log->mutex);
+ mysql_mutex_unlock(&log->mutex);
onlineddl_rowlog_rows++;
/* 10000 means 100.00%, 4525 means 45.25% */
@@ -660,9 +610,7 @@ row_log_table_delete(
ut_ad(rec_offs_validate(rec, index, offsets));
ut_ad(rec_offs_n_fields(offsets) == dict_index_get_n_fields(index));
ut_ad(rec_offs_size(offsets) <= sizeof index->online_log->tail.buf);
- ut_ad(rw_lock_own_flagged(
- &index->lock,
- RW_LOCK_FLAG_S | RW_LOCK_FLAG_X | RW_LOCK_FLAG_SX));
+ ut_ad(index->lock.have_any());
if (index->online_status != ONLINE_INDEX_CREATION
|| (index->type & DICT_CORRUPT) || index->table->corrupted
@@ -798,7 +746,6 @@ row_log_table_low_redundant(
dtuple_t* tuple;
const ulint n_fields = rec_get_n_fields_old(rec);
- ut_ad(!page_is_comp(page_align(rec)));
ut_ad(index->n_fields >= n_fields);
ut_ad(index->n_fields == n_fields || index->is_instant());
ut_ad(dict_tf2_is_valid(index->table->flags, index->table->flags2));
@@ -957,25 +904,8 @@ row_log_table_low(
ut_ad(rec_offs_validate(rec, index, offsets));
ut_ad(rec_offs_n_fields(offsets) == dict_index_get_n_fields(index));
ut_ad(rec_offs_size(offsets) <= sizeof log->tail.buf);
- ut_ad(rw_lock_own_flagged(
- &index->lock,
- RW_LOCK_FLAG_S | RW_LOCK_FLAG_X | RW_LOCK_FLAG_SX));
-#ifdef UNIV_DEBUG
- switch (fil_page_get_type(page_align(rec))) {
- case FIL_PAGE_INDEX:
- break;
- case FIL_PAGE_TYPE_INSTANT:
- ut_ad(index->is_instant());
- ut_ad(!page_has_siblings(page_align(rec)));
- ut_ad(page_get_page_no(page_align(rec)) == index->page);
- break;
- default:
- ut_ad("wrong page type" == 0);
- }
-#endif /* UNIV_DEBUG */
- ut_ad(!rec_is_metadata(rec, *index));
- ut_ad(page_rec_is_leaf(rec));
- ut_ad(!page_is_comp(page_align(rec)) == !rec_offs_comp(offsets));
+ ut_ad(index->lock.have_any());
+
/* old_pk=row_log_table_get_pk() [not needed in INSERT] is a prefix
of the clustered index record (PRIMARY KEY,DB_TRX_ID,DB_ROLL_PTR),
with no information on virtual columns */
@@ -994,7 +924,6 @@ row_log_table_low(
return;
}
- ut_ad(page_is_comp(page_align(rec)));
ut_ad(rec_get_status(rec) == REC_STATUS_ORDINARY
|| rec_get_status(rec) == REC_STATUS_INSTANT);
@@ -1239,10 +1168,7 @@ row_log_table_get_pk(
ut_ad(dict_index_is_clust(index));
ut_ad(dict_index_is_online_ddl(index));
ut_ad(!offsets || rec_offs_validate(rec, index, offsets));
- ut_ad(rw_lock_own_flagged(
- &index->lock,
- RW_LOCK_FLAG_S | RW_LOCK_FLAG_X | RW_LOCK_FLAG_SX));
-
+ ut_ad(index->lock.have_any());
ut_ad(log);
ut_ad(log->table);
ut_ad(log->min_trx);
@@ -1280,7 +1206,7 @@ row_log_table_get_pk(
return(NULL);
}
- mutex_enter(&log->mutex);
+ mysql_mutex_lock(&log->mutex);
/* log->error is protected by log->mutex. */
if (log->error == DB_SUCCESS) {
@@ -1420,7 +1346,7 @@ err_exit:
}
func_exit:
- mutex_exit(&log->mutex);
+ mysql_mutex_unlock(&log->mutex);
return(tuple);
}
@@ -1440,83 +1366,6 @@ row_log_table_insert(
}
/******************************************************//**
-Notes that a BLOB is being freed during online ALTER TABLE. */
-void
-row_log_table_blob_free(
-/*====================*/
- dict_index_t* index, /*!< in/out: clustered index, X-latched */
- ulint page_no)/*!< in: starting page number of the BLOB */
-{
- ut_ad(dict_index_is_clust(index));
- ut_ad(dict_index_is_online_ddl(index));
- ut_ad(rw_lock_own_flagged(
- &index->lock,
- RW_LOCK_FLAG_X | RW_LOCK_FLAG_SX));
- ut_ad(page_no != FIL_NULL);
-
- if (index->online_log->error != DB_SUCCESS) {
- return;
- }
-
- page_no_map* blobs = index->online_log->blobs;
-
- if (blobs == NULL) {
- index->online_log->blobs = blobs = UT_NEW_NOKEY(page_no_map());
- }
-
-#ifdef UNIV_DEBUG
- const ulonglong log_pos = index->online_log->tail.total;
-#else
-# define log_pos /* empty */
-#endif /* UNIV_DEBUG */
-
- const page_no_map::value_type v(page_no,
- row_log_table_blob_t(log_pos));
-
- std::pair<page_no_map::iterator,bool> p = blobs->insert(v);
-
- if (!p.second) {
- /* Update the existing mapping. */
- ut_ad(p.first->first == page_no);
- p.first->second.blob_free(log_pos);
- }
-#undef log_pos
-}
-
-/******************************************************//**
-Notes that a BLOB is being allocated during online ALTER TABLE. */
-void
-row_log_table_blob_alloc(
-/*=====================*/
- dict_index_t* index, /*!< in/out: clustered index, X-latched */
- ulint page_no)/*!< in: starting page number of the BLOB */
-{
- ut_ad(dict_index_is_clust(index));
- ut_ad(dict_index_is_online_ddl(index));
-
- ut_ad(rw_lock_own_flagged(
- &index->lock,
- RW_LOCK_FLAG_X | RW_LOCK_FLAG_SX));
-
- ut_ad(page_no != FIL_NULL);
-
- if (index->online_log->error != DB_SUCCESS) {
- return;
- }
-
- /* Only track allocations if the same page has been freed
- earlier. Double allocation without a free is not allowed. */
- if (page_no_map* blobs = index->online_log->blobs) {
- page_no_map::iterator p = blobs->find(page_no);
-
- if (p != blobs->end()) {
- ut_ad(p->first == page_no);
- p->second.blob_alloc(index->online_log->tail.total);
- }
- }
-}
-
-/******************************************************//**
Converts a log record to a table row.
@return converted row, or NULL if the conversion fails */
static MY_ATTRIBUTE((nonnull, warn_unused_result))
@@ -1590,28 +1439,7 @@ row_log_table_apply_convert_mrec(
if (rec_offs_nth_extern(offsets, i)) {
ut_ad(rec_offs_any_extern(offsets));
- rw_lock_x_lock(dict_index_get_lock(index));
-
- if (const page_no_map* blobs = log->blobs) {
- data = rec_get_nth_field(
- mrec, offsets, i, &len);
- ut_ad(len >= BTR_EXTERN_FIELD_REF_SIZE);
-
- ulint page_no = mach_read_from_4(
- data + len - (BTR_EXTERN_FIELD_REF_SIZE
- - BTR_EXTERN_PAGE_NO));
- page_no_map::const_iterator p = blobs->find(
- page_no);
- if (p != blobs->end()
- && p->second.is_freed(log->head.total)) {
- /* This BLOB has been freed.
- We must not access the row. */
- *error = DB_MISSING_HISTORY;
- dfield_set_data(dfield, data, len);
- dfield_set_ext(dfield);
- goto blob_done;
- }
- }
+ index->lock.x_lock(SRW_LOCK_CALL);
data = btr_rec_copy_externally_stored_field(
mrec, offsets,
@@ -1619,8 +1447,8 @@ row_log_table_apply_convert_mrec(
i, &len, heap);
ut_a(data);
dfield_set_data(dfield, data, len);
-blob_done:
- rw_lock_x_unlock(dict_index_get_lock(index));
+
+ index->lock.x_unlock();
} else {
data = rec_get_nth_field(mrec, offsets, i, &len);
if (len == UNIV_SQL_DEFAULT) {
@@ -1667,6 +1495,12 @@ blob_done:
if ((new_col->prtype & DATA_NOT_NULL)
&& dfield_is_null(dfield)) {
+ if (!log->allow_not_null) {
+ /* We got a NULL value for a NOT NULL column. */
+ *error = DB_INVALID_NULL;
+ return NULL;
+ }
+
const dfield_t& default_field
= log->defaults->fields[col_no];
@@ -1676,12 +1510,6 @@ blob_done:
WARN_DATA_TRUNCATED, 1,
ulong(log->n_rows));
- if (!log->allow_not_null) {
- /* We got a NULL value for a NOT NULL column. */
- *error = DB_INVALID_NULL;
- return NULL;
- }
-
*dfield = default_field;
}
@@ -1755,7 +1583,7 @@ row_log_table_apply_insert_low(
entry = row_build_index_entry(row, NULL, index, heap);
error = row_ins_sec_index_entry_low(
- flags, BTR_MODIFY_TREE,
+ flags, BTR_INSERT_TREE,
index, offsets_heap, heap, entry,
thr_get_trx(thr)->id, thr);
@@ -1792,15 +1620,6 @@ row_log_table_apply_insert(
mrec, dup->index, offsets, log, heap, &error);
switch (error) {
- case DB_MISSING_HISTORY:
- ut_ad(log->blobs);
- /* Because some BLOBs are missing, we know that the
- transaction was rolled back later (a rollback of
- an insert can free BLOBs).
- We can simply skip the insert: the subsequent
- ROW_T_DELETE will be ignored, or a ROW_T_UPDATE will
- be interpreted as ROW_T_INSERT. */
- return(DB_SUCCESS);
case DB_SUCCESS:
ut_ad(row != NULL);
break;
@@ -1839,7 +1658,7 @@ row_log_table_apply_delete_low(
dberr_t error;
row_ext_t* ext;
dtuple_t* row;
- dict_index_t* index = btr_pcur_get_btr_cur(pcur)->index;
+ dict_index_t* index = pcur->index();
ut_ad(dict_index_is_clust(index));
@@ -1859,12 +1678,14 @@ row_log_table_apply_delete_low(
btr_cur_pessimistic_delete(&error, FALSE, btr_pcur_get_btr_cur(pcur),
BTR_CREATE_FLAG, false, mtr);
- mtr_commit(mtr);
-
if (error != DB_SUCCESS) {
- return(error);
+err_exit:
+ mtr->commit();
+ return error;
}
+ mtr->commit();
+
while ((index = dict_table_get_next_index(index)) != NULL) {
if (index->type & DICT_FTS) {
continue;
@@ -1874,9 +1695,12 @@ row_log_table_apply_delete_low(
row, ext, index, heap);
mtr->start();
index->set_modified(*mtr);
- btr_pcur_open(index, entry, PAGE_CUR_LE,
- BTR_MODIFY_TREE | BTR_LATCH_FOR_DELETE,
- pcur, mtr);
+ pcur->btr_cur.page_cur.index = index;
+ error = btr_pcur_open(entry, PAGE_CUR_LE, BTR_PURGE_TREE, pcur,
+ mtr);
+ if (error) {
+ goto err_exit;
+ }
#ifdef UNIV_DEBUG
switch (btr_pcur_get_btr_cur(pcur)->flag) {
case BTR_CUR_DELETE_REF:
@@ -1937,6 +1761,7 @@ row_log_table_apply_delete(
btr_pcur_t pcur;
rec_offs* offsets;
+ pcur.btr_cur.page_cur.index = index;
ut_ad(rec_offs_n_fields(moffsets) == index->first_user_field());
ut_ad(!rec_offs_any_extern(moffsets));
@@ -1955,9 +1780,11 @@ row_log_table_apply_delete(
mtr_start(&mtr);
index->set_modified(mtr);
- btr_pcur_open(index, old_pk, PAGE_CUR_LE,
- BTR_MODIFY_TREE | BTR_LATCH_FOR_DELETE,
- &pcur, &mtr);
+ dberr_t err = btr_pcur_open(old_pk, PAGE_CUR_LE, BTR_PURGE_TREE, &pcur,
+ &mtr);
+ if (err != DB_SUCCESS) {
+ goto all_done;
+ }
#ifdef UNIV_DEBUG
switch (btr_pcur_get_btr_cur(&pcur)->flag) {
case BTR_CUR_DELETE_REF:
@@ -1984,7 +1811,7 @@ all_done:
ROW_T_INSERT was skipped or
ROW_T_UPDATE was interpreted as ROW_T_DELETE
due to BLOBs having been freed by rollback. */
- return(DB_SUCCESS);
+ return err;
}
offsets = rec_get_offsets(btr_pcur_get_rec(&pcur), index, nullptr,
@@ -2066,6 +1893,8 @@ row_log_table_apply_update(
dberr_t error;
ulint n_index = 0;
+ pcur.btr_cur.page_cur.index = index;
+
ut_ad(dtuple_get_n_fields_cmp(old_pk)
== dict_index_get_n_unique(index));
ut_ad(dtuple_get_n_fields(old_pk) - (log->same_pk ? 0 : 2)
@@ -2075,20 +1904,6 @@ row_log_table_apply_update(
mrec, dup->index, offsets, log, heap, &error);
switch (error) {
- case DB_MISSING_HISTORY:
- /* The record contained BLOBs that are now missing. */
- ut_ad(log->blobs);
- /* Whether or not we are updating the PRIMARY KEY, we
- know that there should be a subsequent
- ROW_T_DELETE for rolling back a preceding ROW_T_INSERT,
- overriding this ROW_T_UPDATE record. (*1)
-
- This allows us to interpret this ROW_T_UPDATE
- as ROW_T_DELETE.
-
- When applying the subsequent ROW_T_DELETE, no matching
- record will be found. */
- /* fall through */
case DB_SUCCESS:
ut_ad(row != NULL);
break;
@@ -2100,10 +1915,25 @@ row_log_table_apply_update(
return(error);
}
- mtr_start(&mtr);
+ mtr.start();
index->set_modified(mtr);
- btr_pcur_open(index, old_pk, PAGE_CUR_LE,
- BTR_MODIFY_TREE, &pcur, &mtr);
+ error = btr_pcur_open(old_pk, PAGE_CUR_LE, BTR_MODIFY_TREE, &pcur,
+ &mtr);
+ if (error != DB_SUCCESS) {
+func_exit:
+ mtr.commit();
+func_exit_committed:
+ ut_ad(mtr.has_committed());
+ ut_free(pcur.old_rec_buf);
+
+ if (error != DB_SUCCESS) {
+ /* Report the erroneous row using the new
+ version of the table. */
+ innobase_row_to_mysql(dup->table, log->table, row);
+ }
+
+ return error;
+ }
#ifdef UNIV_DEBUG
switch (btr_pcur_get_btr_cur(&pcur)->flag) {
case BTR_CUR_DELETE_REF:
@@ -2118,80 +1948,16 @@ row_log_table_apply_update(
}
#endif /* UNIV_DEBUG */
- if (page_rec_is_infimum(btr_pcur_get_rec(&pcur))
- || btr_pcur_get_low_match(&pcur) < index->n_uniq) {
- /* The record was not found. This should only happen
- when an earlier ROW_T_INSERT or ROW_T_UPDATE was
- diverted because BLOBs were freed when the insert was
- later rolled back. */
-
- ut_ad(log->blobs);
-
- if (error == DB_SUCCESS) {
- /* An earlier ROW_T_INSERT could have been
- skipped because of a missing BLOB, like this:
-
- BEGIN;
- INSERT INTO t SET blob_col='blob value';
- UPDATE t SET blob_col='';
- ROLLBACK;
-
- This would generate the following records:
- ROW_T_INSERT (referring to 'blob value')
- ROW_T_UPDATE
- ROW_T_UPDATE (referring to 'blob value')
- ROW_T_DELETE
- [ROLLBACK removes the 'blob value']
-
- The ROW_T_INSERT would have been skipped
- because of a missing BLOB. Now we are
- executing the first ROW_T_UPDATE.
- The second ROW_T_UPDATE (for the ROLLBACK)
- would be interpreted as ROW_T_DELETE, because
- the BLOB would be missing.
-
- We could probably assume that the transaction
- has been rolled back and simply skip the
- 'insert' part of this ROW_T_UPDATE record.
- However, there might be some complex scenario
- that could interfere with such a shortcut.
- So, we will insert the row (and risk
- introducing a bogus duplicate key error
- for the ALTER TABLE), and a subsequent
- ROW_T_UPDATE or ROW_T_DELETE will delete it. */
- mtr_commit(&mtr);
- error = row_log_table_apply_insert_low(
- thr, row, offsets_heap, heap, dup);
- } else {
- /* Some BLOBs are missing, so we are interpreting
- this ROW_T_UPDATE as ROW_T_DELETE (see *1).
- Because the record was not found, we do nothing. */
- ut_ad(error == DB_MISSING_HISTORY);
- error = DB_SUCCESS;
-func_exit:
- mtr_commit(&mtr);
- }
-func_exit_committed:
- ut_ad(mtr.has_committed());
- ut_free(pcur.old_rec_buf);
-
- if (error != DB_SUCCESS) {
- /* Report the erroneous row using the new
- version of the table. */
- innobase_row_to_mysql(dup->table, log->table, row);
- }
-
- return(error);
- }
+ ut_ad(!page_rec_is_infimum(btr_pcur_get_rec(&pcur))
+ && btr_pcur_get_low_match(&pcur) >= index->n_uniq);
/* Prepare to update (or delete) the record. */
rec_offs* cur_offsets = rec_get_offsets(
btr_pcur_get_rec(&pcur), index, nullptr, index->n_core_fields,
ULINT_UNDEFINED, &offsets_heap);
+#ifdef UNIV_DEBUG
if (!log->same_pk) {
- /* Only update the record if DB_TRX_ID,DB_ROLL_PTR match what
- was buffered. */
ulint len;
const byte* rec_trx_id
= rec_get_nth_field(btr_pcur_get_rec(&pcur),
@@ -2206,59 +1972,17 @@ func_exit_committed:
+ static_cast<const char*>(old_pk_trx_id->data)
== old_pk_trx_id[1].data);
ut_d(trx_id_check(old_pk_trx_id->data, log->min_trx));
-
- if (memcmp(rec_trx_id, old_pk_trx_id->data,
- DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN)) {
- /* The ROW_T_UPDATE was logged for a different
- DB_TRX_ID,DB_ROLL_PTR. This is possible if an
- earlier ROW_T_INSERT or ROW_T_UPDATE was diverted
- because some BLOBs were missing due to rolling
- back the initial insert or due to purging
- the old BLOB values of an update. */
- ut_ad(log->blobs);
- if (error != DB_SUCCESS) {
- ut_ad(error == DB_MISSING_HISTORY);
- /* Some BLOBs are missing, so we are
- interpreting this ROW_T_UPDATE as
- ROW_T_DELETE (see *1).
- Because this is a different row,
- we will do nothing. */
- error = DB_SUCCESS;
- } else {
- /* Because the user record is missing due to
- BLOBs that were missing when processing
- an earlier log record, we should
- interpret the ROW_T_UPDATE as ROW_T_INSERT.
- However, there is a different user record
- with the same PRIMARY KEY value already. */
- error = DB_DUPLICATE_KEY;
- }
-
- goto func_exit;
- }
- }
-
- if (error != DB_SUCCESS) {
- ut_ad(error == DB_MISSING_HISTORY);
- ut_ad(log->blobs);
- /* Some BLOBs are missing, so we are interpreting
- this ROW_T_UPDATE as ROW_T_DELETE (see *1). */
- error = row_log_table_apply_delete_low(
- &pcur, cur_offsets, heap, &mtr);
- goto func_exit_committed;
+ ut_ad(!memcmp(rec_trx_id, old_pk_trx_id->data,
+ DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN));
}
+#endif
dtuple_t* entry = row_build_index_entry_low(
row, NULL, index, heap, ROW_BUILD_NORMAL);
upd_t* update = row_upd_build_difference_binary(
index, entry, btr_pcur_get_rec(&pcur), cur_offsets,
false, false, NULL, heap, dup->table, &error);
- if (error != DB_SUCCESS) {
- goto func_exit;
- }
-
- if (!update->n_fields) {
- /* Nothing to do. */
+ if (error != DB_SUCCESS || !update->n_fields) {
goto func_exit;
}
@@ -2339,7 +2063,7 @@ func_exit_committed:
for (n_index += index->type != DICT_CLUSTERED;
(index = dict_table_get_next_index(index)); n_index++) {
- if (index->type & DICT_FTS) {
+ if (!index->is_btree()) {
continue;
}
@@ -2356,7 +2080,7 @@ func_exit_committed:
dtuple_copy_v_fields(old_row, old_pk);
}
- mtr_commit(&mtr);
+ mtr.commit();
entry = row_build_index_entry(old_row, old_ext, index, heap);
if (!entry) {
@@ -2365,13 +2089,15 @@ func_exit_committed:
goto func_exit_committed;
}
- mtr_start(&mtr);
+ mtr.start();
index->set_modified(mtr);
+ pcur.btr_cur.page_cur.index = index;
ut_free(pcur.old_rec_buf);
+ pcur.old_rec_buf = nullptr;
if (ROW_FOUND != row_search_index_entry(
- index, entry, BTR_MODIFY_TREE, &pcur, &mtr)) {
+ entry, BTR_MODIFY_TREE, &pcur, &mtr)) {
ut_ad(0);
error = DB_CORRUPTION;
break;
@@ -2385,13 +2111,13 @@ func_exit_committed:
break;
}
- mtr_commit(&mtr);
+ mtr.commit();
entry = row_build_index_entry(row, NULL, index, heap);
error = row_ins_sec_index_entry_low(
BTR_CREATE_FLAG | BTR_NO_LOCKING_FLAG
| BTR_NO_UNDO_LOG_FLAG | BTR_KEEP_SYS_FLAG,
- BTR_MODIFY_TREE, index, offsets_heap, heap,
+ BTR_INSERT_TREE, index, offsets_heap, heap,
entry, thr_get_trx(thr)->id, thr);
/* Report correct index name for duplicate key error. */
@@ -2399,7 +2125,7 @@ func_exit_committed:
thr_get_trx(thr)->error_key_num = n_index;
}
- mtr_start(&mtr);
+ mtr.start();
index->set_modified(mtr);
}
@@ -2441,11 +2167,6 @@ row_log_table_apply_op(
*error = DB_SUCCESS;
- /* 3 = 1 (op type) + 1 (extra_size) + at least 1 byte payload */
- if (mrec + 3 >= mrec_end) {
- return(NULL);
- }
-
const bool is_instant = log->is_instant(dup->index);
const mrec_t* const mrec_start = mrec;
@@ -2717,7 +2438,8 @@ ulint
row_log_estimate_work(
const dict_index_t* index)
{
- if (index == NULL || index->online_log == NULL) {
+ if (index == NULL || index->online_log == NULL
+ || index->online_log_is_dummy()) {
return(0);
}
@@ -2775,7 +2497,7 @@ row_log_table_apply_ops(
ut_ad(dict_index_is_clust(index));
ut_ad(dict_index_is_online_ddl(index));
ut_ad(trx->mysql_thd);
- ut_ad(rw_lock_own(dict_index_get_lock(index), RW_LOCK_X));
+ ut_ad(index->lock.have_x());
ut_ad(!dict_index_is_online_ddl(new_index));
ut_ad(dict_col_get_clust_pos(
dict_table_get_sys_col(index->table, DATA_TRX_ID), index)
@@ -2795,7 +2517,7 @@ row_log_table_apply_ops(
next_block:
ut_ad(has_index_lock);
- ut_ad(rw_lock_own(dict_index_get_lock(index), RW_LOCK_X));
+ ut_ad(index->lock.have_u_or_x());
ut_ad(index->online_log->head.bytes == 0);
stage->inc(row_log_progress_inc_per_block());
@@ -2868,7 +2590,7 @@ all_done:
ut_ad(has_index_lock);
has_index_lock = false;
- rw_lock_x_unlock(dict_index_get_lock(index));
+ index->lock.x_unlock();
log_free_check();
@@ -2881,9 +2603,9 @@ all_done:
byte* buf = index->online_log->head.block;
- if (os_file_read_no_error_handling(
- IORequestRead, index->online_log->fd,
- buf, ofs, srv_sort_buf_size, 0) != DB_SUCCESS) {
+ if (DB_SUCCESS
+ != os_file_read(IORequestRead, index->online_log->fd,
+ buf, ofs, srv_sort_buf_size, nullptr)) {
ib::error()
<< "Unable to read temporary file"
" for table " << index->table->name;
@@ -3059,7 +2781,7 @@ all_done:
mrec = NULL;
process_next_block:
- rw_lock_x_lock(dict_index_get_lock(index));
+ index->lock.x_lock(SRW_LOCK_CALL);
has_index_lock = true;
index->online_log->head.bytes = 0;
@@ -3091,7 +2813,7 @@ interrupted:
error = DB_INTERRUPTED;
func_exit:
if (!has_index_lock) {
- rw_lock_x_lock(dict_index_get_lock(index));
+ index->lock.x_lock(SRW_LOCK_CALL);
}
mem_heap_free(offsets_heap);
@@ -3127,14 +2849,13 @@ row_log_table_apply(
stage->begin_phase_log_table();
- ut_ad(!rw_lock_own(&dict_sys.latch, RW_LOCK_S));
clust_index = dict_table_get_first_index(old_table);
if (clust_index->online_log->n_rows == 0) {
clust_index->online_log->n_rows = new_table->stat_n_rows;
}
- rw_lock_x_lock(dict_index_get_lock(clust_index));
+ clust_index->lock.x_lock(SRW_LOCK_CALL);
if (!clust_index->online_log) {
ut_ad(dict_index_get_online_status(clust_index)
@@ -3157,7 +2878,7 @@ row_log_table_apply(
== clust_index->online_log->tail.total);
}
- rw_lock_x_unlock(dict_index_get_lock(clust_index));
+ clust_index->lock.x_unlock();
DBUG_EXECUTE_IF("innodb_trx_duplicates",
thr_get_trx(thr)->duplicates = 0;);
@@ -3196,7 +2917,7 @@ row_log_allocate(
ut_ad(same_pk || table);
ut_ad(!table || col_map);
ut_ad(!defaults || col_map);
- ut_ad(rw_lock_own(dict_index_get_lock(index), RW_LOCK_X));
+ ut_ad(index->lock.have_u_or_x());
ut_ad(trx_state_eq(trx, TRX_STATE_ACTIVE));
ut_ad(trx->id);
@@ -3207,9 +2928,8 @@ row_log_allocate(
}
log->fd = OS_FILE_CLOSED;
- mutex_create(LATCH_ID_INDEX_ONLINE_LOG, &log->mutex);
+ mysql_mutex_init(index_online_log_key, &log->mutex, nullptr);
- log->blobs = NULL;
log->table = table;
log->same_pk = same_pk;
log->defaults = defaults;
@@ -3259,6 +2979,15 @@ row_log_allocate(
}
index->online_log = log;
+
+ if (!table) {
+ /* Assign the clustered index online log to table.
+ It can be used by concurrent DML to identify whether
+ the table has any online DDL */
+ index->table->indexes.start->online_log_make_dummy();
+ log->alter_trx = trx;
+ }
+
/* While we might be holding an exclusive data dictionary lock
here, in row_log_abort_sec() we will not always be holding it. Use
atomic operations in both cases. */
@@ -3276,7 +3005,6 @@ row_log_free(
{
MONITOR_ATOMIC_DEC(MONITOR_ONLINE_CREATE_INDEX);
- UT_DELETE(log->blobs);
UT_DELETE_ARRAY(log->non_core_fields);
row_log_block_free(log->tail);
row_log_block_free(log->head);
@@ -3290,7 +3018,7 @@ row_log_free(
my_large_free(log->crypt_tail, log->crypt_tail_size);
}
- mutex_free(&log->mutex);
+ mysql_mutex_destroy(&log->mutex);
ut_free(log);
}
@@ -3304,11 +3032,11 @@ row_log_get_max_trx(
dict_index_t* index) /*!< in: index, must be locked */
{
ut_ad(dict_index_get_online_status(index) == ONLINE_INDEX_CREATION);
-
- ut_ad((rw_lock_own(dict_index_get_lock(index), RW_LOCK_S)
- && mutex_own(&index->online_log->mutex))
- || rw_lock_own(dict_index_get_lock(index), RW_LOCK_X));
-
+#ifdef SAFE_MUTEX
+ ut_ad(index->lock.have_x()
+ || (index->lock.have_s()
+ && mysql_mutex_is_owner(&index->online_log->mutex)));
+#endif
return(index->online_log->max_trx);
}
@@ -3336,8 +3064,7 @@ row_log_apply_op_low(
ut_ad(!dict_index_is_clust(index));
- ut_ad(rw_lock_own(dict_index_get_lock(index), RW_LOCK_X)
- == has_index_lock);
+ ut_ad(index->lock.have_x() == has_index_lock);
ut_ad(!index->is_corrupted());
ut_ad(trx_id != 0 || op == ROW_OP_DELETE);
@@ -3350,18 +3077,22 @@ row_log_apply_op_low(
mtr_start(&mtr);
index->set_modified(mtr);
+ cursor.page_cur.index = index;
+ if (has_index_lock) {
+ mtr_x_lock_index(index, &mtr);
+ }
/* We perform the pessimistic variant of the operations if we
already hold index->lock exclusively. First, search the
record. The operation may already have been performed,
depending on when the row in the clustered index was
scanned. */
- btr_cur_search_to_nth_level(index, 0, entry, PAGE_CUR_LE,
- has_index_lock
- ? BTR_MODIFY_TREE
- : BTR_MODIFY_LEAF,
- &cursor, __FILE__, __LINE__,
- &mtr);
+ *error = cursor.search_leaf(entry, PAGE_CUR_LE, has_index_lock
+ ? BTR_MODIFY_TREE_ALREADY_LATCHED
+ : BTR_MODIFY_LEAF, &mtr);
+ if (UNIV_UNLIKELY(*error != DB_SUCCESS)) {
+ goto func_exit;
+ }
ut_ad(dict_index_get_n_unique(index) > 0);
/* This test is somewhat similar to row_ins_must_modify_rec(),
@@ -3395,9 +3126,10 @@ row_log_apply_op_low(
goto func_exit;
}
- if (btr_cur_optimistic_delete(
- &cursor, BTR_CREATE_FLAG, &mtr)) {
- *error = DB_SUCCESS;
+ *error = btr_cur_optimistic_delete(
+ &cursor, BTR_CREATE_FLAG, &mtr);
+
+ if (*error != DB_FAIL) {
break;
}
@@ -3407,11 +3139,12 @@ row_log_apply_op_low(
mtr_commit(&mtr);
mtr_start(&mtr);
index->set_modified(mtr);
- btr_cur_search_to_nth_level(
- index, 0, entry, PAGE_CUR_LE,
- BTR_MODIFY_TREE, &cursor,
- __FILE__, __LINE__, &mtr);
-
+ *error = cursor.search_leaf(entry, PAGE_CUR_LE,
+ BTR_MODIFY_TREE,
+ &mtr);
+ if (UNIV_UNLIKELY(*error != DB_SUCCESS)) {
+ goto func_exit;
+ }
/* No other thread than the current one
is allowed to modify the index tree.
Thus, the record should still exist. */
@@ -3510,10 +3243,12 @@ insert_the_rec:
mtr_commit(&mtr);
mtr_start(&mtr);
index->set_modified(mtr);
- btr_cur_search_to_nth_level(
- index, 0, entry, PAGE_CUR_LE,
- BTR_MODIFY_TREE, &cursor,
- __FILE__, __LINE__, &mtr);
+ *error = cursor.search_leaf(entry, PAGE_CUR_LE,
+ BTR_MODIFY_TREE,
+ &mtr);
+ if (*error != DB_SUCCESS) {
+ break;
+ }
}
/* We already determined that the
@@ -3579,8 +3314,7 @@ row_log_apply_op(
/* Online index creation is only used for secondary indexes. */
ut_ad(!dict_index_is_clust(index));
- ut_ad(rw_lock_own(dict_index_get_lock(index), RW_LOCK_X)
- == has_index_lock);
+ ut_ad(index->lock.have_x() == has_index_lock);
if (index->is_corrupted()) {
*error = DB_INDEX_CORRUPT;
@@ -3667,7 +3401,8 @@ interrupted)
@param[in,out] dup for reporting duplicate key errors
@param[in,out] stage performance schema accounting object, used by
ALTER TABLE. If not NULL, then stage->inc() will be called for each block
-of log that is applied.
+of log that is applied, or nullptr when the row log is applied by a
+DML thread.
@return DB_SUCCESS, or error code on failure */
static
dberr_t
@@ -3689,9 +3424,11 @@ row_log_apply_ops(
const ulint i = 1 + REC_OFFS_HEADER_SIZE
+ dict_index_get_n_fields(index);
- ut_ad(dict_index_is_online_ddl(index));
+ ut_ad(dict_index_is_online_ddl(index)
+ || (index->online_log
+ && index->online_status == ONLINE_INDEX_COMPLETE));
ut_ad(!index->is_committed());
- ut_ad(rw_lock_own(dict_index_get_lock(index), RW_LOCK_X));
+ ut_ad(index->lock.have_x());
ut_ad(index->online_log);
MEM_UNDEFINED(&mrec_end, sizeof mrec_end);
@@ -3706,10 +3443,12 @@ row_log_apply_ops(
next_block:
ut_ad(has_index_lock);
- ut_ad(rw_lock_own(dict_index_get_lock(index), RW_LOCK_X));
+ ut_ad(index->lock.have_x());
ut_ad(index->online_log->head.bytes == 0);
- stage->inc(row_log_progress_inc_per_block());
+ if (stage) {
+ stage->inc(row_log_progress_inc_per_block());
+ }
if (trx_is_interrupted(trx)) {
goto interrupted;
@@ -3763,6 +3502,8 @@ all_done:
ut_ad(has_index_lock);
ut_ad(index->online_log->head.blocks == 0);
ut_ad(index->online_log->tail.blocks == 0);
+ index->online_log->tail.bytes = 0;
+ index->online_log->head.bytes = 0;
error = DB_SUCCESS;
goto func_exit;
}
@@ -3772,7 +3513,7 @@ all_done:
* srv_sort_buf_size;
ut_ad(has_index_lock);
has_index_lock = false;
- rw_lock_x_unlock(dict_index_get_lock(index));
+ index->lock.x_unlock();
log_free_check();
@@ -3783,9 +3524,9 @@ all_done:
byte* buf = index->online_log->head.block;
- if (os_file_read_no_error_handling(
- IORequestRead, index->online_log->fd,
- buf, ofs, srv_sort_buf_size, 0) != DB_SUCCESS) {
+ if (DB_SUCCESS
+ != os_file_read(IORequestRead, index->online_log->fd,
+ buf, ofs, srv_sort_buf_size, nullptr)) {
ib::error()
<< "Unable to read temporary file"
" for index " << index->name;
@@ -3932,7 +3673,7 @@ all_done:
mrec = NULL;
process_next_block:
- rw_lock_x_lock(dict_index_get_lock(index));
+ index->lock.x_lock(SRW_LOCK_CALL);
has_index_lock = true;
index->online_log->head.bytes = 0;
@@ -3964,7 +3705,7 @@ interrupted:
error = DB_INTERRUPTED;
func_exit:
if (!has_index_lock) {
- rw_lock_x_lock(dict_index_get_lock(index));
+ index->lock.x_lock(SRW_LOCK_CALL);
}
switch (error) {
@@ -3978,9 +3719,6 @@ func_exit:
}
/* fall through */
default:
- /* We set the flag directly instead of invoking
- dict_set_corrupted_index_cache_only(index) here,
- because the index is not "public" yet. */
index->type |= DICT_CORRUPT;
}
@@ -3998,7 +3736,8 @@ interrupted)
@param[in,out] table MySQL table (for reporting duplicates)
@param[in,out] stage performance schema accounting object, used by
ALTER TABLE. stage->begin_phase_log_index() will be called initially and then
-stage->inc() will be called for each block of log that is applied.
+stage->inc() will be called for each block of log that is applied, or nullptr
+when the row log is applied by a DML thread.
@return DB_SUCCESS, or error code on failure */
dberr_t
row_log_apply(
@@ -4008,20 +3747,23 @@ row_log_apply(
ut_stage_alter_t* stage)
{
dberr_t error;
- row_log_t* log;
row_merge_dup_t dup = { index, table, NULL, 0 };
DBUG_ENTER("row_log_apply");
- ut_ad(dict_index_is_online_ddl(index));
+ ut_ad(dict_index_is_online_ddl(index)
+ || (index->online_log
+ && index->online_status == ONLINE_INDEX_COMPLETE));
ut_ad(!dict_index_is_clust(index));
- stage->begin_phase_log_index();
+ if (stage) {
+ stage->begin_phase_log_index();
+ }
log_free_check();
- rw_lock_x_lock(dict_index_get_lock(index));
+ index->lock.x_lock(SRW_LOCK_CALL);
- if (!dict_table_is_corrupted(index->table)) {
+ if (index->online_log && !index->table->corrupted) {
error = row_log_apply_ops(trx, index, &dup, stage);
} else {
error = DB_SUCCESS;
@@ -4029,23 +3771,18 @@ row_log_apply(
if (error != DB_SUCCESS) {
ut_ad(index->table->space);
- /* We set the flag directly instead of invoking
- dict_set_corrupted_index_cache_only(index) here,
- because the index is not "public" yet. */
index->type |= DICT_CORRUPT;
index->table->drop_aborted = TRUE;
dict_index_set_online_status(index, ONLINE_INDEX_ABORTED);
- } else {
+ } else if (stage) {
+ /* Mark the index as completed only when it is
+ being called by DDL thread */
ut_ad(dup.n_dup == 0);
dict_index_set_online_status(index, ONLINE_INDEX_COMPLETE);
}
- log = index->online_log;
- index->online_log = NULL;
- rw_lock_x_unlock(dict_index_get_lock(index));
-
- row_log_free(log);
+ index->lock.x_unlock();
DBUG_RETURN(error);
}
@@ -4055,3 +3792,338 @@ unsigned row_log_get_n_core_fields(const dict_index_t *index)
ut_ad(index->online_log);
return index->online_log->n_core_fields;
}
+
+dberr_t row_log_get_error(const dict_index_t *index) /* return the error recorded in the index's online row log */
+{
+ ut_ad(index->online_log); /* only meaningful while an online log is attached */
+ return index->online_log->error;
+}
+
+dberr_t dict_table_t::clear(que_thr_t *thr) /* clear every non-FTS index of the table; returns the last failure, if any */
+{
+ dberr_t err= DB_SUCCESS;
+ for (dict_index_t *index= UT_LIST_GET_FIRST(indexes); index;
+ index= UT_LIST_GET_NEXT(indexes, index))
+ {
+ if (index->type & DICT_FTS) /* fulltext indexes are skipped here */
+ continue;
+
+ switch (dict_index_get_online_status(index)) {
+ case ONLINE_INDEX_ABORTED:
+ case ONLINE_INDEX_ABORTED_DROPPED: /* aborted indexes have nothing to clear */
+ continue;
+ case ONLINE_INDEX_COMPLETE:
+ break;
+ case ONLINE_INDEX_CREATION: /* must not be invoked while index creation is in progress */
+ ut_ad("invalid type" == 0);
+ MY_ASSERT_UNREACHABLE();
+ break;
+ }
+ if (dberr_t err_index= index->clear(thr)) /* keep clearing remaining indexes on failure */
+ err= err_index; /* the most recent error wins */
+ }
+ return err;
+}
+
+const rec_t *
+UndorecApplier::get_old_rec(const dtuple_t &tuple, dict_index_t *index,
+ const rec_t **clust_rec, rec_offs **offsets) /* find the clustered-index record version written by this undo record */
+{
+ ut_ad(index->is_primary());
+ btr_pcur_t pcur;
+
+ bool found= row_search_on_row_ref(&pcur, BTR_MODIFY_LEAF,
+ index->table, &tuple, &mtr); /* position pcur on the row matching tuple */
+ ut_a(found); /* the row is expected to exist — the undo record refers to it */
+ *clust_rec= btr_pcur_get_rec(&pcur);
+
+ ulint len= 0;
+ rec_t *prev_version;
+ const rec_t *version= *clust_rec;
+ do /* walk the version chain backwards via DB_ROLL_PTR */
+ {
+ *offsets= rec_get_offsets(version, index, *offsets,
+ index->n_core_fields, ULINT_UNDEFINED,
+ &heap);
+ roll_ptr_t roll_ptr= trx_read_roll_ptr(
+ rec_get_nth_field(version, *offsets, index->db_roll_ptr(), &len));
+ ut_ad(len == DATA_ROLL_PTR_LEN);
+ if (is_same(roll_ptr)) /* found the version created by our undo record */
+ return version; /* *offsets describes this version on return */
+ trx_undo_prev_version_build(version, index, *offsets, heap, &prev_version,
+ nullptr, nullptr, 0); /* NOTE(review): return value ignored; prev_version is presumably null on failure — confirm */
+ version= prev_version;
+ }
+ while (version);
+
+ return nullptr; /* version chain exhausted without a match */
+}
+
+/** Clear out all online log of other online indexes after
+encountering the error during row_log_apply() in DML thread
+@param table table which does online DDL */
+static void row_log_mark_other_online_index_abort(dict_table_t *table)
+{
+ dict_index_t *clust_index= dict_table_get_first_index(table);
+ for (dict_index_t *index= dict_table_get_next_index(clust_index);
+ index; index= dict_table_get_next_index(index)) /* every secondary index */
+ {
+ if (index->online_log &&
+ index->online_status <= ONLINE_INDEX_CREATION &&
+ !index->is_corrupted()) /* only indexes still being built */
+ {
+ index->lock.x_lock(SRW_LOCK_CALL);
+ row_log_abort_sec(index); /* discard the secondary-index row log */
+ index->type|= DICT_CORRUPT; /* stop other threads from using it */
+ index->lock.x_unlock();
+ MONITOR_ATOMIC_INC(MONITOR_BACKGROUND_DROP_INDEX);
+ }
+ }
+
+ clust_index->lock.x_lock(SRW_LOCK_CALL);
+ clust_index->online_log= nullptr; /* detach the clustered-index log as well */
+ clust_index->lock.x_unlock();
+ table->drop_aborted= TRUE; /* aborted indexes will be dropped later */
+}
+
+void dtype_t::assign(const dict_col_t &col) /* copy the column's type attributes into this dtype */
+{
+ prtype= col.prtype; /* precise type (charset, NOT NULL, ...) */
+ mtype= col.mtype; /* main type */
+ len= col.len;
+ mbminlen= col.mbminlen; /* min/max bytes per character */
+ mbmaxlen= col.mbmaxlen;
+}
+
+inline void dtuple_t::copy_field_types(const dict_index_t &index) /* refresh each field's type from the index columns */
+{
+ ut_ad(index.n_fields == n_fields);
+ if (UNIV_LIKELY_NULL(index.change_col_info)) /* only needed when column info changed (e.g. collation) */
+ for (ulint i= 0; i < n_fields; i++)
+ fields[i].type.assign(*index.fields[i].col);
+}
+
+void UndorecApplier::log_insert(const dtuple_t &tuple,
+ dict_index_t *clust_index) /* replay an insert undo record into the online-DDL row logs */
+{
+ DEBUG_SYNC_C("row_log_insert_handle");
+ ut_ad(clust_index->is_primary());
+ rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
+ rec_offs *offsets= offsets_;
+
+ rec_offs_init(offsets_);
+ mtr.start();
+ const rec_t *rec;
+ const rec_t *match_rec= get_old_rec(tuple, clust_index, &rec, &offsets);
+ if (!match_rec) /* version written by this undo record no longer reachable */
+ {
+ mtr.commit();
+ return;
+ }
+ const rec_t *copy_rec= match_rec;
+ if (match_rec == rec) /* current record: copy before releasing the page latch */
+ {
+ copy_rec= rec_copy(mem_heap_alloc(
+ heap, rec_offs_size(offsets)), match_rec, offsets);
+ rec_offs_make_valid(copy_rec, clust_index, true, offsets);
+ }
+ mtr.commit();
+
+ dict_table_t *table= clust_index->table;
+ clust_index->lock.s_lock(SRW_LOCK_CALL);
+ if (clust_index->online_log &&
+ !clust_index->online_log_is_dummy() &&
+ clust_index->online_status <= ONLINE_INDEX_CREATION) /* table rebuild in progress */
+ {
+ row_log_table_insert(copy_rec, clust_index, offsets);
+ clust_index->lock.s_unlock();
+ }
+ else /* index creation only: log into each online secondary index */
+ {
+ clust_index->lock.s_unlock();
+ row_ext_t *ext;
+ dtuple_t *row= row_build(ROW_COPY_POINTERS, clust_index,
+ copy_rec, offsets, table, nullptr, nullptr, &ext, heap);
+
+ if (table->n_v_cols)
+ {
+ /* Update the row with virtual column values present
+ in the undo log or update vector */
+ if (type == TRX_UNDO_UPD_DEL_REC)
+ row_upd_replace_vcol(row, table, update, false,
+ nullptr,
+ (cmpl_info & UPD_NODE_NO_ORD_CHANGE)
+ ? nullptr : undo_rec);
+ else
+ trx_undo_read_v_cols(table, undo_rec, row, false);
+ }
+
+ bool success= true;
+ for (dict_index_t *index= clust_index;
+ (index= dict_table_get_next_index(index)) != nullptr; ) /* all secondary indexes */
+ {
+ index->lock.s_lock(SRW_LOCK_CALL);
+ if (index->online_log &&
+ index->online_status <= ONLINE_INDEX_CREATION &&
+ !index->is_corrupted())
+ {
+ dtuple_t *entry= row_build_index_entry_low(row, ext, index,
+ heap, ROW_BUILD_NORMAL);
+ entry->copy_field_types(*index); /* pick up changed column types (e.g. collation) */
+ success= row_log_online_op(index, entry, trx_id);
+ }
+
+ index->lock.s_unlock();
+ if (!success) /* log is out of space or corrupted: abort all other online indexes */
+ {
+ row_log_mark_other_online_index_abort(index->table);
+ return;
+ }
+ }
+ }
+}
+
+void UndorecApplier::log_update(const dtuple_t &tuple,
+ dict_index_t *clust_index) /* replay an update/delete-mark undo record into the online-DDL row logs */
+{
+ rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
+ rec_offs offsets2_[REC_OFFS_NORMAL_SIZE];
+ rec_offs *offsets= offsets_;
+ rec_offs *prev_offsets= offsets2_;
+
+ rec_offs_init(offsets_);
+ rec_offs_init(offsets2_);
+
+ dict_table_t *table= clust_index->table;
+
+ clust_index->lock.s_lock(SRW_LOCK_CALL);
+ bool table_rebuild=
+ (clust_index->online_log
+ && !clust_index->online_log_is_dummy()
+ && clust_index->online_status <= ONLINE_INDEX_CREATION); /* snapshot; rechecked under the latch below */
+ clust_index->lock.s_unlock();
+
+ mtr.start();
+ const rec_t *rec;
+ rec_t *prev_version;
+ bool is_update= (type == TRX_UNDO_UPD_EXIST_REC);
+ const rec_t *match_rec= get_old_rec(tuple, clust_index, &rec, &offsets);
+ if (!match_rec) /* version written by this undo record no longer reachable */
+ {
+ mtr.commit();
+ return;
+ }
+
+ if (table_rebuild) /* log through the clustered-index (rebuild) log */
+ {
+ const rec_t *copy_rec= match_rec;
+ if (match_rec == rec) /* copy before releasing the page latch */
+ copy_rec= rec_copy(mem_heap_alloc(
+ heap, rec_offs_size(offsets)), match_rec, offsets);
+ trx_undo_prev_version_build(match_rec, clust_index, offsets, heap,
+ &prev_version, nullptr, nullptr, 0); /* build the pre-image of the change */
+
+ prev_offsets= rec_get_offsets(prev_version, clust_index, prev_offsets,
+ clust_index->n_core_fields,
+ ULINT_UNDEFINED, &heap);
+ rec_offs_make_valid(copy_rec, clust_index, true, offsets);
+ mtr.commit();
+
+ clust_index->lock.s_lock(SRW_LOCK_CALL);
+ /* Recheck whether clustered index online log has been cleared */
+ if (clust_index->online_log)
+ {
+ if (is_update)
+ {
+ const dtuple_t *rebuilt_old_pk= row_log_table_get_pk(
+ prev_version, clust_index, prev_offsets, nullptr, &heap);
+ row_log_table_update(copy_rec, clust_index, offsets, rebuilt_old_pk);
+ }
+ else /* delete-mark: log removal of the previous version */
+ row_log_table_delete(prev_version, clust_index, prev_offsets, nullptr);
+ }
+ clust_index->lock.s_unlock();
+ return;
+ }
+
+ dtuple_t *row= nullptr;
+ row_ext_t *new_ext;
+ if (match_rec != rec) /* match_rec is heap-resident: pointers stay valid after mtr.commit() */
+ row= row_build(ROW_COPY_POINTERS, clust_index, match_rec, offsets,
+ clust_index->table, NULL, NULL, &new_ext, heap);
+ else /* rec lives on the page: must copy the data out */
+ row= row_build(ROW_COPY_DATA, clust_index, rec, offsets,
+ clust_index->table, NULL, NULL, &new_ext, heap);
+ mtr.commit();
+ row_ext_t *old_ext;
+ dtuple_t *old_row= nullptr;
+ if (!(this->cmpl_info & UPD_NODE_NO_ORD_CHANGE))
+ {
+ for (ulint i = 0; i < dict_table_get_n_v_cols(table); i++) /* mark virtual columns unread */
+ dfield_get_type(
+ dtuple_get_nth_v_field(row, i))->mtype = DATA_MISSING;
+ }
+
+ if (is_update)
+ {
+ old_row= dtuple_copy(row, heap); /* pre-image = post-image with the update vector applied in reverse */
+ row_upd_replace(old_row, &old_ext, clust_index, update, heap);
+ }
+
+ if (table->n_v_cols)
+ row_upd_replace_vcol(row, table, update, false, nullptr,
+ (cmpl_info & UPD_NODE_NO_ORD_CHANGE)
+ ? nullptr : this->undo_rec);
+
+ bool success= true;
+ dict_index_t *index= dict_table_get_next_index(clust_index);
+ while (index) /* log the change into each online secondary index */
+ {
+ index->lock.s_lock(SRW_LOCK_CALL);
+ if (index->online_log &&
+ index->online_status <= ONLINE_INDEX_CREATION &&
+ !index->is_corrupted())
+ {
+ if (is_update)
+ {
+ /* Ignore the index if the update doesn't affect the index */
+ if (!row_upd_changes_ord_field_binary(index, update,
+ nullptr,
+ row, new_ext))
+ goto next_index;
+ dtuple_t *old_entry= row_build_index_entry_low(
+ old_row, old_ext, index, heap, ROW_BUILD_NORMAL);
+
+ old_entry->copy_field_types(*index);
+
+ success= row_log_online_op(index, old_entry, 0); /* trx_id 0 logs a delete */
+
+ dtuple_t *new_entry= row_build_index_entry_low(
+ row, new_ext, index, heap, ROW_BUILD_NORMAL);
+
+ new_entry->copy_field_types(*index);
+
+ if (success)
+ success= row_log_online_op(index, new_entry, trx_id); /* nonzero trx_id logs an insert */
+ }
+ else /* delete-mark: log only the removal */
+ {
+ dtuple_t *old_entry= row_build_index_entry_low(
+ row, new_ext, index, heap, ROW_BUILD_NORMAL);
+
+ old_entry->copy_field_types(*index);
+
+ success= row_log_online_op(index, old_entry, 0);
+ }
+ }
+next_index:
+ index->lock.s_unlock();
+ if (!success) /* log out of space or corrupted: abort all other online indexes */
+ {
+ row_log_mark_other_online_index_abort(index->table);
+ return;
+ }
+ index= dict_table_get_next_index(index);
+ }
+}
+
diff --git a/storage/innobase/row/row0merge.cc b/storage/innobase/row/row0merge.cc
index 80d51754d5c..70b51fbb812 100644
--- a/storage/innobase/row/row0merge.cc
+++ b/storage/innobase/row/row0merge.cc
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 2005, 2017, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2014, 2022, MariaDB Corporation.
+Copyright (c) 2014, 2023, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -51,6 +51,7 @@ Completed by Sunny Bains and Marko Makela
#endif /* BTR_CUR_ADAPT */
#include "ut0stage.h"
#include "fil0crypt.h"
+#include "srv0mon.h"
/* Ignore posix_fadvise() on those platforms where it does not exist */
#if defined _WIN32
@@ -129,7 +130,10 @@ public:
if (log_sys.check_flush_or_checkpoint()) {
if (mtr_started) {
- btr_pcur_move_to_prev_on_page(pcur);
+ if (!btr_pcur_move_to_prev_on_page(pcur)) {
+ error = DB_CORRUPTION;
+ break;
+ }
btr_pcur_store_position(pcur, scan_mtr);
scan_mtr->commit();
mtr_started = false;
@@ -141,20 +145,17 @@ public:
mtr.start();
index->set_modified(mtr);
- ins_cur.index = index;
+ ins_cur.page_cur.index = index;
rtr_init_rtr_info(&rtr_info, false, &ins_cur, index,
false);
rtr_info_update_btr(&ins_cur, &rtr_info);
- btr_cur_search_to_nth_level(index, 0, dtuple,
- PAGE_CUR_RTREE_INSERT,
- BTR_MODIFY_LEAF, &ins_cur,
- __FILE__, __LINE__,
- &mtr);
+ error = rtr_insert_leaf(&ins_cur, dtuple,
+ BTR_MODIFY_LEAF, &mtr);
/* It need to update MBR in parent entry,
so change search mode to BTR_MODIFY_TREE */
- if (rtr_info.mbr_adj) {
+ if (error == DB_SUCCESS && rtr_info.mbr_adj) {
mtr.commit();
rtr_clean_rtr_info(&rtr_info, true);
rtr_init_rtr_info(&rtr_info, false, &ins_cur,
@@ -162,19 +163,20 @@ public:
rtr_info_update_btr(&ins_cur, &rtr_info);
mtr.start();
index->set_modified(mtr);
- btr_cur_search_to_nth_level(
- index, 0, dtuple,
- PAGE_CUR_RTREE_INSERT,
- BTR_MODIFY_TREE, &ins_cur,
- __FILE__, __LINE__, &mtr);
+ error = rtr_insert_leaf(&ins_cur, dtuple,
+ BTR_MODIFY_TREE, &mtr);
+ }
+
+ if (error == DB_SUCCESS) {
+ error = btr_cur_optimistic_insert(
+ flag, &ins_cur, &ins_offsets,
+ &heap, dtuple, &rec, &big_rec,
+ 0, NULL, &mtr);
}
- error = btr_cur_optimistic_insert(
- flag, &ins_cur, &ins_offsets, &heap,
- dtuple, &rec, &big_rec, 0, NULL, &mtr);
+ ut_ad(!big_rec);
if (error == DB_FAIL) {
- ut_ad(!big_rec);
mtr.commit();
mtr.start();
index->set_modified(mtr);
@@ -184,18 +186,19 @@ public:
&ins_cur, index, false);
rtr_info_update_btr(&ins_cur, &rtr_info);
- btr_cur_search_to_nth_level(
- index, 0, dtuple,
- PAGE_CUR_RTREE_INSERT,
- BTR_MODIFY_TREE,
- &ins_cur, __FILE__, __LINE__, &mtr);
+ error = rtr_insert_leaf(&ins_cur, dtuple,
+ BTR_MODIFY_TREE, &mtr);
- error = btr_cur_pessimistic_insert(
+ if (error == DB_SUCCESS) {
+ error = btr_cur_pessimistic_insert(
flag, &ins_cur, &ins_offsets,
&heap, dtuple, &rec,
&big_rec, 0, NULL, &mtr);
+ }
}
+ ut_ad(!big_rec);
+
DBUG_EXECUTE_IF(
"row_merge_ins_spatial_fail",
error = DB_FAIL;
@@ -471,6 +474,7 @@ row_merge_buf_redundant_convert(
@param[in,out] v_heap heap memory to process data for virtual column
@param[in,out] my_table mysql table object
@param[in] trx transaction object
+@param[in] col_collate columns whose collations changed, or nullptr
@return number of rows added, 0 if out of space */
static
ulint
@@ -488,7 +492,8 @@ row_merge_buf_add(
dberr_t* err,
mem_heap_t** v_heap,
TABLE* my_table,
- trx_t* trx)
+ trx_t* trx,
+ const col_collations* col_collate)
{
ulint i;
const dict_index_t* index;
@@ -502,6 +507,7 @@ row_merge_buf_add(
doc_id_t write_doc_id;
ulint n_row_added = 0;
VCOL_STORAGE vcol_storage;
+
DBUG_ENTER("row_merge_buf_add");
if (buf->n_tuples >= buf->max_tuples) {
@@ -593,8 +599,17 @@ error:
row_field = dtuple_get_nth_field(row,
col->ind);
dfield_copy(field, row_field);
- }
+ /* Copy the column collation to the
+ tuple field */
+ if (col_collate) {
+ auto it = col_collate->find(col->ind);
+ if (it != col_collate->end()) {
+ field->type
+ .assign(*it->second);
+ }
+ }
+ }
/* Tokenize and process data for FTS */
if (!history_fts && (index->type & DICT_FTS)) {
@@ -648,7 +663,7 @@ error:
*doc_id % fts_sort_pll_degree);
/* Add doc item to fts_doc_list */
- mutex_enter(&psort_info[bucket].mutex);
+ mysql_mutex_lock(&psort_info[bucket].mutex);
if (psort_info[bucket].error == DB_SUCCESS) {
UT_LIST_ADD_LAST(
@@ -660,13 +675,14 @@ error:
ut_free(doc_item);
}
- mutex_exit(&psort_info[bucket].mutex);
+ mysql_mutex_unlock(&psort_info[bucket].mutex);
/* Sleep when memory used exceeds limit*/
while (psort_info[bucket].memory_used
> FTS_PENDING_DOC_MEMORY_LIMIT
&& trial_count++ < max_trial_count) {
- os_thread_sleep(1000);
+ std::this_thread::sleep_for(
+ std::chrono::milliseconds(1));
}
n_row_added = 1;
@@ -842,7 +858,7 @@ row_merge_dup_report(
row_merge_dup_t* dup, /*!< in/out: for reporting duplicates */
const dfield_t* entry) /*!< in: duplicate index entry */
{
- if (!dup->n_dup++) {
+ if (!dup->n_dup++ && dup->table) {
/* Only report the first duplicate record,
but count all duplicate records. */
innobase_fields_to_mysql(dup->table, dup->index, entry);
@@ -1068,11 +1084,11 @@ row_merge_read(
DBUG_LOG("ib_merge_sort", "fd=" << fd << " ofs=" << ofs);
DBUG_EXECUTE_IF("row_merge_read_failure", DBUG_RETURN(FALSE););
- const bool success = DB_SUCCESS == os_file_read_no_error_handling(
- IORequestRead, fd, buf, ofs, srv_sort_buf_size, 0);
+ const dberr_t err = os_file_read(
+ IORequestRead, fd, buf, ofs, srv_sort_buf_size, nullptr);
/* If encryption is enabled decrypt buffer */
- if (success && log_tmp_is_encrypted()) {
+ if (err == DB_SUCCESS && srv_encrypt_log) {
if (!log_tmp_block_decrypt(buf, srv_sort_buf_size,
crypt_buf, ofs)) {
DBUG_RETURN(false);
@@ -1087,11 +1103,7 @@ row_merge_read(
posix_fadvise(fd, ofs, srv_sort_buf_size, POSIX_FADV_DONTNEED);
#endif /* POSIX_FADV_DONTNEED */
- if (!success) {
- ib::error() << "Failed to read merge block at " << ofs;
- }
-
- DBUG_RETURN(success);
+ DBUG_RETURN(err == DB_SUCCESS);
}
/********************************************************************//**
@@ -1099,10 +1111,8 @@ Write a merge block to the file system.
@return whether the request was completed successfully
@retval false on error
@retval true on success */
-UNIV_INTERN
bool
row_merge_write(
-/*============*/
const pfs_os_file_t& fd, /*!< in: file descriptor */
ulint offset, /*!< in: offset where to write,
in number of row_merge_block_t elements */
@@ -1638,6 +1648,7 @@ stage->inc() will be called for each page read.
@param[in] eval_table mysql table used to evaluate virtual column
value, see innobase_get_computed_value().
@param[in] allow_not_null allow null to not-null conversion
+@param[in] col_collate columns whose collations changed, or nullptr
@return DB_SUCCESS or error */
static MY_ATTRIBUTE((warn_unused_result))
dberr_t
@@ -1665,7 +1676,8 @@ row_merge_read_clustered_index(
double pct_cost,
row_merge_block_t* crypt_block,
struct TABLE* eval_table,
- bool allow_not_null)
+ bool allow_not_null,
+ const col_collations* col_collate)
{
dict_index_t* clust_index; /* Clustered index */
mem_heap_t* row_heap = NULL;/* Heap memory to create
@@ -1685,10 +1697,8 @@ row_merge_read_clustered_index(
doc_id_t doc_id = 0;
doc_id_t max_doc_id = 0;
ibool add_doc_id = FALSE;
- os_event_t fts_parallel_sort_event = NULL;
- ibool fts_pll_sort = FALSE;
- int64_t sig_count = 0;
- spatial_index_info** sp_tuples = NULL;
+ pthread_cond_t* fts_parallel_sort_cond = nullptr;
+ spatial_index_info** sp_tuples = nullptr;
ulint num_spatial = 0;
BtrBulk* clust_btr_bulk = NULL;
bool clust_temp_file = false;
@@ -1728,7 +1738,7 @@ row_merge_read_clustered_index(
ut_malloc_nokey(n_index * sizeof *merge_buf));
row_merge_dup_t clust_dup = {index[0], table, col_map, 0};
- dfield_t* prev_fields;
+ dfield_t* prev_fields = nullptr;
const ulint n_uniq = dict_index_get_n_unique(index[0]);
ut_ad(trx->mysql_thd != NULL);
@@ -1762,10 +1772,9 @@ row_merge_read_clustered_index(
ut_ad(doc_id > 0);
}
- fts_pll_sort = TRUE;
row_fts_start_psort(psort_info);
- fts_parallel_sort_event =
- psort_info[0].psort_common->sort_event;
+ fts_parallel_sort_cond =
+ &psort_info[0].psort_common->sort_cond;
} else {
if (dict_index_is_spatial(index[i])) {
num_spatial++;
@@ -1811,17 +1820,60 @@ row_merge_read_clustered_index(
== (DATA_ROLL_PTR | DATA_NOT_NULL));
const ulint new_trx_id_col = col_map
? col_map[old_trx_id_col] : old_trx_id_col;
+ uint64_t n_rows = 0;
- btr_pcur_open_at_index_side(
- true, clust_index, BTR_SEARCH_LEAF, &pcur, true, 0, &mtr);
- mtr_started = true;
- btr_pcur_move_to_next_user_rec(&pcur, &mtr);
- if (rec_is_metadata(btr_pcur_get_rec(&pcur), *clust_index)) {
- ut_ad(btr_pcur_is_on_user_rec(&pcur));
- /* Skip the metadata pseudo-record. */
+ err = pcur.open_leaf(true, clust_index, BTR_SEARCH_LEAF, &mtr);
+ if (err != DB_SUCCESS) {
+err_exit:
+ trx->error_key_num = 0;
+ goto func_exit;
} else {
- ut_ad(!clust_index->is_instant());
- btr_pcur_move_to_prev_on_page(&pcur);
+ rec_t* rec = page_rec_get_next(btr_pcur_get_rec(&pcur));
+ if (!rec) {
+corrupted_metadata:
+ err = DB_CORRUPTION;
+ goto err_exit;
+ }
+ if (rec_get_info_bits(rec, page_rec_is_comp(rec))
+ & REC_INFO_MIN_REC_FLAG) {
+ if (!clust_index->is_instant()) {
+ goto corrupted_metadata;
+ }
+ if (page_rec_is_comp(rec)
+ && rec_get_status(rec) != REC_STATUS_INSTANT) {
+ goto corrupted_metadata;
+ }
+ /* Skip the metadata pseudo-record. */
+ btr_pcur_get_page_cur(&pcur)->rec = rec;
+ } else if (clust_index->is_instant()) {
+ goto corrupted_metadata;
+ }
+ }
+
+ /* Check if the table is supposed to be empty for our read view.
+
+ If we read bulk_trx_id as an older transaction ID, it is not
+ incorrect to check here whether that transaction should be
+ visible to us. If bulk_trx_id is not visible to us, the table
+ must have been empty at an earlier point of time, also in our
+ read view.
+
+ An INSERT would only update bulk_trx_id in
+ row_ins_clust_index_entry_low() if the table really was empty
+ (everything had been purged), when holding a leaf page latch
+ in the clustered index (actually, the root page is the only
+ leaf page in that case).
+
+ We are holding a clustered index leaf page latch here.
+ That will obviously prevent any concurrent INSERT from
+ updating bulk_trx_id while we read it. */
+ if (!online) {
+ } else if (trx_id_t bulk_trx_id = old_table->bulk_trx_id) {
+ ut_ad(trx->read_view.is_open());
+ ut_ad(bulk_trx_id != trx->id);
+ if (!trx->read_view.changes_visible(bulk_trx_id)) {
+ goto func_exit;
+ }
}
if (old_table != new_table) {
@@ -1870,21 +1922,17 @@ row_merge_read_clustered_index(
prev_fields = static_cast<dfield_t*>(
ut_malloc_nokey(n_uniq * sizeof *prev_fields));
mtuple_heap = mem_heap_create(sizeof(mrec_buf_t));
- } else {
- prev_fields = NULL;
}
mach_write_to_8(new_sys_trx_start, trx->id);
mach_write_to_8(new_sys_trx_end, TRX_ID_MAX);
- uint64_t n_rows = 0;
/* Scan the clustered index. */
for (;;) {
/* Do not continue if table pages are still encrypted */
if (!old_table->is_readable() || !new_table->is_readable()) {
err = DB_DECRYPTION_FAILED;
- trx->error_key_num = 0;
- goto func_exit;
+ goto err_exit;
}
const rec_t* rec;
@@ -1895,25 +1943,27 @@ row_merge_read_clustered_index(
page_cur_t* cur = btr_pcur_get_page_cur(&pcur);
bool history_row, history_fts = false;
- page_cur_move_to_next(cur);
-
stage->n_pk_recs_inc();
+ if (!page_cur_move_to_next(cur)) {
+corrupted_rec:
+ err = DB_CORRUPTION;
+ goto err_exit;
+ }
+
if (page_cur_is_after_last(cur)) {
stage->inc();
if (UNIV_UNLIKELY(trx_is_interrupted(trx))) {
err = DB_INTERRUPTED;
- trx->error_key_num = 0;
- goto func_exit;
+ goto err_exit;
}
if (online && old_table != new_table) {
err = row_log_table_get_error(clust_index);
if (err != DB_SUCCESS) {
- trx->error_key_num = 0;
- goto func_exit;
+ goto err_exit;
}
}
@@ -1932,7 +1982,7 @@ row_merge_read_clustered_index(
goto scan_next;
}
- if (clust_index->lock.waiters) {
+ if (clust_index->lock.is_waiting()) {
/* There are waiters on the clustered
index tree lock, likely the purge
thread. Store and restore the cursor
@@ -1942,20 +1992,23 @@ row_merge_read_clustered_index(
/* Store the cursor position on the last user
record on the page. */
- btr_pcur_move_to_prev_on_page(&pcur);
+ if (!btr_pcur_move_to_prev_on_page(&pcur)) {
+ goto corrupted_index;
+ }
/* Leaf pages must never be empty, unless
this is the only page in the index tree. */
- ut_ad(btr_pcur_is_on_user_rec(&pcur)
- || btr_pcur_get_block(
- &pcur)->page.id().page_no()
- == clust_index->page);
+ if (!btr_pcur_is_on_user_rec(&pcur)
+ && btr_pcur_get_block(&pcur)->page.id()
+ .page_no() != clust_index->page) {
+ goto corrupted_index;
+ }
btr_pcur_store_position(&pcur, &mtr);
mtr.commit();
mtr_started = false;
/* Give the waiters a chance to proceed. */
- os_thread_yield();
+ std::this_thread::yield();
scan_next:
ut_ad(!mtr_started);
ut_ad(!mtr.is_active());
@@ -1964,8 +2017,13 @@ scan_next:
/* Restore position on the record, or its
predecessor if the record was purged
meanwhile. */
- btr_pcur_restore_position(
- BTR_SEARCH_LEAF, &pcur, &mtr);
+ if (pcur.restore_position(BTR_SEARCH_LEAF,
+ &mtr)
+ == btr_pcur_t::CORRUPTED) {
+corrupted_index:
+ err = DB_CORRUPTION;
+ goto func_exit;
+ }
/* Move to the successor of the
original record. */
if (!btr_pcur_move_to_next_user_rec(
@@ -1988,16 +2046,24 @@ end_of_index:
goto end_of_index;
}
- buf_block_t* block = btr_block_get(
- *clust_index, next_page_no,
- RW_S_LATCH, false, &mtr);
+ buf_block_t* block = buf_page_get_gen(
+ page_id_t(old_table->space->id,
+ next_page_no),
+ old_table->space->zip_size(),
+ RW_S_LATCH, nullptr, BUF_GET, &mtr,
+ &err, false);
+ if (!block) {
+ goto err_exit;
+ }
- btr_leaf_page_release(page_cur_get_block(cur),
- BTR_SEARCH_LEAF, &mtr);
page_cur_set_before_first(block, cur);
- page_cur_move_to_next(cur);
+ if (!page_cur_move_to_next(cur)
+ || page_cur_is_after_last(cur)) {
+ goto corrupted_rec;
+ }
- ut_ad(!page_cur_is_after_last(cur));
+ const auto s = mtr.get_savepoint();
+ mtr.rollback_to_savepoint(s - 2, s - 1);
}
} else {
mem_heap_empty(row_heap);
@@ -2034,8 +2100,14 @@ end_of_index:
ut_ad(trx->read_view.is_open());
ut_ad(rec_trx_id != trx->id);
- if (!trx->read_view.changes_visible(
- rec_trx_id, old_table->name)) {
+ if (!trx->read_view.changes_visible(rec_trx_id)) {
+ if (rec_trx_id
+ >= trx->read_view.low_limit_id()
+ && rec_trx_id
+ >= trx_sys.get_max_trx_id()) {
+ goto corrupted_rec;
+ }
+
rec_t* old_vers;
row_vers_build_for_consistent_read(
@@ -2151,8 +2223,7 @@ end_of_index:
if (!allow_not_null) {
err = DB_INVALID_NULL;
- trx->error_key_num = 0;
- goto func_exit;
+ goto err_exit;
}
const dfield_t& default_field
@@ -2226,13 +2297,10 @@ end_of_index:
byte* b = static_cast<byte*>(dfield_get_data(dfield));
if (sequence.eof()) {
- err = DB_ERROR;
- trx->error_key_num = 0;
-
ib_errf(trx->mysql_thd, IB_LOG_LEVEL_ERROR,
ER_AUTOINC_READ_FAILED, "[NULL]");
-
- goto func_exit;
+ err = DB_ERROR;
+ goto err_exit;
}
ulonglong value = sequence++;
@@ -2324,7 +2392,8 @@ write_buffers:
buf, fts_index, old_table, new_table,
psort_info, row, ext, history_fts,
&doc_id, conv_heap, &err,
- &v_heap, eval_table, trx)))) {
+ &v_heap, eval_table, trx,
+ col_collate)))) {
/* If we are creating FTS index,
a single row can generate more
@@ -2445,8 +2514,10 @@ write_buffers:
we must reread it on the next
loop iteration. */
if (mtr_started) {
- btr_pcur_move_to_prev_on_page(
- &pcur);
+ if (!btr_pcur_move_to_prev_on_page(&pcur)) {
+ err = DB_CORRUPTION;
+ goto func_exit;
+ }
btr_pcur_store_position(
&pcur, &mtr);
@@ -2508,9 +2579,11 @@ write_buffers:
overflow). */
mtr.start();
mtr_started = true;
- btr_pcur_restore_position(
- BTR_SEARCH_LEAF, &pcur,
- &mtr);
+ if (pcur.restore_position(
+ BTR_SEARCH_LEAF, &mtr)
+ == btr_pcur_t::CORRUPTED) {
+ goto corrupted_index;
+ }
buf = row_merge_buf_empty(buf);
merge_buf[i] = buf;
/* Restart the outer loop on the
@@ -2541,22 +2614,21 @@ write_buffers:
from accessing this index, to ensure
read consistency. */
- trx_id_t max_trx_id;
-
ut_a(row == NULL);
- rw_lock_x_lock(
- dict_index_get_lock(buf->index));
- ut_a(dict_index_get_online_status(buf->index)
+
+ dict_index_t* index = buf->index;
+ index->lock.x_lock(SRW_LOCK_CALL);
+ ut_a(dict_index_get_online_status(index)
== ONLINE_INDEX_CREATION);
- max_trx_id = row_log_get_max_trx(buf->index);
+ trx_id_t max_trx_id = row_log_get_max_trx(
+ index);
- if (max_trx_id > buf->index->trx_id) {
- buf->index->trx_id = max_trx_id;
+ if (max_trx_id > index->trx_id) {
+ index->trx_id = max_trx_id;
}
- rw_lock_x_unlock(
- dict_index_get_lock(buf->index));
+ index->lock.x_unlock();
}
/* Secondary index and clustered index which is
@@ -2649,7 +2721,7 @@ write_buffers:
new_table, psort_info,
row, ext, history_fts, &doc_id,
conv_heap, &err, &v_heap,
- eval_table, trx)))) {
+ eval_table, trx, col_collate)))) {
/* An empty buffer should have enough
room for at least one record. */
ut_ad(err == DB_COMPUTE_VALUE_FAILED
@@ -2713,7 +2785,7 @@ all_done:
UT_DELETE(clust_btr_bulk);
}
- if (prev_fields != NULL) {
+ if (prev_fields) {
ut_free(prev_fields);
mem_heap_free(mtuple_heap);
}
@@ -2729,7 +2801,7 @@ all_done:
#ifdef FTS_INTERNAL_DIAG_PRINT
DEBUG_FTS_SORT_PRINT("FTS_SORT: Complete Scan Table\n");
#endif
- if (fts_pll_sort) {
+ if (UNIV_LIKELY_NULL(fts_parallel_sort_cond)) {
wait_again:
/* Check if error occurs in child thread */
for (ulint j = 0; j < fts_sort_pll_degree; j++) {
@@ -2750,14 +2822,15 @@ wait_again:
}
/* Now wait all children to report back to be completed */
- os_event_wait_time_low(fts_parallel_sort_event,
- 1000000, sig_count);
+ timespec abstime;
+ set_timespec(abstime, 1);
+ mysql_mutex_lock(&psort_info[0].mutex);
+ my_cond_timedwait(fts_parallel_sort_cond,
+ &psort_info[0].mutex.m_mutex, &abstime);
+ mysql_mutex_unlock(&psort_info[0].mutex);
for (ulint i = 0; i < fts_sort_pll_degree; i++) {
- if (psort_info[i].child_status != FTS_CHILD_COMPLETE
- && psort_info[i].child_status != FTS_CHILD_EXITING) {
- sig_count = os_event_reset(
- fts_parallel_sort_event);
+ if (!psort_info[i].child_status) {
goto wait_again;
}
}
@@ -2778,8 +2851,7 @@ wait_again:
row_fts_free_pll_merge_buf(psort_info);
ut_free(merge_buf);
-
- btr_pcur_close(&pcur);
+ ut_free(pcur.old_rec_buf);
if (sp_tuples != NULL) {
for (ulint i = 0; i < num_spatial; i++) {
@@ -2815,12 +2887,8 @@ wait_again:
}
if (vers_update_trt) {
- trx_mod_table_time_t& time =
- trx->mod_tables
- .insert(trx_mod_tables_t::value_type(
- const_cast<dict_table_t*>(new_table), 0))
- .first->second;
- time.set_versioned(0);
+ trx->mod_tables.emplace(new_table, 0)
+ .first->second.set_versioned(0);
}
trx->op_info = "";
@@ -3594,26 +3662,13 @@ row_merge_insert_index_tuples(
Any modifications after the
row_merge_read_clustered_index() scan
- will go through row_log_table_apply().
- Any modifications to off-page columns
- will be tracked by
- row_log_table_blob_alloc() and
- row_log_table_blob_free(). */
+ will go through row_log_table_apply(). */
row_merge_copy_blobs(
mrec, offsets, old_table->space->zip_size(),
dtuple, tuple_heap);
}
-#ifdef UNIV_DEBUG
- static const latch_level_t latches[] = {
- SYNC_INDEX_TREE, /* index->lock */
- SYNC_LEVEL_VARYING /* btr_bulk->m_page_bulks */
- };
-#endif /* UNIV_DEBUG */
-
ut_ad(dtuple_validate(dtuple));
- ut_ad(!sync_check_iterate(sync_allowed_latches(latches,
- latches + 2)));
error = btr_bulk->insert(dtuple);
if (error != DB_SUCCESS) {
@@ -3645,25 +3700,6 @@ err_exit:
}
/*********************************************************************//**
-Sets an exclusive lock on a table, for the duration of creating indexes.
-@return error code or DB_SUCCESS */
-dberr_t
-row_merge_lock_table(
-/*=================*/
- trx_t* trx, /*!< in/out: transaction */
- dict_table_t* table, /*!< in: table to lock */
- enum lock_mode mode) /*!< in: LOCK_X or LOCK_S */
-{
- ut_ad(!srv_read_only_mode);
- ut_ad(mode == LOCK_X || mode == LOCK_S);
-
- trx->op_info = "setting table lock for creating or dropping index";
- trx->ddl = true;
-
- return(lock_table_for_trx(table, trx, mode));
-}
-
-/*********************************************************************//**
Drop an index that was created before an error occurred.
The data dictionary must have been locked exclusively by the caller,
because the transaction will not be committed. */
@@ -3684,14 +3720,14 @@ row_merge_drop_index_dict(
pars_info_t* info;
ut_ad(!srv_read_only_mode);
- ut_ad(trx->dict_operation_lock_mode == RW_X_LATCH);
- ut_ad(trx_get_dict_operation(trx) == TRX_DICT_OP_INDEX);
- ut_d(dict_sys.assert_locked());
+ ut_ad(trx->dict_operation_lock_mode);
+ ut_ad(trx->dict_operation);
+ ut_ad(dict_sys.locked());
info = pars_info_create();
pars_info_add_ull_literal(info, "indexid", index_id);
trx->op_info = "dropping index from dictionary";
- error = que_eval_sql(info, sql, FALSE, trx);
+ error = que_eval_sql(info, sql, trx);
if (error != DB_SUCCESS) {
/* Even though we ensure that DDL transactions are WAIT
@@ -3710,6 +3746,7 @@ row_merge_drop_index_dict(
Drop indexes that were created before an error occurred.
The data dictionary must have been locked exclusively by the caller,
because the transaction will not be committed. */
+static
void
row_merge_drop_indexes_dict(
/*========================*/
@@ -3746,9 +3783,9 @@ row_merge_drop_indexes_dict(
pars_info_t* info;
ut_ad(!srv_read_only_mode);
- ut_ad(trx->dict_operation_lock_mode == RW_X_LATCH);
- ut_ad(trx_get_dict_operation(trx) == TRX_DICT_OP_INDEX);
- ut_d(dict_sys.assert_locked());
+ ut_ad(trx->dict_operation_lock_mode);
+ ut_ad(trx->dict_operation);
+ ut_ad(dict_sys.locked());
/* It is possible that table->n_ref_count > 1 when
locked=TRUE. In this case, all code that should have an open
@@ -3760,7 +3797,7 @@ row_merge_drop_indexes_dict(
info = pars_info_create();
pars_info_add_ull_literal(info, "tableid", table_id);
trx->op_info = "dropping indexes";
- error = que_eval_sql(info, sql, FALSE, trx);
+ error = que_eval_sql(info, sql, trx);
switch (error) {
case DB_SUCCESS:
@@ -3779,6 +3816,28 @@ row_merge_drop_indexes_dict(
trx->op_info = "";
}
+/** Drop common internal tables if all fulltext indexes are dropped
+@param trx transaction
+@param table user table */
+static void row_merge_drop_fulltext_indexes(trx_t *trx, dict_table_t *table)
+{
+ if (DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS_HAS_DOC_ID) ||
+ !table->fts ||
+ !ib_vector_is_empty(table->fts->indexes))
+ return;
+
+ for (const dict_index_t *index= dict_table_get_first_index(table);
+ index; index= dict_table_get_next_index(index))
+ if (index->type & DICT_FTS)
+ return;
+
+ fts_optimize_remove_table(table);
+ fts_drop_tables(trx, *table);
+ table->fts->~fts_t();
+ table->fts= nullptr;
+ DICT_TF2_FLAG_UNSET(table, DICT_TF2_FTS);
+}
+
/** Drop indexes that were created before an error occurred.
The data dictionary must have been locked exclusively by the caller,
because the transaction will not be committed.
@@ -3798,9 +3857,9 @@ row_merge_drop_indexes(
dict_index_t* next_index;
ut_ad(!srv_read_only_mode);
- ut_ad(trx->dict_operation_lock_mode == RW_X_LATCH);
- ut_ad(trx_get_dict_operation(trx) == TRX_DICT_OP_INDEX);
- ut_d(dict_sys.assert_locked());
+ ut_ad(trx->dict_operation_lock_mode);
+ ut_ad(trx->dict_operation);
+ ut_ad(dict_sys.locked());
index = dict_table_get_first_index(table);
ut_ad(dict_index_is_clust(index));
@@ -3814,18 +3873,10 @@ row_merge_drop_indexes(
handle to the table be waiting for the next statement to execute,
or waiting for a meta-data lock.
- A concurrent purge will be prevented by dict_sys.latch. */
+ A concurrent purge will be prevented by MDL. */
if (!locked && (table->get_ref_count() > 1
|| table->has_lock_other_than(alter_trx))) {
- /* We will have to drop the indexes later, when the
- table is guaranteed to be no longer in use. Mark the
- indexes as incomplete and corrupted, so that other
- threads will stop using them. Let dict_table_close()
- or crash recovery or the next invocation of
- prepare_inplace_alter_table() take care of dropping
- the indexes. */
-
while ((index = dict_table_get_next_index(index)) != NULL) {
ut_ad(!dict_index_is_clust(index));
@@ -3869,8 +3920,7 @@ row_merge_drop_indexes(
table, index);
index = prev;
} else {
- rw_lock_x_lock(
- dict_index_get_lock(index));
+ index->lock.x_lock(SRW_LOCK_CALL);
dict_index_set_online_status(
index, ONLINE_INDEX_ABORTED);
index->type |= DICT_CORRUPT;
@@ -3879,14 +3929,14 @@ row_merge_drop_indexes(
}
continue;
case ONLINE_INDEX_CREATION:
- rw_lock_x_lock(dict_index_get_lock(index));
+ index->lock.x_lock(SRW_LOCK_CALL);
ut_ad(!index->is_committed());
row_log_abort_sec(index);
drop_aborted:
- rw_lock_x_unlock(dict_index_get_lock(index));
+ index->lock.x_unlock();
DEBUG_SYNC_C("merge_drop_index_after_abort");
- /* covered by dict_sys.mutex */
+ /* covered by dict_sys.latch */
MONITOR_INC(MONITOR_BACKGROUND_DROP_INDEX);
/* fall through */
case ONLINE_INDEX_ABORTED:
@@ -3895,17 +3945,17 @@ row_merge_drop_indexes(
the tablespace, but keep the object
in the data dictionary cache. */
row_merge_drop_index_dict(trx, index->id);
- rw_lock_x_lock(dict_index_get_lock(index));
+ index->lock.x_lock(SRW_LOCK_CALL);
dict_index_set_online_status(
index, ONLINE_INDEX_ABORTED_DROPPED);
- rw_lock_x_unlock(dict_index_get_lock(index));
+ index->lock.x_unlock();
table->drop_aborted = TRUE;
continue;
}
ut_error;
}
- fts_clear_all(table, trx);
+ row_merge_drop_fulltext_indexes(trx, table);
return;
}
@@ -3914,8 +3964,11 @@ row_merge_drop_indexes(
/* Invalidate all row_prebuilt_t::ins_graph that are referring
to this table. That is, force row_get_prebuilt_insert_row() to
rebuild prebuilt->ins_node->entry_list). */
- ut_ad(table->def_trx_id <= trx->id);
- table->def_trx_id = trx->id;
+ if (table->def_trx_id < trx->id) {
+ table->def_trx_id = trx->id;
+ } else {
+ ut_ad(table->def_trx_id == trx->id || table->name.part());
+ }
next_index = dict_table_get_next_index(index);
@@ -3950,7 +4003,7 @@ row_merge_drop_indexes(
break;
case ONLINE_INDEX_ABORTED:
case ONLINE_INDEX_ABORTED_DROPPED:
- /* covered by dict_sys.mutex */
+ /* covered by dict_sys.latch */
MONITOR_DEC(MONITOR_BACKGROUND_DROP_INDEX);
}
@@ -3958,22 +4011,92 @@ row_merge_drop_indexes(
}
}
- fts_clear_all(table, trx);
+ row_merge_drop_fulltext_indexes(trx, table);
table->drop_aborted = FALSE;
ut_d(dict_table_check_for_dup_indexes(table, CHECK_ALL_COMPLETE));
}
-/*********************************************************************//**
-Drop all partially created indexes during crash recovery. */
-void
-row_merge_drop_temp_indexes(void)
-/*=============================*/
+/** Drop fulltext indexes */
+static ibool row_merge_drop_fts(void *node, void *trx)
{
+ auto s= static_cast<sel_node_t*>(node);
+
+ const dfield_t *table_id= que_node_get_val(s->select_list);
+ ut_ad(table_id->type.mtype == DATA_BINARY);
+ node= que_node_get_next(s->select_list);
+ ut_ad(!que_node_get_next(node));
+ const dfield_t *index_id= que_node_get_val(node);
+ ut_ad(index_id->type.mtype == DATA_BINARY);
+
+ static const char sql[]=
+ "PROCEDURE DROP_TABLES_PROC () IS\n"
+ "tid CHAR;\n"
+ "iid CHAR;\n"
+
+ "DECLARE CURSOR cur_tab IS\n"
+ "SELECT ID FROM SYS_TABLES\n"
+ "WHERE INSTR(NAME,:name)+45=LENGTH(NAME)"
+ " AND INSTR('123456',SUBSTR(NAME,LENGTH(NAME)-1,1))>0"
+ " FOR UPDATE;\n"
+
+ "DECLARE CURSOR cur_idx IS\n"
+ "SELECT ID FROM SYS_INDEXES\n"
+ "WHERE TABLE_ID = tid FOR UPDATE;\n"
+
+ "BEGIN\n"
+ "OPEN cur_tab;\n"
+ "WHILE 1 = 1 LOOP\n"
+ " FETCH cur_tab INTO tid;\n"
+ " IF (SQL % NOTFOUND) THEN EXIT; END IF;\n"
+ " OPEN cur_idx;\n"
+ " WHILE 1 = 1 LOOP\n"
+ " FETCH cur_idx INTO iid;\n"
+ " IF (SQL % NOTFOUND) THEN EXIT; END IF;\n"
+ " DELETE FROM SYS_FIELDS WHERE INDEX_ID=iid;\n"
+ " DELETE FROM SYS_INDEXES WHERE CURRENT OF cur_idx;\n"
+ " END LOOP;\n"
+ " CLOSE cur_idx;\n"
+ " DELETE FROM SYS_COLUMNS WHERE TABLE_ID=tid;\n"
+ " DELETE FROM SYS_TABLES WHERE CURRENT OF cur_tab;\n"
+ "END LOOP;\n"
+ "CLOSE cur_tab;\n"
+ "END;\n";
+
+ if (table_id->len == 8 && index_id->len == 8)
+ {
+ char buf[sizeof "/FTS_0000000000000000_0000000000000000_INDEX_"];
+ snprintf(buf, sizeof buf, "/FTS_%016llx_%016llx_INDEX_",
+ static_cast<ulonglong>
+ (mach_read_from_8(static_cast<const byte*>(table_id->data))),
+ static_cast<ulonglong>
+ (mach_read_from_8(static_cast<const byte*>(index_id->data))));
+ auto pinfo= pars_info_create();
+ pars_info_add_str_literal(pinfo, "name", buf);
+ que_eval_sql(pinfo, sql, static_cast<trx_t*>(trx));
+ }
+
+ return true;
+}
+
+/** During recovery, drop recovered index stubs that were created in
+prepare_inplace_alter_table_dict(). */
+void row_merge_drop_temp_indexes()
+{
+ static_assert(DICT_FTS == 32, "compatibility");
+
static const char sql[] =
"PROCEDURE DROP_TEMP_INDEXES_PROC () IS\n"
"ixid CHAR;\n"
"found INT;\n"
+ "DECLARE FUNCTION drop_fts;\n"
+
+ "DECLARE CURSOR fts_cur IS\n"
+ " SELECT TABLE_ID,ID FROM SYS_INDEXES\n"
+ " WHERE TYPE=32"
+ " AND SUBSTR(NAME,0,1)='" TEMP_INDEX_PREFIX_STR "'\n"
+ " FOR UPDATE;\n"
+
"DECLARE CURSOR index_cur IS\n"
" SELECT ID FROM SYS_INDEXES\n"
" WHERE SUBSTR(NAME,0,1)='" TEMP_INDEX_PREFIX_STR "'\n"
@@ -3981,6 +4104,15 @@ row_merge_drop_temp_indexes(void)
"BEGIN\n"
"found := 1;\n"
+ "OPEN fts_cur;\n"
+ "WHILE found = 1 LOOP\n"
+ " FETCH fts_cur INTO drop_fts();\n"
+ " IF (SQL % NOTFOUND) THEN\n"
+ " found := 0;\n"
+ " END IF;\n"
+ "END LOOP;\n"
+ "CLOSE fts_cur;\n"
+
"OPEN index_cur;\n"
"WHILE found = 1 LOOP\n"
" FETCH index_cur INTO ixid;\n"
@@ -3993,31 +4125,36 @@ row_merge_drop_temp_indexes(void)
"END LOOP;\n"
"CLOSE index_cur;\n"
"END;\n";
- trx_t* trx;
- dberr_t error;
/* Load the table definitions that contain partially defined
indexes, so that the data dictionary information can be checked
when accessing the tablename.ibd files. */
- trx = trx_create();
+ trx_t* trx = trx_create();
+ trx_start_for_ddl(trx);
trx->op_info = "dropping partially created indexes";
+ dberr_t error = lock_sys_tables(trx);
+
row_mysql_lock_data_dictionary(trx);
/* Ensure that this transaction will be rolled back and locks
will be released, if the server gets killed before the commit
gets written to the redo log. */
- trx_set_dict_operation(trx, TRX_DICT_OP_INDEX);
+ trx->dict_operation = true;
trx->op_info = "dropping indexes";
- error = que_eval_sql(NULL, sql, FALSE, trx);
- if (error != DB_SUCCESS) {
+ pars_info_t* pinfo = pars_info_create();
+ pars_info_bind_function(pinfo, "drop_fts", row_merge_drop_fts, trx);
+ if (error == DB_SUCCESS) {
+ error = que_eval_sql(pinfo, sql, trx);
+ }
+
+ if (error) {
/* Even though we ensure that DDL transactions are WAIT
and DEADLOCK free, we could encounter other errors e.g.,
DB_TOO_MANY_CONCURRENT_TRXS. */
trx->error_state = DB_SUCCESS;
- ib::error() << "row_merge_drop_temp_indexes failed with error"
- << error;
+ ib::error() << "row_merge_drop_temp_indexes(): " << error;
}
trx_commit_for_mysql(trx);
@@ -4150,15 +4287,15 @@ row_merge_rename_index_to_add(
"WHERE TABLE_ID = :tableid AND ID = :indexid;\n"
"END;\n";
- ut_a(trx->dict_operation_lock_mode == RW_X_LATCH);
- ut_ad(trx_get_dict_operation(trx) == TRX_DICT_OP_INDEX);
+ ut_ad(trx->dict_operation_lock_mode);
+ ut_ad(trx->dict_operation);
trx->op_info = "renaming index to add";
pars_info_add_ull_literal(info, "tableid", table_id);
pars_info_add_ull_literal(info, "indexid", index_id);
- err = que_eval_sql(info, rename_index, FALSE, trx);
+ err = que_eval_sql(info, rename_index, trx);
if (err != DB_SUCCESS) {
/* Even though we ensure that DDL transactions are WAIT
@@ -4175,59 +4312,6 @@ row_merge_rename_index_to_add(
return(err);
}
-/*********************************************************************//**
-Rename an index in the dictionary that is to be dropped. The data
-dictionary must have been locked exclusively by the caller, because
-the transaction will not be committed.
-@return DB_SUCCESS if all OK */
-dberr_t
-row_merge_rename_index_to_drop(
-/*===========================*/
- trx_t* trx, /*!< in/out: transaction */
- table_id_t table_id, /*!< in: table identifier */
- index_id_t index_id) /*!< in: index identifier */
-{
- dberr_t err;
- pars_info_t* info = pars_info_create();
-
- ut_ad(!srv_read_only_mode);
-
- /* We use the private SQL parser of Innobase to generate the
- query graphs needed in renaming indexes. */
-
- static const char rename_index[] =
- "PROCEDURE RENAME_INDEX_PROC () IS\n"
- "BEGIN\n"
- "UPDATE SYS_INDEXES SET NAME=CONCAT('"
- TEMP_INDEX_PREFIX_STR "',NAME)\n"
- "WHERE TABLE_ID = :tableid AND ID = :indexid;\n"
- "END;\n";
-
- ut_a(trx->dict_operation_lock_mode == RW_X_LATCH);
- ut_ad(trx_get_dict_operation(trx) == TRX_DICT_OP_INDEX);
-
- trx->op_info = "renaming index to drop";
-
- pars_info_add_ull_literal(info, "tableid", table_id);
- pars_info_add_ull_literal(info, "indexid", index_id);
-
- err = que_eval_sql(info, rename_index, FALSE, trx);
-
- if (err != DB_SUCCESS) {
- /* Even though we ensure that DDL transactions are WAIT
- and DEADLOCK free, we could encounter other errors e.g.,
- DB_TOO_MANY_CONCURRENT_TRXS. */
- trx->error_state = DB_SUCCESS;
-
- ib::error() << "row_merge_rename_index_to_drop failed with"
- " error " << err;
- }
-
- trx->op_info = "";
-
- return(err);
-}
-
/** Create the index and load in to the dictionary.
@param[in,out] table the index is on this table
@param[in] index_def the index definition
@@ -4305,30 +4389,7 @@ row_merge_is_index_usable(
&& (index->table->is_temporary() || index->table->no_rollback()
|| index->trx_id == 0
|| !trx->read_view.is_open()
- || trx->read_view.changes_visible(
- index->trx_id,
- index->table->name)));
-}
-
-/*********************************************************************//**
-Drop a table. The caller must have ensured that the background stats
-thread is not processing the table. This can be done by calling
-dict_stats_wait_bg_to_stop_using_table() after locking the dictionary and
-before calling this function.
-@return DB_SUCCESS or error code */
-dberr_t
-row_merge_drop_table(
-/*=================*/
- trx_t* trx, /*!< in: transaction */
- dict_table_t* table) /*!< in: table to drop */
-{
- ut_ad(!srv_read_only_mode);
-
- /* There must be no open transactions on the table. */
- ut_a(table->get_ref_count() == 0);
-
- return(row_drop_table_for_mysql(table->name.m_name,
- trx, SQLCOM_DROP_TABLE, false, false));
+ || trx->read_view.changes_visible(index->trx_id)));
}
/** Build indexes on a table by reading a clustered index, creating a temporary
@@ -4359,6 +4420,7 @@ this function and it will be passed to other functions for further accounting.
@param[in] eval_table mysql table used to evaluate virtual column
value, see innobase_get_computed_value().
@param[in] allow_not_null allow the conversion from null to not-null
+@param[in] col_collate columns whose collations changed, or nullptr
@return DB_SUCCESS or error code */
dberr_t
row_merge_build_indexes(
@@ -4378,7 +4440,8 @@ row_merge_build_indexes(
ut_stage_alter_t* stage,
const dict_add_v_col_t* add_v,
struct TABLE* eval_table,
- bool allow_not_null)
+ bool allow_not_null,
+ const col_collations* col_collate)
{
merge_file_t* merge_files;
row_merge_block_t* block;
@@ -4528,7 +4591,8 @@ row_merge_build_indexes(
fts_sort_idx, psort_info, merge_files, key_numbers,
n_indexes, defaults, add_v, col_map, add_autoinc,
sequence, block, skip_pk_sort, &tmpfd, stage,
- pct_cost, crypt_block, eval_table, allow_not_null);
+ pct_cost, crypt_block, eval_table, allow_not_null,
+ col_collate);
stage->end_phase_read_pk();
@@ -4768,12 +4832,10 @@ func_exit:
case ONLINE_INDEX_COMPLETE:
break;
case ONLINE_INDEX_CREATION:
- rw_lock_x_lock(
- dict_index_get_lock(indexes[i]));
+ indexes[i]->lock.x_lock(SRW_LOCK_CALL);
row_log_abort_sec(indexes[i]);
indexes[i]->type |= DICT_CORRUPT;
- rw_lock_x_unlock(
- dict_index_get_lock(indexes[i]));
+ indexes[i]->lock.x_unlock();
new_table->drop_aborted = TRUE;
/* fall through */
case ONLINE_INDEX_ABORTED_DROPPED:
@@ -4782,6 +4844,13 @@ func_exit:
MONITOR_BACKGROUND_DROP_INDEX);
}
}
+
+ dict_index_t *clust_index= new_table->indexes.start;
+ clust_index->lock.x_lock(SRW_LOCK_CALL);
+ ut_ad(!clust_index->online_log ||
+ clust_index->online_log_is_dummy());
+ clust_index->online_log= nullptr;
+ clust_index->lock.x_unlock();
}
DBUG_EXECUTE_IF("ib_index_crash_after_bulk_load", DBUG_SUICIDE(););
diff --git a/storage/innobase/row/row0mysql.cc b/storage/innobase/row/row0mysql.cc
index 514d4b3ecd9..67167f19c70 100644
--- a/storage/innobase/row/row0mysql.cc
+++ b/storage/innobase/row/row0mysql.cc
@@ -36,11 +36,8 @@ Created 9/17/2000 Heikki Tuuri
#include "dict0crea.h"
#include "dict0dict.h"
#include "dict0load.h"
-#include "dict0priv.h"
#include "dict0stats.h"
#include "dict0stats_bg.h"
-#include "dict0defrag_bg.h"
-#include "btr0defragment.h"
#include "fil0fil.h"
#include "fil0crypt.h"
#include "fsp0file.h"
@@ -61,53 +58,15 @@ Created 9/17/2000 Heikki Tuuri
#include "trx0rec.h"
#include "trx0roll.h"
#include "trx0undo.h"
+#include "srv0mon.h"
#include "srv0start.h"
-#include "row0ext.h"
-#include "srv0start.h"
+#include "log.h"
#include <algorithm>
-#include <deque>
#include <vector>
+#include <thread>
-/** Provide optional 4.x backwards compatibility for 5.0 and above */
-ibool row_rollback_on_timeout = FALSE;
-
-/** Chain node of the list of tables to drop in the background. */
-struct row_mysql_drop_t{
- table_id_t table_id; /*!< table id */
- UT_LIST_NODE_T(row_mysql_drop_t)row_mysql_drop_list;
- /*!< list chain node */
-};
-
-/** @brief List of tables we should drop in background.
-
-ALTER TABLE in MySQL requires that the table handler can drop the
-table in background when there are no queries to it any
-more. Protected by row_drop_list_mutex. */
-static UT_LIST_BASE_NODE_T(row_mysql_drop_t) row_mysql_drop_list;
-
-/** Mutex protecting the background table drop list. */
-static ib_mutex_t row_drop_list_mutex;
-
-/** Flag: has row_mysql_drop_list been initialized? */
-static bool row_mysql_drop_list_inited;
-
-#ifdef UNIV_DEBUG
-/** Wait for the background drop list to become empty. */
-void
-row_wait_for_background_drop_list_empty()
-{
- bool empty = false;
- while (!empty) {
- mutex_enter(&row_drop_list_mutex);
- empty = (UT_LIST_GET_LEN(row_mysql_drop_list) == 0);
- mutex_exit(&row_drop_list_mutex);
- os_thread_sleep(100000);
- }
-}
-#endif /* UNIV_DEBUG */
-
/*******************************************************************//**
Delays an INSERT, DELETE or UPDATE operation if the purge is lagging. */
static
@@ -116,7 +75,8 @@ row_mysql_delay_if_needed(void)
/*===========================*/
{
if (srv_dml_needed_delay) {
- os_thread_sleep(srv_dml_needed_delay);
+ std::this_thread::sleep_for(
+ std::chrono::microseconds(srv_dml_needed_delay));
}
}
@@ -665,19 +625,20 @@ row_mysql_handle_errors(
DBUG_ENTER("row_mysql_handle_errors");
DEBUG_SYNC_C("row_mysql_handle_errors");
-handle_new_error:
err = trx->error_state;
+handle_new_error:
ut_a(err != DB_SUCCESS);
trx->error_state = DB_SUCCESS;
- DBUG_LOG("trx", "handle error: " << ut_strerr(err)
+ DBUG_LOG("trx", "handle error: " << err
<< ";id=" << ib::hex(trx->id) << ", " << trx);
switch (err) {
case DB_LOCK_WAIT_TIMEOUT:
- if (row_rollback_on_timeout) {
+ extern my_bool innobase_rollback_on_timeout;
+ if (innobase_rollback_on_timeout) {
goto rollback;
}
/* fall through */
@@ -707,14 +668,18 @@ handle_new_error:
trx->rollback(savept);
}
- /* MySQL will roll back the latest SQL statement */
+ if (!trx->bulk_insert) {
+ /* MariaDB will roll back the latest SQL statement */
+ break;
+ }
+ /* MariaDB will roll back the entire transaction. */
+ trx->bulk_insert = false;
+ trx->last_sql_stat_start.least_undo_no = 0;
+ trx->savepoints_discard();
break;
case DB_LOCK_WAIT:
- lock_wait_suspend_thread(thr);
-
- if (trx->error_state != DB_SUCCESS) {
- que_thr_stop_for_mysql(thr);
-
+ err = lock_wait(thr);
+ if (err != DB_SUCCESS) {
goto handle_new_error;
}
@@ -731,12 +696,8 @@ handle_new_error:
trx->rollback();
break;
- case DB_MUST_GET_MORE_FILE_SPACE:
- ib::fatal() << "The database cannot continue operation because"
- " of lack of space. You must add a new data file"
- " to my.cnf and restart the database.";
- break;
-
+ case DB_IO_ERROR:
+ case DB_TABLE_CORRUPT:
case DB_CORRUPTION:
case DB_PAGE_CORRUPTED:
ib::error() << "We detected index corruption in an InnoDB type"
@@ -763,14 +724,13 @@ handle_new_error:
ib::fatal() << "Unknown error " << err;
}
- if (trx->error_state != DB_SUCCESS) {
- *new_err = trx->error_state;
+ if (dberr_t n_err = trx->error_state) {
+ trx->error_state = DB_SUCCESS;
+ *new_err = n_err;
} else {
*new_err = err;
}
- trx->error_state = DB_SUCCESS;
-
DBUG_RETURN(false);
}
@@ -858,6 +818,10 @@ row_create_prebuilt(
DBUG_EXECUTE_IF("innodb_srch_key_buffer_max_value",
ut_a(temp_index->n_user_defined_cols
== MAX_REF_PARTS););
+ if (temp_index->is_corrupted()) {
+ continue;
+ }
+
uint temp_len = 0;
for (uint i = 0; i < temp_index->n_uniq; i++) {
ulint type = temp_index->fields[i].col->mtype;
@@ -942,13 +906,8 @@ row_create_prebuilt(
DBUG_RETURN(prebuilt);
}
-/********************************************************************//**
-Free a prebuilt struct for a MySQL table handle. */
-void
-row_prebuilt_free(
-/*==============*/
- row_prebuilt_t* prebuilt, /*!< in, own: prebuilt struct */
- ibool dict_locked) /*!< in: TRUE=data dictionary locked */
+/** Free a prebuilt struct for a TABLE handle. */
+void row_prebuilt_free(row_prebuilt_t *prebuilt)
{
DBUG_ENTER("row_prebuilt_free");
@@ -1008,7 +967,7 @@ row_prebuilt_free(
rtr_clean_rtr_info(prebuilt->rtr_info, true);
}
if (prebuilt->table) {
- dict_table_close(prebuilt->table, dict_locked, FALSE);
+ dict_table_close(prebuilt->table);
}
mem_heap_free(prebuilt->heap);
@@ -1069,7 +1028,6 @@ row_get_prebuilt_insert_row(
if (prebuilt->trx_id == table->def_trx_id
&& prebuilt->ins_node->entry_list.size()
== UT_LIST_GET_LEN(table->indexes)) {
-
return(prebuilt->ins_node->row);
}
@@ -1107,12 +1065,12 @@ row_get_prebuilt_insert_row(
dict_table_copy_types(row, table);
ins_node_set_new_row(node, row);
+ que_thr_t* fork = pars_complete_graph_for_exec(
+ node, prebuilt->trx, prebuilt->heap, prebuilt);
+ fork->state = QUE_THR_RUNNING;
prebuilt->ins_graph = static_cast<que_fork_t*>(
- que_node_get_parent(
- pars_complete_graph_for_exec(
- node,
- prebuilt->trx, prebuilt->heap, prebuilt)));
+ que_node_get_parent(fork));
prebuilt->ins_graph->state = QUE_FORK_ACTIVE;
@@ -1139,11 +1097,10 @@ row_lock_table_autoinc_for_mysql(
const dict_table_t* table = prebuilt->table;
que_thr_t* thr;
dberr_t err;
- ibool was_lock_wait;
/* If we already hold an AUTOINC lock on the table then do nothing.
Note: We peek at the value of the current owner without acquiring
- the lock mutex. */
+ lock_sys.latch. */
if (trx == table->autoinc_trx) {
return(DB_SUCCESS);
@@ -1159,36 +1116,20 @@ row_lock_table_autoinc_for_mysql(
thr = que_fork_get_first_thr(prebuilt->ins_graph);
- thr->start_running();
-
-run_again:
- thr->run_node = node;
- thr->prev_node = node;
-
- /* It may be that the current session has not yet started
- its transaction, or it has been committed: */
-
- trx_start_if_not_started_xa(trx, true);
-
- err = lock_table(0, prebuilt->table, LOCK_AUTO_INC, thr);
-
- trx->error_state = err;
-
- if (err != DB_SUCCESS) {
- que_thr_stop_for_mysql(thr);
+ do {
+ thr->run_node = node;
+ thr->prev_node = node;
- was_lock_wait = row_mysql_handle_errors(&err, trx, thr, NULL);
+ /* It may be that the current session has not yet started
+ its transaction, or it has been committed: */
- if (was_lock_wait) {
- goto run_again;
- }
+ trx_start_if_not_started_xa(trx, true);
- trx->op_info = "";
+ err = lock_table(prebuilt->table, NULL, LOCK_AUTO_INC, thr);
- return(err);
- }
-
- thr->stop_no_error();
+ trx->error_state = err;
+ } while (err != DB_SUCCESS
+ && row_mysql_handle_errors(&err, trx, thr, NULL));
trx->op_info = "";
@@ -1204,7 +1145,6 @@ row_lock_table(row_prebuilt_t* prebuilt)
trx_t* trx = prebuilt->trx;
que_thr_t* thr;
dberr_t err;
- ibool was_lock_wait;
trx->op_info = "setting table lock";
@@ -1218,39 +1158,20 @@ row_lock_table(row_prebuilt_t* prebuilt)
thr = que_fork_get_first_thr(prebuilt->sel_graph);
- thr->start_running();
-
-run_again:
- thr->run_node = thr;
- thr->prev_node = thr->common.parent;
-
- /* It may be that the current session has not yet started
- its transaction, or it has been committed: */
-
- trx_start_if_not_started_xa(trx, false);
+ do {
+ thr->run_node = thr;
+ thr->prev_node = thr->common.parent;
- err = lock_table(0, prebuilt->table,
- static_cast<enum lock_mode>(
- prebuilt->select_lock_type),
- thr);
+ /* It may be that the current session has not yet started
+ its transaction, or it has been committed: */
- trx->error_state = err;
+ trx_start_if_not_started_xa(trx, false);
- if (err != DB_SUCCESS) {
- que_thr_stop_for_mysql(thr);
-
- was_lock_wait = row_mysql_handle_errors(&err, trx, thr, NULL);
-
- if (was_lock_wait) {
- goto run_again;
- }
-
- trx->op_info = "";
-
- return(err);
- }
-
- thr->stop_no_error();
+ err = lock_table(prebuilt->table, NULL, static_cast<lock_mode>(
+ prebuilt->select_lock_type), thr);
+ trx->error_state = err;
+ } while (err != DB_SUCCESS
+ && row_mysql_handle_errors(&err, trx, thr, NULL));
trx->op_info = "";
@@ -1279,10 +1200,10 @@ row_mysql_get_table_status(
// to decrypt
if (push_warning) {
ib_push_warning(trx, DB_DECRYPTION_FAILED,
- "Table %s in tablespace %lu encrypted."
+ "Table %s is encrypted."
"However key management plugin or used key_id is not found or"
" used encryption algorithm or method does not match.",
- table->name.m_name, table->space);
+ table->name.m_name);
}
err = DB_DECRYPTION_FAILED;
@@ -1330,30 +1251,19 @@ row_insert_for_mysql(
ut_a(prebuilt->magic_n == ROW_PREBUILT_ALLOCATED);
ut_a(prebuilt->magic_n2 == ROW_PREBUILT_ALLOCATED);
- if (!prebuilt->table->space) {
-
- ib::error() << "The table " << prebuilt->table->name
+ if (!table->space) {
+ ib::error() << "The table " << table->name
<< " doesn't have a corresponding tablespace, it was"
" discarded.";
return(DB_TABLESPACE_DELETED);
-
- } else if (!prebuilt->table->is_readable()) {
- return(row_mysql_get_table_status(prebuilt->table, trx, true));
+ } else if (!table->is_readable()) {
+ return row_mysql_get_table_status(table, trx, true);
} else if (high_level_read_only) {
return(DB_READ_ONLY);
- }
-
- DBUG_EXECUTE_IF("mark_table_corrupted", {
- /* Mark the table corrupted for the clustered index */
- dict_index_t* index = dict_table_get_first_index(table);
- ut_ad(dict_index_is_clust(index));
- dict_set_corrupted(index, trx, "INSERT TABLE"); });
-
- if (dict_table_is_corrupted(table)) {
-
- ib::error() << "Table " << table->name << " is corrupt.";
- return(DB_TABLE_CORRUPT);
+ } else if (UNIV_UNLIKELY(table->corrupted)
+ || dict_table_get_first_index(table)->is_corrupted()) {
+ return DB_TABLE_CORRUPT;
}
trx->op_info = "inserting";
@@ -1374,7 +1284,12 @@ row_insert_for_mysql(
node->vers_update_end(prebuilt, ins_mode == ROW_INS_HISTORICAL);
}
- savept = trx_savept_take(trx);
+ /* Because we now allow multiple INSERT into the same
+ initially empty table in bulk insert mode, on error we must
+ roll back to the start of the transaction. For correctness, it
+ would suffice to roll back to the start of the first insert
+ into this empty table, but we will keep it simple and efficient. */
+ savept.least_undo_no = trx->bulk_insert ? 0 : trx->undo_no;
thr = que_fork_get_first_thr(prebuilt->ins_graph);
@@ -1383,10 +1298,9 @@ row_insert_for_mysql(
prebuilt->sql_stat_start = FALSE;
} else {
node->state = INS_NODE_ALLOC_ROW_ID;
+ node->trx_id = trx->id;
}
- thr->start_running();
-
run_again:
thr->run_node = node;
thr->prev_node = node;
@@ -1399,8 +1313,6 @@ run_again:
if (err != DB_SUCCESS) {
error_exit:
- que_thr_stop_for_mysql(thr);
-
/* FIXME: What's this ? */
thr->lock_state = QUE_THR_LOCK_ROW;
@@ -1411,7 +1323,8 @@ error_exit:
if (was_lock_wait) {
ut_ad(node->state == INS_NODE_INSERT_ENTRIES
- || node->state == INS_NODE_ALLOC_ROW_ID);
+ || node->state == INS_NODE_ALLOC_ROW_ID
+ || node->state == INS_NODE_SET_IX_LOCK);
goto run_again;
}
@@ -1473,15 +1386,14 @@ error_exit:
}
}
- thr->stop_no_error();
-
if (table->is_system_db) {
srv_stats.n_system_rows_inserted.inc(size_t(trx->id));
} else {
srv_stats.n_rows_inserted.inc(size_t(trx->id));
}
- /* Not protected by dict_sys.mutex for performance
+ /* Not protected by dict_sys.latch or table->stats_mutex_lock()
+ for performance
reasons, we would rather get garbage in stat_n_rows (which is
just an estimate anyway) than protecting the following code
with a latch. */
@@ -1518,12 +1430,12 @@ row_prebuild_sel_graph(
node = sel_node_create(prebuilt->heap);
+ que_thr_t* fork = pars_complete_graph_for_exec(
+ node, prebuilt->trx, prebuilt->heap, prebuilt);
+ fork->state = QUE_THR_RUNNING;
+
prebuilt->sel_graph = static_cast<que_fork_t*>(
- que_node_get_parent(
- pars_complete_graph_for_exec(
- static_cast<sel_node_t*>(node),
- prebuilt->trx, prebuilt->heap,
- prebuilt)));
+ que_node_get_parent(fork));
prebuilt->sel_graph->state = QUE_FORK_ACTIVE;
}
@@ -1547,11 +1459,8 @@ row_create_update_node_for_mysql(
node->in_mysql_interface = true;
node->is_delete = NO_DELETE;
- node->searched_update = FALSE;
- node->select = NULL;
- node->pcur = btr_pcur_create_for_mysql();
-
- DBUG_PRINT("info", ("node: %p, pcur: %p", node, node->pcur));
+ node->pcur = new (mem_heap_alloc(heap, sizeof(btr_pcur_t)))
+ btr_pcur_t();
node->table = table;
@@ -1563,10 +1472,6 @@ row_create_update_node_for_mysql(
UT_LIST_INIT(node->columns, &sym_node_t::col_var_list);
node->has_clust_rec_x_lock = TRUE;
- node->cmpl_info = 0;
-
- node->table_sym = NULL;
- node->col_assign_list = NULL;
DBUG_RETURN(node);
}
@@ -1667,33 +1572,24 @@ init_fts_doc_id_for_ref(
dict_table_t* table, /*!< in: table */
ulint* depth) /*!< in: recusive call depth */
{
- dict_foreign_t* foreign;
-
table->fk_max_recusive_level = 0;
- (*depth)++;
-
/* Limit on tables involved in cascading delete/update */
- if (*depth > FK_MAX_CASCADE_DEL) {
+ if (++*depth > FK_MAX_CASCADE_DEL) {
return;
}
/* Loop through this table's referenced list and also
recursively traverse each table's foreign table list */
- for (dict_foreign_set::iterator it = table->referenced_set.begin();
- it != table->referenced_set.end();
- ++it) {
-
- foreign = *it;
-
- ut_ad(foreign->foreign_table != NULL);
+ for (dict_foreign_t* foreign : table->referenced_set) {
+ ut_ad(foreign->foreign_table);
- if (foreign->foreign_table->fts != NULL) {
+ if (foreign->foreign_table->fts) {
fts_init_doc_id(foreign->foreign_table);
}
- if (!foreign->foreign_table->referenced_set.empty()
- && foreign->foreign_table != table) {
+ if (foreign->foreign_table != table
+ && !foreign->foreign_table->referenced_set.empty()) {
init_fts_doc_id_for_ref(
foreign->foreign_table, depth);
}
@@ -1714,7 +1610,6 @@ row_update_for_mysql(row_prebuilt_t* prebuilt)
dict_table_t* table = prebuilt->table;
trx_t* trx = prebuilt->trx;
ulint fk_depth = 0;
- bool got_s_lock = false;
DBUG_ENTER("row_update_for_mysql");
@@ -1744,18 +1639,6 @@ row_update_for_mysql(row_prebuilt_t* prebuilt)
trx_start_if_not_started_xa(trx, true);
}
- if (dict_table_is_referenced_by_foreign_key(table)) {
- /* Share lock the data dictionary to prevent any
- table dictionary (for foreign constraint) change.
- This is similar to row_ins_check_foreign_constraint
- check protect by the dictionary lock as well.
- In the future, this can be removed once the Foreign
- key MDL is implemented */
- row_mysql_freeze_data_dictionary(trx);
- init_fts_doc_id_for_ref(table, &fk_depth);
- row_mysql_unfreeze_data_dictionary(trx);
- }
-
node = prebuilt->upd_node;
const bool is_delete = node->is_delete == PLAIN_DELETE;
ut_ad(node->table == table);
@@ -1763,8 +1646,7 @@ row_update_for_mysql(row_prebuilt_t* prebuilt)
clust_index = dict_table_get_first_index(table);
btr_pcur_copy_stored_position(node->pcur,
- prebuilt->pcur->btr_cur.index
- == clust_index
+ prebuilt->pcur->index() == clust_index
? prebuilt->pcur
: prebuilt->clust_pcur);
@@ -1777,7 +1659,7 @@ row_update_for_mysql(row_prebuilt_t* prebuilt)
generated for the table: MySQL does not know anything about
the row id used as the clustered index key */
- savept = trx_savept_take(trx);
+ savept.least_undo_no = trx->undo_no;
thr = que_fork_get_first_thr(prebuilt->upd_graph);
@@ -1785,8 +1667,6 @@ row_update_for_mysql(row_prebuilt_t* prebuilt)
ut_ad(!prebuilt->sql_stat_start);
- thr->start_running();
-
ut_ad(!prebuilt->versioned_write || node->table->versioned());
if (prebuilt->versioned_write) {
@@ -1810,8 +1690,6 @@ row_update_for_mysql(row_prebuilt_t* prebuilt)
break;
}
- que_thr_stop_for_mysql(thr);
-
if (err == DB_RECORD_NOT_FOUND) {
trx->error_state = DB_SUCCESS;
goto error;
@@ -1830,8 +1708,6 @@ row_update_for_mysql(row_prebuilt_t* prebuilt)
}
}
- thr->stop_no_error();
-
if (dict_table_has_fts_index(table)
&& trx->fts_next_doc_id != UINT64_UNDEFINED) {
err = row_fts_update_or_delete(prebuilt);
@@ -1842,15 +1718,12 @@ row_update_for_mysql(row_prebuilt_t* prebuilt)
}
/* Completed cascading operations (if any) */
- if (got_s_lock) {
- row_mysql_unfreeze_data_dictionary(trx);
- }
-
bool update_statistics;
ut_ad(is_delete == (node->is_delete == PLAIN_DELETE));
if (is_delete) {
- /* Not protected by dict_sys.mutex for performance
+ /* Not protected by dict_sys.latch
+ or prebuilt->table->stats_mutex_lock() for performance
reasons, we would rather get garbage in stat_n_rows (which is
just an estimate anyway) than protecting the following code
with a latch. */
@@ -1881,22 +1754,14 @@ row_update_for_mysql(row_prebuilt_t* prebuilt)
prebuilt->table->stat_modified_counter++;
}
- trx->op_info = "";
-
- DBUG_RETURN(err);
-
error:
trx->op_info = "";
- if (got_s_lock) {
- row_mysql_unfreeze_data_dictionary(trx);
- }
-
DBUG_RETURN(err);
}
/** This can only be used when the current transaction is at
READ COMMITTED or READ UNCOMMITTED isolation level.
-Before calling this function row_search_for_mysql() must have
+Before calling this function row_search_mvcc() must have
initialized prebuilt->new_rec_locks to store the information which new
record locks really were set. This function removes a newly set
clustered index record lock under prebuilt->pcur or
@@ -1912,56 +1777,29 @@ row_unlock_for_mysql(
row_prebuilt_t* prebuilt,
ibool has_latches_on_recs)
{
- btr_pcur_t* pcur = prebuilt->pcur;
- btr_pcur_t* clust_pcur = prebuilt->clust_pcur;
- trx_t* trx = prebuilt->trx;
-
- ut_ad(prebuilt != NULL);
- ut_ad(trx != NULL);
- ut_ad(trx->isolation_level <= TRX_ISO_READ_COMMITTED);
-
- if (dict_index_is_spatial(prebuilt->index)) {
- return;
- }
-
- trx->op_info = "unlock_row";
-
- if (prebuilt->new_rec_locks >= 1) {
+ if (prebuilt->new_rec_locks == 1 && prebuilt->index->is_clust()) {
+ trx_t* trx = prebuilt->trx;
+ ut_ad(trx->isolation_level <= TRX_ISO_READ_COMMITTED);
+ trx->op_info = "unlock_row";
const rec_t* rec;
dict_index_t* index;
trx_id_t rec_trx_id;
mtr_t mtr;
+ btr_pcur_t* pcur = prebuilt->pcur;
mtr_start(&mtr);
/* Restore the cursor position and find the record */
- if (!has_latches_on_recs) {
- btr_pcur_restore_position(BTR_SEARCH_LEAF, pcur, &mtr);
+ if (!has_latches_on_recs
+ && pcur->restore_position(BTR_SEARCH_LEAF, &mtr)
+ != btr_pcur_t::SAME_ALL) {
+ goto no_unlock;
}
rec = btr_pcur_get_rec(pcur);
- index = btr_pcur_get_btr_cur(pcur)->index;
-
- if (prebuilt->new_rec_locks >= 2) {
- /* Restore the cursor position and find the record
- in the clustered index. */
-
- if (!has_latches_on_recs) {
- btr_pcur_restore_position(BTR_SEARCH_LEAF,
- clust_pcur, &mtr);
- }
-
- rec = btr_pcur_get_rec(clust_pcur);
- index = btr_pcur_get_btr_cur(clust_pcur)->index;
- }
-
- if (!dict_index_is_clust(index)) {
- /* This is not a clustered index record. We
- do not know how to unlock the record. */
- goto no_unlock;
- }
+ index = pcur->index();
/* If the record has been modified by this
transaction, do not unlock it. */
@@ -1993,60 +1831,15 @@ row_unlock_for_mysql(
lock_rec_unlock(
trx,
- btr_pcur_get_block(pcur),
+ btr_pcur_get_block(pcur)->page.id(),
rec,
static_cast<enum lock_mode>(
prebuilt->select_lock_type));
-
- if (prebuilt->new_rec_locks >= 2) {
- rec = btr_pcur_get_rec(clust_pcur);
-
- lock_rec_unlock(
- trx,
- btr_pcur_get_block(clust_pcur),
- rec,
- static_cast<enum lock_mode>(
- prebuilt->select_lock_type));
- }
}
no_unlock:
mtr_commit(&mtr);
+ trx->op_info = "";
}
-
- trx->op_info = "";
-}
-
-/*********************************************************************//**
-Locks the data dictionary in shared mode from modifications, for performing
-foreign key check, rollback, or other operation invisible to MySQL. */
-void
-row_mysql_freeze_data_dictionary_func(
-/*==================================*/
- trx_t* trx, /*!< in/out: transaction */
- const char* file, /*!< in: file name */
- unsigned line) /*!< in: line number */
-{
- ut_a(trx->dict_operation_lock_mode == 0);
-
- rw_lock_s_lock_inline(&dict_sys.latch, 0, file, line);
-
- trx->dict_operation_lock_mode = RW_S_LATCH;
-}
-
-/*********************************************************************//**
-Unlocks the data dictionary shared lock. */
-void
-row_mysql_unfreeze_data_dictionary(
-/*===============================*/
- trx_t* trx) /*!< in/out: transaction */
-{
- ut_ad(lock_trx_has_sys_table_locks(trx) == NULL);
-
- ut_a(trx->dict_operation_lock_mode == RW_S_LATCH);
-
- rw_lock_s_unlock(&dict_sys.latch);
-
- trx->dict_operation_lock_mode = 0;
}
/** Write query start time as SQL field data to a buffer. Needed by InnoDB.
@@ -2151,10 +1944,7 @@ static dberr_t row_update_vers_insert(que_thr_t* thr, upd_node_t* node)
switch (trx->error_state) {
case DB_LOCK_WAIT:
- que_thr_stop_for_mysql(thr);
- lock_wait_suspend_thread(thr);
-
- if (trx->error_state == DB_SUCCESS) {
+ if (lock_wait(thr) == DB_SUCCESS) {
continue;
}
@@ -2225,10 +2015,7 @@ row_update_cascade_for_mysql(
switch (trx->error_state) {
case DB_LOCK_WAIT:
- que_thr_stop_for_mysql(thr);
- lock_wait_suspend_thread(thr);
-
- if (trx->error_state == DB_SUCCESS) {
+ if (lock_wait(thr) == DB_SUCCESS) {
continue;
}
@@ -2243,7 +2030,8 @@ row_update_cascade_for_mysql(
bool stats;
if (node->is_delete == PLAIN_DELETE) {
- /* Not protected by dict_sys.mutex for
+ /* Not protected by dict_sys.latch
+ or node->table->stats_mutex_lock() for
performance reasons, we would rather
get garbage in stat_n_rows (which is
just an estimate anyway) than
@@ -2273,35 +2061,6 @@ row_update_cascade_for_mysql(
}
/*********************************************************************//**
-Locks the data dictionary exclusively for performing a table create or other
-data dictionary modification operation. */
-void
-row_mysql_lock_data_dictionary_func(
-/*================================*/
- trx_t* trx, /*!< in/out: transaction */
- const char* file, /*!< in: file name */
- unsigned line) /*!< in: line number */
-{
- ut_a(trx->dict_operation_lock_mode == 0
- || trx->dict_operation_lock_mode == RW_X_LATCH);
- dict_sys.lock(file, line);
- trx->dict_operation_lock_mode = RW_X_LATCH;
-}
-
-/*********************************************************************//**
-Unlocks the data dictionary exclusive lock. */
-void
-row_mysql_unlock_data_dictionary(
-/*=============================*/
- trx_t* trx) /*!< in/out: transaction */
-{
- ut_ad(lock_trx_has_sys_table_locks(trx) == NULL);
- ut_a(trx->dict_operation_lock_mode == RW_X_LATCH);
- trx->dict_operation_lock_mode = 0;
- dict_sys.unlock();
-}
-
-/*********************************************************************//**
Creates a table for MySQL. On failure the transaction will be rolled back
and the 'table' object will be freed.
@return error code or DB_SUCCESS */
@@ -2311,44 +2070,31 @@ row_create_table_for_mysql(
dict_table_t* table, /*!< in, own: table definition
(will be freed, or on DB_SUCCESS
added to the data dictionary cache) */
- trx_t* trx, /*!< in/out: transaction */
- fil_encryption_t mode, /*!< in: encryption mode */
- uint32_t key_id) /*!< in: encryption key_id */
+ trx_t* trx) /*!< in/out: transaction */
{
tab_node_t* node;
mem_heap_t* heap;
que_thr_t* thr;
- dberr_t err;
- ut_d(dict_sys.assert_locked());
- ut_ad(trx->dict_operation_lock_mode == RW_X_LATCH);
+ ut_ad(trx->state == TRX_STATE_ACTIVE);
+ ut_ad(dict_sys.sys_tables_exist());
+ ut_ad(dict_sys.locked());
+ ut_ad(trx->dict_operation_lock_mode);
+
+ DEBUG_SYNC_C("create_table");
DBUG_EXECUTE_IF(
"ib_create_table_fail_at_start_of_row_create_table_for_mysql",
- dict_mem_table_free(table);
- trx->op_info = "";
- return DB_ERROR;
+ dict_mem_table_free(table); return DB_ERROR;
);
trx->op_info = "creating table";
- trx_start_if_not_started_xa(trx, true);
-
heap = mem_heap_create(512);
- switch (trx_get_dict_operation(trx)) {
- case TRX_DICT_OP_NONE:
- trx_set_dict_operation(trx, TRX_DICT_OP_TABLE);
- case TRX_DICT_OP_TABLE:
- break;
- case TRX_DICT_OP_INDEX:
- /* If the transaction was previously flagged as
- TRX_DICT_OP_INDEX, we should be creating auxiliary
- tables for full-text indexes. */
- ut_ad(strstr(table->name.m_name, "/FTS_") != NULL);
- }
+ trx->dict_operation = true;
- node = tab_create_graph_create(table, heap, mode, key_id);
+ node = tab_create_graph_create(table, heap);
thr = pars_complete_graph_for_exec(node, trx, heap, NULL);
@@ -2357,62 +2103,12 @@ row_create_table_for_mysql(
que_run_threads(thr);
- err = trx->error_state;
-
- /* Update SYS_TABLESPACES and SYS_DATAFILES if a new file-per-table
- tablespace was created. */
- if (err == DB_SUCCESS && dict_table_is_file_per_table(table)) {
- err = dict_replace_tablespace_in_dictionary(
- table->space_id, table->name.m_name,
- table->space->flags,
- table->space->chain.start->name, trx);
-
- if (err != DB_SUCCESS) {
-
- /* We must delete the link file. */
- RemoteDatafile::delete_link_file(table->name.m_name);
- }
- }
-
- switch (err) {
- case DB_SUCCESS:
- break;
- case DB_OUT_OF_FILE_SPACE:
- trx->error_state = DB_SUCCESS;
- trx->rollback();
-
- ib::warn() << "Cannot create table "
- << table->name
- << " because tablespace full";
-
- if (dict_table_open_on_name(table->name.m_name, TRUE, FALSE,
- DICT_ERR_IGNORE_NONE)) {
-
- dict_table_close_and_drop(trx, table);
- } else {
- dict_mem_table_free(table);
- }
-
- break;
-
- case DB_UNSUPPORTED:
- case DB_TOO_MANY_CONCURRENT_TRXS:
- /* We already have .ibd file here. it should be deleted. */
-
- if (dict_table_is_file_per_table(table)
- && fil_delete_tablespace(table->space_id) != DB_SUCCESS) {
- ib::error() << "Cannot delete the file of table "
- << table->name;
- }
- /* fall through */
+ dberr_t err = trx->error_state;
- case DB_DUPLICATE_KEY:
- case DB_TABLESPACE_EXISTS:
- default:
+ if (err != DB_SUCCESS) {
trx->error_state = DB_SUCCESS;
trx->rollback();
dict_mem_table_free(table);
- break;
}
que_graph_free((que_t*) que_node_get_parent(thr));
@@ -2432,12 +2128,14 @@ row_create_index_for_mysql(
dict_index_t* index, /*!< in, own: index definition
(will be freed) */
trx_t* trx, /*!< in: transaction handle */
- const ulint* field_lengths) /*!< in: if not NULL, must contain
+ const ulint* field_lengths, /*!< in: if not NULL, must contain
dict_index_get_n_fields(index)
actual field lengths for the
index columns, which are
then checked for not being too
large. */
+ fil_encryption_t mode, /*!< in: encryption mode */
+ uint32_t key_id) /*!< in: encryption key_id */
{
ind_node_t* node;
mem_heap_t* heap;
@@ -2447,7 +2145,7 @@ row_create_index_for_mysql(
ulint len;
dict_table_t* table = index->table;
- ut_d(dict_sys.assert_locked());
+ ut_ad(dict_sys.locked());
for (i = 0; i < index->n_def; i++) {
/* Check that prefix_len and actual length
@@ -2471,21 +2169,21 @@ row_create_index_for_mysql(
}
}
- trx->op_info = "creating index";
-
/* For temp-table we avoid insertion into SYSTEM TABLES to
maintain performance and so we have separate path that directly
just updates dictonary cache. */
if (!table->is_temporary()) {
- trx_start_if_not_started_xa(trx, true);
- trx_set_dict_operation(trx, TRX_DICT_OP_TABLE);
+ ut_ad(trx->state == TRX_STATE_ACTIVE);
+ ut_ad(trx->dict_operation);
+ trx->op_info = "creating index";
+
/* Note that the space id where we store the index is
inherited from the table in dict_build_index_def_step()
in dict0crea.cc. */
heap = mem_heap_create(512);
node = ind_create_graph_create(index, table->name.m_name,
- heap);
+ heap, mode, key_id);
thr = pars_complete_graph_for_exec(node, trx, heap, NULL);
@@ -2506,6 +2204,8 @@ row_create_index_for_mysql(
if (index && (index->type & DICT_FTS)) {
err = fts_create_index_tables(trx, index, table->id);
}
+
+ trx->op_info = "";
} else {
dict_build_index_def(table, index, trx);
@@ -2527,270 +2227,9 @@ row_create_index_for_mysql(
}
}
- trx->op_info = "";
-
return(err);
}
-/*********************************************************************//**
-Drops a table for MySQL as a background operation. MySQL relies on Unix
-in ALTER TABLE to the fact that the table handler does not remove the
-table before all handles to it has been removed. Furhermore, the MySQL's
-call to drop table must be non-blocking. Therefore we do the drop table
-as a background operation, which is taken care of by the master thread
-in srv0srv.cc.
-@return error code or DB_SUCCESS */
-static
-dberr_t
-row_drop_table_for_mysql_in_background(
-/*===================================*/
- const char* name) /*!< in: table name */
-{
- dberr_t error;
- trx_t* trx;
-
- trx = trx_create();
-
- /* If the original transaction was dropping a table referenced by
- foreign keys, we must set the following to be able to drop the
- table: */
-
- trx->check_foreigns = false;
-
- /* Try to drop the table in InnoDB */
-
- error = row_drop_table_for_mysql(name, trx, SQLCOM_TRUNCATE);
-
- trx_commit_for_mysql(trx);
-
- trx->free();
-
- return(error);
-}
-
-/*********************************************************************//**
-The master thread in srv0srv.cc calls this regularly to drop tables which
-we must drop in background after queries to them have ended. Such lazy
-dropping of tables is needed in ALTER TABLE on Unix.
-@return how many tables dropped + remaining tables in list */
-ulint
-row_drop_tables_for_mysql_in_background(void)
-/*=========================================*/
-{
- row_mysql_drop_t* drop;
- dict_table_t* table;
- ulint n_tables;
- ulint n_tables_dropped = 0;
-loop:
- mutex_enter(&row_drop_list_mutex);
-
- ut_a(row_mysql_drop_list_inited);
-next:
- drop = UT_LIST_GET_FIRST(row_mysql_drop_list);
-
- n_tables = UT_LIST_GET_LEN(row_mysql_drop_list);
-
- mutex_exit(&row_drop_list_mutex);
-
- if (drop == NULL) {
- /* All tables dropped */
-
- return(n_tables + n_tables_dropped);
- }
-
- /* On fast shutdown, just empty the list without dropping tables. */
- table = srv_shutdown_state == SRV_SHUTDOWN_NONE || !srv_fast_shutdown
- ? dict_table_open_on_id(drop->table_id, FALSE,
- DICT_TABLE_OP_OPEN_ONLY_IF_CACHED)
- : NULL;
-
- if (!table) {
- n_tables_dropped++;
- mutex_enter(&row_drop_list_mutex);
- UT_LIST_REMOVE(row_mysql_drop_list, drop);
- MONITOR_DEC(MONITOR_BACKGROUND_DROP_TABLE);
- ut_free(drop);
- goto next;
- }
-
- ut_a(!table->can_be_evicted);
-
- bool skip = false;
-
- if (!table->to_be_dropped) {
-skip:
- dict_table_close(table, FALSE, FALSE);
-
- mutex_enter(&row_drop_list_mutex);
- UT_LIST_REMOVE(row_mysql_drop_list, drop);
- if (!skip) {
- UT_LIST_ADD_LAST(row_mysql_drop_list, drop);
- } else {
- ut_free(drop);
- }
- goto next;
- }
-
- if (!srv_fast_shutdown && !trx_sys.any_active_transactions()) {
- lock_mutex_enter();
- skip = UT_LIST_GET_LEN(table->locks) != 0;
- lock_mutex_exit();
- if (skip) {
- /* We cannot drop tables that are locked by XA
- PREPARE transactions. */
- goto skip;
- }
- }
-
- char* name = mem_strdup(table->name.m_name);
-
- dict_table_close(table, FALSE, FALSE);
-
- dberr_t err = row_drop_table_for_mysql_in_background(name);
-
- ut_free(name);
-
- if (err != DB_SUCCESS) {
- /* If the DROP fails for some table, we return, and let the
- main thread retry later */
- return(n_tables + n_tables_dropped);
- }
-
- goto loop;
-}
-
-/*********************************************************************//**
-Get the background drop list length. NOTE: the caller must own the
-drop list mutex!
-@return how many tables in list */
-ulint
-row_get_background_drop_list_len_low(void)
-/*======================================*/
-{
- ulint len;
-
- mutex_enter(&row_drop_list_mutex);
-
- ut_a(row_mysql_drop_list_inited);
-
- len = UT_LIST_GET_LEN(row_mysql_drop_list);
-
- mutex_exit(&row_drop_list_mutex);
-
- return(len);
-}
-
-/** Drop garbage tables during recovery. */
-void
-row_mysql_drop_garbage_tables()
-{
- mem_heap_t* heap = mem_heap_create(FN_REFLEN);
- btr_pcur_t pcur;
- mtr_t mtr;
- trx_t* trx = trx_create();
- trx->op_info = "dropping garbage tables";
- row_mysql_lock_data_dictionary(trx);
-
- mtr.start();
- btr_pcur_open_at_index_side(
- true, dict_table_get_first_index(dict_sys.sys_tables),
- BTR_SEARCH_LEAF, &pcur, true, 0, &mtr);
-
- for (;;) {
- const rec_t* rec;
- const byte* field;
- ulint len;
- const char* table_name;
-
- btr_pcur_move_to_next_user_rec(&pcur, &mtr);
-
- if (!btr_pcur_is_on_user_rec(&pcur)) {
- break;
- }
-
- rec = btr_pcur_get_rec(&pcur);
- if (rec_get_deleted_flag(rec, 0)) {
- continue;
- }
-
- field = rec_get_nth_field_old(rec, 0/*NAME*/, &len);
- if (len == UNIV_SQL_NULL || len == 0) {
- /* Corrupted SYS_TABLES.NAME */
- continue;
- }
-
- table_name = mem_heap_strdupl(
- heap,
- reinterpret_cast<const char*>(field), len);
- if (strstr(table_name, "/" TEMP_FILE_PREFIX "-") &&
- !strstr(table_name, "/" TEMP_FILE_PREFIX "-backup-") &&
- !strstr(table_name, "/" TEMP_FILE_PREFIX "-exchange-"))
- {
- btr_pcur_store_position(&pcur, &mtr);
- btr_pcur_commit_specify_mtr(&pcur, &mtr);
-
- if (dict_load_table(table_name,
- DICT_ERR_IGNORE_DROP)) {
- row_drop_table_for_mysql(table_name, trx,
- SQLCOM_DROP_TABLE);
- trx_commit_for_mysql(trx);
- }
-
- mtr.start();
- btr_pcur_restore_position(BTR_SEARCH_LEAF,
- &pcur, &mtr);
- }
-
- mem_heap_empty(heap);
- }
-
- btr_pcur_close(&pcur);
- mtr.commit();
- row_mysql_unlock_data_dictionary(trx);
- trx->free();
- mem_heap_free(heap);
-}
-
-/*********************************************************************//**
-If a table is not yet in the drop list, adds the table to the list of tables
-which the master thread drops in background. We need this on Unix because in
-ALTER TABLE MySQL may call drop table even if the table has running queries on
-it. Also, if there are running foreign key checks on the table, we drop the
-table lazily.
-@return whether background DROP TABLE was scheduled for the first time */
-static
-bool
-row_add_table_to_background_drop_list(table_id_t table_id)
-{
- row_mysql_drop_t* drop;
- bool added = true;
-
- mutex_enter(&row_drop_list_mutex);
-
- ut_a(row_mysql_drop_list_inited);
-
- /* Look if the table already is in the drop list */
- for (drop = UT_LIST_GET_FIRST(row_mysql_drop_list);
- drop != NULL;
- drop = UT_LIST_GET_NEXT(row_mysql_drop_list, drop)) {
-
- if (drop->table_id == table_id) {
- added = false;
- goto func_exit;
- }
- }
-
- drop = static_cast<row_mysql_drop_t*>(ut_malloc_nokey(sizeof *drop));
- drop->table_id = table_id;
-
- UT_LIST_ADD_LAST(row_mysql_drop_list, drop);
-
- MONITOR_INC(MONITOR_BACKGROUND_DROP_TABLE);
-func_exit:
- mutex_exit(&row_drop_list_mutex);
- return added;
-}
-
/** Reassigns the table identifier of a table.
@param[in,out] table table
@param[in,out] trx transaction
@@ -2803,6 +2242,13 @@ row_mysql_table_id_reassign(
trx_t* trx,
table_id_t* new_id)
{
+ if (!dict_sys.sys_tables || dict_sys.sys_tables->corrupted ||
+ !dict_sys.sys_columns || dict_sys.sys_columns->corrupted ||
+ !dict_sys.sys_indexes || dict_sys.sys_indexes->corrupted ||
+ !dict_sys.sys_virtual || dict_sys.sys_virtual->corrupted) {
+ return DB_CORRUPTION;
+ }
+
dberr_t err;
pars_info_t* info = pars_info_create();
@@ -2827,48 +2273,12 @@ row_mysql_table_id_reassign(
" WHERE TABLE_ID = :old_id;\n"
"UPDATE SYS_VIRTUAL SET TABLE_ID = :new_id\n"
" WHERE TABLE_ID = :old_id;\n"
- "END;\n", FALSE, trx);
+ "END;\n", trx);
return(err);
}
/*********************************************************************//**
-Setup the pre-requisites for DISCARD TABLESPACE. It will start the transaction,
-acquire the data dictionary lock in X mode and open the table.
-@return table instance or 0 if not found. */
-static
-dict_table_t*
-row_discard_tablespace_begin(
-/*=========================*/
- const char* name, /*!< in: table name */
- trx_t* trx) /*!< in: transaction handle */
-{
- trx->op_info = "discarding tablespace";
-
- trx_set_dict_operation(trx, TRX_DICT_OP_TABLE);
-
- trx_start_if_not_started_xa(trx, true);
-
- /* Serialize data dictionary operations with dictionary mutex:
- this is to avoid deadlocks during data dictionary operations */
-
- row_mysql_lock_data_dictionary(trx);
-
- dict_table_t* table;
-
- table = dict_table_open_on_name(
- name, TRUE, FALSE, DICT_ERR_IGNORE_FK_NOKEY);
-
- if (table) {
- dict_stats_wait_bg_to_stop_using_table(table, trx);
- ut_a(!is_system_tablespace(table->space_id));
- ut_ad(!table->n_foreign_key_checks_running);
- }
-
- return(table);
-}
-
-/*********************************************************************//**
Do the foreign key constraint checks.
@return DB_SUCCESS or error code. */
static
@@ -2903,7 +2313,7 @@ row_discard_tablespace_foreign_key_checks(
/* We only allow discarding a referenced table if
FOREIGN_KEY_CHECKS is set to 0 */
- mutex_enter(&dict_foreign_err_mutex);
+ mysql_mutex_lock(&dict_foreign_err_mutex);
rewind(ef);
@@ -2916,44 +2326,12 @@ row_discard_tablespace_foreign_key_checks(
ut_print_name(ef, trx, foreign->foreign_table_name);
putc('\n', ef);
- mutex_exit(&dict_foreign_err_mutex);
+ mysql_mutex_unlock(&dict_foreign_err_mutex);
return(DB_CANNOT_DROP_CONSTRAINT);
}
/*********************************************************************//**
-Cleanup after the DISCARD TABLESPACE operation.
-@return error code. */
-static
-dberr_t
-row_discard_tablespace_end(
-/*=======================*/
- trx_t* trx, /*!< in/out: transaction handle */
- dict_table_t* table, /*!< in/out: table to be discarded */
- dberr_t err) /*!< in: error code */
-{
- if (table != 0) {
- dict_table_close(table, TRUE, FALSE);
- }
-
- DBUG_EXECUTE_IF("ib_discard_before_commit_crash",
- log_buffer_flush_to_disk();
- DBUG_SUICIDE(););
-
- trx_commit_for_mysql(trx);
-
- DBUG_EXECUTE_IF("ib_discard_after_commit_crash",
- log_buffer_flush_to_disk();
- DBUG_SUICIDE(););
-
- row_mysql_unlock_data_dictionary(trx);
-
- trx->op_info = "";
-
- return(err);
-}
-
-/*********************************************************************//**
Do the DISCARD TABLESPACE operation.
@return DB_SUCCESS or error code. */
static
@@ -2963,17 +2341,17 @@ row_discard_tablespace(
trx_t* trx, /*!< in/out: transaction handle */
dict_table_t* table) /*!< in/out: table to be discarded */
{
- dberr_t err;
+ dberr_t err;
/* How do we prevent crashes caused by ongoing operations on
the table? Old operations could try to access non-existent
- pages. MySQL will block all DML on the table using MDL and a
+ pages. The SQL layer will block all DML on the table using MDL and a
DISCARD will not start unless all existing operations on the
table to be discarded are completed.
- 1) Acquire the data dictionary latch in X mode. To prevent any
- internal operations that MySQL is not aware off and also for
- the internal SQL parser.
+ 1) Acquire the data dictionary latch in X mode. This will
+ prevent any internal operations that are not covered by
+ MDL or InnoDB table locks.
2) Purge and rollback: we assign a new table id for the
table. Since purge and rollback look for the table based on
@@ -3006,7 +2384,7 @@ row_discard_tablespace(
if (dict_table_has_fts_index(table)
|| DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS_HAS_DOC_ID)) {
- fts_drop_tables(trx, table);
+ fts_drop_tables(trx, *table);
}
/* Assign a new space ID to the table definition so that purge
@@ -3018,29 +2396,9 @@ row_discard_tablespace(
return(err);
}
- /* Discard the physical file that is used for the tablespace. */
- err = fil_delete_tablespace(table->space_id);
- switch (err) {
- case DB_IO_ERROR:
- ib::warn() << "ALTER TABLE " << table->name
- << " DISCARD TABLESPACE failed to delete file";
- break;
- case DB_TABLESPACE_NOT_FOUND:
- ib::warn() << "ALTER TABLE " << table->name
- << " DISCARD TABLESPACE failed to find tablespace";
- break;
- case DB_SUCCESS:
- break;
- default:
- ut_error;
- }
-
/* All persistent operations successful, update the
data dictionary memory cache. */
- table->file_unreadable = true;
- table->space = NULL;
- table->flags2 |= DICT_TF2_DISCARDED;
dict_table_change_id_in_cache(table, new_id);
dict_index_t* index = UT_LIST_GET_FIRST(table->indexes);
@@ -3062,992 +2420,81 @@ Discards the tablespace of a table which stored in an .ibd file. Discarding
means that this function renames the .ibd file and assigns a new table id for
the table. Also the file_unreadable flag is set.
@return error code or DB_SUCCESS */
-dberr_t
-row_discard_tablespace_for_mysql(
-/*=============================*/
- const char* name, /*!< in: table name */
- trx_t* trx) /*!< in: transaction handle */
-{
- dberr_t err;
- dict_table_t* table;
-
- /* Open the table and start the transaction if not started. */
-
- table = row_discard_tablespace_begin(name, trx);
-
- if (table == 0) {
- err = DB_TABLE_NOT_FOUND;
- } else if (table->is_temporary()) {
-
- ib_senderrf(trx->mysql_thd, IB_LOG_LEVEL_ERROR,
- ER_CANNOT_DISCARD_TEMPORARY_TABLE);
-
- err = DB_ERROR;
-
- } else if (table->space_id == TRX_SYS_SPACE) {
- char table_name[MAX_FULL_NAME_LEN + 1];
-
- innobase_format_name(
- table_name, sizeof(table_name),
- table->name.m_name);
-
- ib_senderrf(trx->mysql_thd, IB_LOG_LEVEL_ERROR,
- ER_TABLE_IN_SYSTEM_TABLESPACE, table_name);
-
- err = DB_ERROR;
-
- } else {
- ut_ad(!table->n_foreign_key_checks_running);
-
- bool fts_exist = (dict_table_has_fts_index(table)
- || DICT_TF2_FLAG_IS_SET(
- table, DICT_TF2_FTS_HAS_DOC_ID));
-
- if (fts_exist) {
- row_mysql_unlock_data_dictionary(trx);
- fts_optimize_remove_table(table);
- row_mysql_lock_data_dictionary(trx);
- }
-
- /* Do foreign key constraint checks. */
-
- err = row_discard_tablespace_foreign_key_checks(trx, table);
-
- if (err == DB_SUCCESS) {
- /* Note: This cannot be rolled back.
- Rollback would see the UPDATE SYS_INDEXES
- as two operations: DELETE and INSERT.
- It would invoke btr_free_if_exists()
- when rolling back the INSERT, effectively
- dropping all indexes of the table. */
- err = row_discard_tablespace(trx, table);
- }
-
- if (fts_exist && err != DB_SUCCESS) {
- fts_optimize_add_table(table);
- }
- }
-
- return(row_discard_tablespace_end(trx, table, err));
-}
-
-/*********************************************************************//**
-Sets an exclusive lock on a table.
-@return error code or DB_SUCCESS */
-dberr_t
-row_mysql_lock_table(
-/*=================*/
- trx_t* trx, /*!< in/out: transaction */
- dict_table_t* table, /*!< in: table to lock */
- enum lock_mode mode, /*!< in: LOCK_X or LOCK_S */
- const char* op_info) /*!< in: string for trx->op_info */
+dberr_t row_discard_tablespace_for_mysql(dict_table_t *table, trx_t *trx)
{
- mem_heap_t* heap;
- que_thr_t* thr;
- dberr_t err;
- sel_node_t* node;
-
- ut_ad(mode == LOCK_X || mode == LOCK_S);
+ ut_ad(!is_system_tablespace(table->space_id));
+ ut_ad(!table->is_temporary());
- heap = mem_heap_create(512);
-
- trx->op_info = op_info;
-
- node = sel_node_create(heap);
- thr = pars_complete_graph_for_exec(node, trx, heap, NULL);
- thr->graph->state = QUE_FORK_ACTIVE;
-
- /* We use the select query graph as the dummy graph needed
- in the lock module call */
+ const auto fts_exist = table->flags2 &
+ (DICT_TF2_FTS_HAS_DOC_ID | DICT_TF2_FTS);
- thr = que_fork_get_first_thr(
- static_cast<que_fork_t*>(que_node_get_parent(thr)));
+ dberr_t err;
- thr->start_running();
-
-run_again:
- thr->run_node = thr;
- thr->prev_node = thr->common.parent;
-
- err = lock_table(0, table, mode, thr);
-
- trx->error_state = err;
-
- if (err == DB_SUCCESS) {
- thr->stop_no_error();
- } else {
- que_thr_stop_for_mysql(thr);
-
- if (row_mysql_handle_errors(&err, trx, thr, NULL)) {
- goto run_again;
- }
- }
-
- que_graph_free(thr->graph);
- trx->op_info = "";
-
- return(err);
-}
-
-/** Drop ancillary FTS tables as part of dropping a table.
-@param[in,out] table Table cache entry
-@param[in,out] trx Transaction handle
-@return error code or DB_SUCCESS */
-UNIV_INLINE
-dberr_t
-row_drop_ancillary_fts_tables(
- dict_table_t* table,
- trx_t* trx)
-{
- /* Drop ancillary FTS tables */
- if (dict_table_has_fts_index(table)
- || DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS_HAS_DOC_ID)) {
-
- ut_ad(table->get_ref_count() == 0);
- ut_ad(trx_is_started(trx));
-
- dberr_t err = fts_drop_tables(trx, table);
-
- if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
- ib::error() << " Unable to remove ancillary FTS"
- " tables for table "
- << table->name << " : " << err;
-
- return(err);
- }
- }
-
- /* The table->fts flag can be set on the table for which
- the cluster index is being rebuilt. Such table might not have
- DICT_TF2_FTS flag set. So keep this out of above
- dict_table_has_fts_index condition */
- if (table->fts != NULL) {
- /* fts_que_graph_free_check_lock would try to acquire
- dict mutex lock */
- table->fts->dict_locked = true;
- table->fts->~fts_t();
- table->fts = nullptr;
- }
-
- return(DB_SUCCESS);
-}
-
-/** Drop a table from the memory cache as part of dropping a table.
-@param[in] tablename A copy of table->name. Used when table == null
-@param[in,out] table Table cache entry
-@param[in,out] trx Transaction handle
-@return error code or DB_SUCCESS */
-UNIV_INLINE
-dberr_t
-row_drop_table_from_cache(
- const char* tablename,
- dict_table_t* table,
- trx_t* trx)
-{
- dberr_t err = DB_SUCCESS;
- ut_ad(!table->is_temporary());
-
- /* Remove the pointer to this table object from the list
- of modified tables by the transaction because the object
- is going to be destroyed below. */
- trx->mod_tables.erase(table);
-
- dict_sys.remove(table);
-
- if (dict_load_table(tablename, DICT_ERR_IGNORE_FK_NOKEY)) {
- ib::error() << "Not able to remove table "
- << ut_get_name(trx, tablename)
- << " from the dictionary cache!";
- err = DB_ERROR;
- }
-
- return(err);
-}
-
-/** Drop a table for MySQL.
-If the data dictionary was not already locked by the transaction,
-the transaction will be committed. Otherwise, the data dictionary
-will remain locked.
-@param[in] name Table name
-@param[in,out] trx Transaction handle
-@param[in] sqlcom type of SQL operation
-@param[in] create_failed true=create table failed
- because e.g. foreign key column
-@param[in] nonatomic Whether it is permitted to release
- and reacquire dict_sys.latch
-@return error code or DB_SUCCESS */
-dberr_t
-row_drop_table_for_mysql(
- const char* name,
- trx_t* trx,
- enum_sql_command sqlcom,
- bool create_failed,
- bool nonatomic)
-{
- dberr_t err;
- dict_foreign_t* foreign;
- dict_table_t* table;
- char* tablename = NULL;
- bool locked_dictionary = false;
- pars_info_t* info = NULL;
- mem_heap_t* heap = NULL;
-
-
- DBUG_ENTER("row_drop_table_for_mysql");
- DBUG_PRINT("row_drop_table_for_mysql", ("table: '%s'", name));
-
- ut_a(name != NULL);
-
- /* Serialize data dictionary operations with dictionary mutex:
- no deadlocks can occur then in these operations */
-
- trx->op_info = "dropping table";
-
- if (trx->dict_operation_lock_mode != RW_X_LATCH) {
- /* Prevent foreign key checks etc. while we are
- dropping the table */
-
- row_mysql_lock_data_dictionary(trx);
-
- locked_dictionary = true;
- nonatomic = true;
- }
-
- ut_d(dict_sys.assert_locked());
-
- table = dict_table_open_on_name(
- name, TRUE, FALSE,
- static_cast<dict_err_ignore_t>(
- DICT_ERR_IGNORE_INDEX_ROOT
- | DICT_ERR_IGNORE_CORRUPT));
-
- if (!table) {
- if (locked_dictionary) {
- row_mysql_unlock_data_dictionary(trx);
- }
- trx->op_info = "";
- DBUG_RETURN(DB_TABLE_NOT_FOUND);
- }
-
- std::vector<pfs_os_file_t> detached_handles;
-
- const bool is_temp_name = strstr(table->name.m_name,
- "/" TEMP_FILE_PREFIX);
-
- if (table->is_temporary()) {
- ut_ad(table->space == fil_system.temp_space);
- for (dict_index_t* index = dict_table_get_first_index(table);
- index != NULL;
- index = dict_table_get_next_index(index)) {
- btr_free(page_id_t(SRV_TMP_SPACE_ID, index->page));
- }
- /* Remove the pointer to this table object from the list
- of modified tables by the transaction because the object
- is going to be destroyed below. */
- trx->mod_tables.erase(table);
- table->release();
- dict_sys.remove(table);
- err = DB_SUCCESS;
- goto funct_exit_all_freed;
- }
-
- /* This function is called recursively via fts_drop_tables(). */
- if (!trx_is_started(trx)) {
- trx_start_for_ddl(trx, TRX_DICT_OP_TABLE);
- }
-
- /* Turn on this drop bit before we could release the dictionary
- latch */
- table->to_be_dropped = true;
-
- if (nonatomic) {
- /* This trx did not acquire any locks on dictionary
- table records yet. Thus it is safe to release and
- reacquire the data dictionary latches. */
- if (table->fts) {
- row_mysql_unlock_data_dictionary(trx);
- fts_optimize_remove_table(table);
- row_mysql_lock_data_dictionary(trx);
- }
-
- dict_stats_wait_bg_to_stop_using_table(table, trx);
- }
-
- /* make sure background stats thread is not running on the table */
- ut_ad(!(table->stats_bg_flag & BG_STAT_IN_PROGRESS));
- if (!table->no_rollback()) {
- if (table->space != fil_system.sys_space) {
- /* Delete the link file if used. */
- if (DICT_TF_HAS_DATA_DIR(table->flags)) {
- RemoteDatafile::delete_link_file(name);
- }
- }
-
- dict_stats_recalc_pool_del(table);
- dict_stats_defrag_pool_del(table, NULL);
- if (btr_defragment_active) {
- /* During fts_drop_orphaned_tables() the
- btr_defragment_mutex has not yet been
- initialized by btr_defragment_init(). */
- btr_defragment_remove_table(table);
- }
-
- if (UNIV_LIKELY(!strstr(name, "/" TEMP_FILE_PREFIX_INNODB))) {
- /* Remove any persistent statistics for this table,
- in a separate transaction. */
- char errstr[1024];
- err = dict_stats_drop_table(name, errstr,
- sizeof errstr);
- if (err != DB_SUCCESS) {
- ib::warn() << errstr;
- }
- }
- }
-
- dict_table_prevent_eviction(table);
- dict_table_close(table, TRUE, FALSE);
-
- /* Check if the table is referenced by foreign key constraints from
- some other table (not the table itself) */
-
- if (!srv_read_only_mode && trx->check_foreigns) {
-
- for (dict_foreign_set::iterator it
- = table->referenced_set.begin();
- it != table->referenced_set.end();
- ++it) {
-
- foreign = *it;
-
- const bool ref_ok = sqlcom == SQLCOM_DROP_DB
- && dict_tables_have_same_db(
- name,
- foreign->foreign_table_name_lookup);
-
- /* We should allow dropping a referenced table if creating
- that referenced table has failed for some reason. For example
- if referenced table is created but it column types that are
- referenced do not match. */
- if (foreign->foreign_table != table &&
- !create_failed && !ref_ok) {
-
- FILE* ef = dict_foreign_err_file;
-
- /* We only allow dropping a referenced table
- if FOREIGN_KEY_CHECKS is set to 0 */
-
- err = DB_CANNOT_DROP_CONSTRAINT;
-
- mutex_enter(&dict_foreign_err_mutex);
- rewind(ef);
- ut_print_timestamp(ef);
-
- fputs(" Cannot drop table ", ef);
- ut_print_name(ef, trx, name);
- fputs("\n"
- "because it is referenced by ", ef);
- ut_print_name(ef, trx,
- foreign->foreign_table_name);
- putc('\n', ef);
- mutex_exit(&dict_foreign_err_mutex);
-
- goto funct_exit;
- }
- }
- }
-
- DBUG_EXECUTE_IF("row_drop_table_add_to_background", goto defer;);
-
- /* TODO: could we replace the counter n_foreign_key_checks_running
- with lock checks on the table? Acquire here an exclusive lock on the
- table, and rewrite lock0lock.cc and the lock wait in srv0srv.cc so that
- they can cope with the table having been dropped here? Foreign key
- checks take an IS or IX lock on the table. */
-
- if (table->n_foreign_key_checks_running > 0) {
-defer:
- /* Rename #sql-backup to #sql-ib if table has open ref count
- while dropping the table. This scenario can happen
- when purge thread is waiting for dict_sys.mutex so
- that it could close the table. But drop table acquires
- dict_sys.mutex.
- In the future this should use 'tmp_file_prefix'!
- */
- if (!is_temp_name
- || strstr(table->name.m_name, "/#sql-backup-")) {
- heap = mem_heap_create(FN_REFLEN);
- const char* tmp_name
- = dict_mem_create_temporary_tablename(
- heap, table->name.m_name, table->id);
- ib::info() << "Deferring DROP TABLE " << table->name
- << "; renaming to " << tmp_name;
- err = row_rename_table_for_mysql(
- table->name.m_name, tmp_name, trx,
- false, false);
- } else {
- err = DB_SUCCESS;
- }
- if (err == DB_SUCCESS) {
- row_add_table_to_background_drop_list(table->id);
- }
- goto funct_exit;
- }
-
- /* Remove all locks that are on the table or its records, if there
- are no references to the table but it has record locks, we release
- the record locks unconditionally. One use case is:
-
- CREATE TABLE t2 (PRIMARY KEY (a)) SELECT * FROM t1;
-
- If after the user transaction has done the SELECT and there is a
- problem in completing the CREATE TABLE operation, MySQL will drop
- the table. InnoDB will create a new background transaction to do the
- actual drop, the trx instance that is passed to this function. To
- preserve existing behaviour we remove the locks but ideally we
- shouldn't have to. There should never be record locks on a table
- that is going to be dropped. */
-
- if (table->get_ref_count() > 0 || table->n_rec_locks > 0
- || lock_table_has_locks(table)) {
- goto defer;
- }
-
- /* The "to_be_dropped" marks table that is to be dropped, but
- has not been dropped, instead, was put in the background drop
- list due to being used by concurrent DML operations. Clear it
- here since there are no longer any concurrent activities on it,
- and it is free to be dropped */
- table->to_be_dropped = false;
-
- switch (trx_get_dict_operation(trx)) {
- case TRX_DICT_OP_NONE:
- trx_set_dict_operation(trx, TRX_DICT_OP_TABLE);
- trx->table_id = table->id;
- case TRX_DICT_OP_TABLE:
- break;
- case TRX_DICT_OP_INDEX:
- /* If the transaction was previously flagged as
- TRX_DICT_OP_INDEX, we should be dropping auxiliary
- tables for full-text indexes. */
- ut_ad(strstr(table->name.m_name, "/FTS_"));
- }
-
- /* Mark all indexes unavailable in the data dictionary cache
- before starting to drop the table. */
-
- unsigned* page_no;
- unsigned* page_nos;
- heap = mem_heap_create(
- 200 + UT_LIST_GET_LEN(table->indexes) * sizeof *page_nos);
- tablename = mem_heap_strdup(heap, name);
-
- page_no = page_nos = static_cast<unsigned*>(
- mem_heap_alloc(
- heap,
- UT_LIST_GET_LEN(table->indexes) * sizeof *page_no));
-
- for (dict_index_t* index = dict_table_get_first_index(table);
- index != NULL;
- index = dict_table_get_next_index(index)) {
- rw_lock_x_lock(dict_index_get_lock(index));
- /* Save the page numbers so that we can restore them
- if the operation fails. */
- *page_no++ = index->page;
- /* Mark the index unusable. */
- index->page = FIL_NULL;
- rw_lock_x_unlock(dict_index_get_lock(index));
- }
-
- /* Deleting a row from SYS_INDEXES table will invoke
- dict_drop_index_tree(). */
- info = pars_info_create();
-
- pars_info_add_str_literal(info, "name", name);
-
- if (sqlcom != SQLCOM_TRUNCATE
- && strchr(name, '/')
- && dict_table_get_low("SYS_FOREIGN")
- && dict_table_get_low("SYS_FOREIGN_COLS")) {
- err = que_eval_sql(
- info,
- "PROCEDURE DROP_FOREIGN_PROC () IS\n"
- "fid CHAR;\n"
-
- "DECLARE CURSOR fk IS\n"
- "SELECT ID FROM SYS_FOREIGN\n"
- "WHERE FOR_NAME = :name\n"
- "AND TO_BINARY(FOR_NAME) = TO_BINARY(:name)\n"
- "FOR UPDATE;\n"
-
- "BEGIN\n"
- "OPEN fk;\n"
- "WHILE 1 = 1 LOOP\n"
- " FETCH fk INTO fid;\n"
- " IF (SQL % NOTFOUND) THEN RETURN; END IF;\n"
- " DELETE FROM SYS_FOREIGN_COLS WHERE ID=fid;\n"
- " DELETE FROM SYS_FOREIGN WHERE ID=fid;\n"
- "END LOOP;\n"
- "CLOSE fk;\n"
- "END;\n", FALSE, trx);
- if (err == DB_SUCCESS) {
- info = pars_info_create();
- pars_info_add_str_literal(info, "name", name);
- goto do_drop;
- }
- } else {
-do_drop:
- if (dict_table_get_low("SYS_VIRTUAL")) {
- err = que_eval_sql(
- info,
- "PROCEDURE DROP_VIRTUAL_PROC () IS\n"
- "tid CHAR;\n"
-
- "BEGIN\n"
- "SELECT ID INTO tid FROM SYS_TABLES\n"
- "WHERE NAME = :name FOR UPDATE;\n"
- "IF (SQL % NOTFOUND) THEN RETURN;"
- " END IF;\n"
- "DELETE FROM SYS_VIRTUAL"
- " WHERE TABLE_ID = tid;\n"
- "END;\n", FALSE, trx);
- if (err == DB_SUCCESS) {
- info = pars_info_create();
- pars_info_add_str_literal(
- info, "name", name);
- }
- } else {
- err = DB_SUCCESS;
- }
-
- err = err == DB_SUCCESS ? que_eval_sql(
- info,
- "PROCEDURE DROP_TABLE_PROC () IS\n"
- "tid CHAR;\n"
- "iid CHAR;\n"
-
- "DECLARE CURSOR cur_idx IS\n"
- "SELECT ID FROM SYS_INDEXES\n"
- "WHERE TABLE_ID = tid FOR UPDATE;\n"
-
- "BEGIN\n"
- "SELECT ID INTO tid FROM SYS_TABLES\n"
- "WHERE NAME = :name FOR UPDATE;\n"
- "IF (SQL % NOTFOUND) THEN RETURN; END IF;\n"
-
- "OPEN cur_idx;\n"
- "WHILE 1 = 1 LOOP\n"
- " FETCH cur_idx INTO iid;\n"
- " IF (SQL % NOTFOUND) THEN EXIT; END IF;\n"
- " DELETE FROM SYS_FIELDS\n"
- " WHERE INDEX_ID = iid;\n"
- " DELETE FROM SYS_INDEXES\n"
- " WHERE ID = iid AND TABLE_ID = tid;\n"
- "END LOOP;\n"
- "CLOSE cur_idx;\n"
-
- "DELETE FROM SYS_COLUMNS WHERE TABLE_ID=tid;\n"
- "DELETE FROM SYS_TABLES WHERE NAME=:name;\n"
-
- "END;\n", FALSE, trx) : err;
-
- if (err == DB_SUCCESS && table->space
- && dict_table_get_low("SYS_TABLESPACES")
- && dict_table_get_low("SYS_DATAFILES")) {
- info = pars_info_create();
- pars_info_add_int4_literal(info, "id",
- lint(table->space_id));
- err = que_eval_sql(
- info,
- "PROCEDURE DROP_SPACE_PROC () IS\n"
- "BEGIN\n"
- "DELETE FROM SYS_TABLESPACES\n"
- "WHERE SPACE = :id;\n"
- "DELETE FROM SYS_DATAFILES\n"
- "WHERE SPACE = :id;\n"
- "END;\n", FALSE, trx);
- }
- }
-
- switch (err) {
- fil_space_t* space;
- char* filepath;
- case DB_SUCCESS:
- if (!table->no_rollback()) {
- err = row_drop_ancillary_fts_tables(table, trx);
- if (err != DB_SUCCESS) {
- break;
- }
- }
-
- space = table->space;
- ut_ad(!space || space->id == table->space_id);
- /* Determine the tablespace filename before we drop
- dict_table_t. */
- if (DICT_TF_HAS_DATA_DIR(table->flags)) {
- dict_get_and_save_data_dir_path(table, true);
- ut_ad(table->data_dir_path || !space);
- filepath = space ? NULL : fil_make_filepath(
- table->data_dir_path,
- table->name.m_name, IBD,
- table->data_dir_path != NULL);
- } else {
- filepath = space ? NULL : fil_make_filepath(
- NULL, table->name.m_name, IBD, false);
- }
-
- /* Free the dict_table_t object. */
- err = row_drop_table_from_cache(tablename, table, trx);
- if (err != DB_SUCCESS) {
- ut_free(filepath);
- break;
- }
-
- /* Do not attempt to drop known-to-be-missing tablespaces,
- nor the system tablespace. */
- if (!space) {
- fil_delete_file(filepath);
- ut_free(filepath);
- break;
- }
-
- ut_ad(!filepath);
-
- if (space->id != TRX_SYS_SPACE) {
- err = fil_delete_tablespace(space->id, false,
- &detached_handles);
- }
- break;
-
- case DB_OUT_OF_FILE_SPACE:
- err = DB_MUST_GET_MORE_FILE_SPACE;
- trx->error_state = err;
- row_mysql_handle_errors(&err, trx, NULL, NULL);
-
- /* raise error */
- ut_error;
- break;
-
- case DB_TOO_MANY_CONCURRENT_TRXS:
- /* Cannot even find a free slot for the
- the undo log. We can directly exit here
- and return the DB_TOO_MANY_CONCURRENT_TRXS
- error. */
-
- default:
- /* This is some error we do not expect. Print
- the error number and rollback the transaction */
- ib::error() << "Unknown error code " << err << " while"
- " dropping table: "
- << ut_get_name(trx, tablename) << ".";
-
- trx->error_state = DB_SUCCESS;
- trx->rollback();
- trx->error_state = DB_SUCCESS;
-
- /* Mark all indexes available in the data dictionary
- cache again. */
-
- page_no = page_nos;
-
- for (dict_index_t* index = dict_table_get_first_index(table);
- index != NULL;
- index = dict_table_get_next_index(index)) {
- rw_lock_x_lock(dict_index_get_lock(index));
- ut_a(index->page == FIL_NULL);
- index->page = *page_no++;
- rw_lock_x_unlock(dict_index_get_lock(index));
- }
- }
-
- if (err != DB_SUCCESS && table != NULL) {
- /* Drop table has failed with error but as drop table is not
- transaction safe we should mark the table as corrupted to avoid
- unwarranted follow-up action on this table that can result
- in more serious issues. */
-
- table->corrupted = true;
- for (dict_index_t* index = UT_LIST_GET_FIRST(table->indexes);
- index != NULL;
- index = UT_LIST_GET_NEXT(indexes, index)) {
- dict_set_corrupted(index, trx, "DROP TABLE");
- }
- }
-
-funct_exit:
- if (heap) {
- mem_heap_free(heap);
- }
-
-funct_exit_all_freed:
- if (locked_dictionary) {
-
- if (trx_is_started(trx)) {
-
- trx_commit_for_mysql(trx);
- }
-
- /* Add the table to fts queue if drop table fails */
- if (err != DB_SUCCESS && table->fts) {
- fts_optimize_add_table(table);
- }
-
- row_mysql_unlock_data_dictionary(trx);
- }
-
- for (const auto& handle : detached_handles) {
- ut_ad(handle != OS_FILE_CLOSED);
- os_file_close(handle);
- }
-
- trx->op_info = "";
-
- DBUG_RETURN(err);
-}
-
-/** Drop a table after failed CREATE TABLE. */
-dberr_t row_drop_table_after_create_fail(const char* name, trx_t* trx)
-{
- ib::warn() << "Dropping incompletely created " << name << " table.";
- return row_drop_table_for_mysql(name, trx, SQLCOM_DROP_DB, true);
-}
-
-/*******************************************************************//**
-Drop all foreign keys in a database, see Bug#18942.
-Called at the end of row_drop_database_for_mysql().
-@return error code or DB_SUCCESS */
-static MY_ATTRIBUTE((nonnull, warn_unused_result))
-dberr_t
-drop_all_foreign_keys_in_db(
-/*========================*/
- const char* name, /*!< in: database name which ends to '/' */
- trx_t* trx) /*!< in: transaction handle */
-{
- pars_info_t* pinfo;
- dberr_t err;
-
- ut_a(name[strlen(name) - 1] == '/');
-
- pinfo = pars_info_create();
-
- pars_info_add_str_literal(pinfo, "dbname", name);
-
-/** true if for_name is not prefixed with dbname */
-#define TABLE_NOT_IN_THIS_DB \
-"SUBSTR(for_name, 0, LENGTH(:dbname)) <> :dbname"
-
- err = que_eval_sql(pinfo,
- "PROCEDURE DROP_ALL_FOREIGN_KEYS_PROC () IS\n"
- "foreign_id CHAR;\n"
- "for_name CHAR;\n"
- "found INT;\n"
- "DECLARE CURSOR cur IS\n"
- "SELECT ID, FOR_NAME FROM SYS_FOREIGN\n"
- "WHERE FOR_NAME >= :dbname\n"
- "LOCK IN SHARE MODE\n"
- "ORDER BY FOR_NAME;\n"
- "BEGIN\n"
- "found := 1;\n"
- "OPEN cur;\n"
- "WHILE found = 1 LOOP\n"
- " FETCH cur INTO foreign_id, for_name;\n"
- " IF (SQL % NOTFOUND) THEN\n"
- " found := 0;\n"
- " ELSIF (" TABLE_NOT_IN_THIS_DB ") THEN\n"
- " found := 0;\n"
- " ELSIF (1=1) THEN\n"
- " DELETE FROM SYS_FOREIGN_COLS\n"
- " WHERE ID = foreign_id;\n"
- " DELETE FROM SYS_FOREIGN\n"
- " WHERE ID = foreign_id;\n"
- " END IF;\n"
- "END LOOP;\n"
- "CLOSE cur;\n"
- "COMMIT WORK;\n"
- "END;\n",
- FALSE, /* do not reserve dict mutex,
- we are already holding it */
- trx);
-
- return(err);
-}
-
-/** Drop a database for MySQL.
-@param[in] name database name which ends at '/'
-@param[in] trx transaction handle
-@param[out] found number of dropped tables/partitions
-@return error code or DB_SUCCESS */
-dberr_t
-row_drop_database_for_mysql(
- const char* name,
- trx_t* trx,
- ulint* found)
-{
- dict_table_t* table;
- char* table_name;
- dberr_t err = DB_SUCCESS;
- ulint namelen = strlen(name);
- bool is_partition = false;
-
- ut_ad(found != NULL);
-
- DBUG_ENTER("row_drop_database_for_mysql");
-
- DBUG_PRINT("row_drop_database_for_mysql", ("db: '%s'", name));
-
- ut_a(name != NULL);
- /* Assert DB name or partition name. */
- if (name[namelen - 1] == '#') {
- ut_ad(name[namelen - 2] != '/');
- is_partition = true;
- trx->op_info = "dropping partitions";
- } else {
- ut_a(name[namelen - 1] == '/');
- trx->op_info = "dropping database";
- }
-
- *found = 0;
-
- trx_set_dict_operation(trx, TRX_DICT_OP_TABLE);
-
- trx_start_if_not_started_xa(trx, true);
-
-loop:
- row_mysql_lock_data_dictionary(trx);
-
- while ((table_name = dict_get_first_table_name_in_db(name))) {
- /* Drop parent table if it is a fts aux table, to
- avoid accessing dropped fts aux tables in information
- scheam when parent table still exists.
- Note: Drop parent table will drop fts aux tables. */
- char* parent_table_name = NULL;
- table_id_t table_id;
- index_id_t index_id;
-
- if (fts_check_aux_table(
- table_name, &table_id, &index_id)) {
- dict_table_t* parent_table = dict_table_open_on_id(
- table_id, TRUE, DICT_TABLE_OP_NORMAL);
- if (parent_table != NULL) {
- parent_table_name = mem_strdupl(
- parent_table->name.m_name,
- strlen(parent_table->name.m_name));
- dict_table_close(parent_table, TRUE, FALSE);
- }
- }
-
- if (parent_table_name != NULL) {
- ut_free(table_name);
- table_name = parent_table_name;
- }
-
- ut_a(memcmp(table_name, name, namelen) == 0);
-
- table = dict_table_open_on_name(
- table_name, TRUE, FALSE, static_cast<dict_err_ignore_t>(
- DICT_ERR_IGNORE_INDEX_ROOT
- | DICT_ERR_IGNORE_CORRUPT));
-
- if (!table) {
- ib::error() << "Cannot load table " << table_name
- << " from InnoDB internal data dictionary"
- " during drop database";
- ut_free(table_name);
- err = DB_TABLE_NOT_FOUND;
- break;
-
- }
-
- if (!table->name.is_temporary()) {
- /* There could be orphan temp tables left from
- interrupted alter table. Leave them, and handle
- the rest.*/
- if (table->can_be_evicted
- && (name[namelen - 1] != '#')) {
- ib::warn() << "Orphan table encountered during"
- " DROP DATABASE. This is possible if '"
- << table->name << ".frm' was lost.";
- }
-
- if (!table->is_readable() && !table->space) {
- ib::warn() << "Missing .ibd file for table "
- << table->name << ".";
- }
- }
-
- dict_table_close(table, TRUE, FALSE);
-
- /* The dict_table_t object must not be accessed before
- dict_table_open() or after dict_table_close(). But this is OK
- if we are holding, the dict_sys.mutex. */
- ut_ad(mutex_own(&dict_sys.mutex));
-
- /* Disable statistics on the found table. */
- if (!dict_stats_stop_bg(table)) {
- row_mysql_unlock_data_dictionary(trx);
-
- os_thread_sleep(250000);
-
- ut_free(table_name);
-
- goto loop;
- }
-
- /* Wait until MySQL does not have any queries running on
- the table */
-
- if (table->get_ref_count() > 0) {
- row_mysql_unlock_data_dictionary(trx);
-
- ib::warn() << "MySQL is trying to drop database "
- << ut_get_name(trx, name) << " though"
- " there are still open handles to table "
- << table->name << ".";
-
- os_thread_sleep(1000000);
-
- ut_free(table_name);
-
- goto loop;
- }
-
- err = row_drop_table_for_mysql(
- table_name, trx, SQLCOM_DROP_DB);
- trx_commit_for_mysql(trx);
-
- if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
- ib::error() << "DROP DATABASE "
- << ut_get_name(trx, name) << " failed"
- " with error (" << err << ") for"
- " table " << ut_get_name(trx, table_name);
- ut_free(table_name);
- break;
- }
-
- ut_free(table_name);
- (*found)++;
- }
-
- /* Partitioning does not yet support foreign keys. */
- if (err == DB_SUCCESS && !is_partition) {
- /* after dropping all tables try to drop all leftover
- foreign keys in case orphaned ones exist */
- err = drop_all_foreign_keys_in_db(name, trx);
-
- if (err != DB_SUCCESS) {
- const std::string& db = ut_get_name(trx, name);
- ib::error() << "DROP DATABASE " << db << " failed with"
- " error " << err << " while dropping all"
- " foreign keys";
- }
- }
-
- trx_commit_for_mysql(trx);
-
- row_mysql_unlock_data_dictionary(trx);
-
- trx->op_info = "";
+ if (fts_exist)
+ {
+ fts_optimize_remove_table(table);
+ purge_sys.stop_FTS(*table);
+ err= fts_lock_tables(trx, *table);
+ if (err != DB_SUCCESS)
+ {
+rollback:
+ if (fts_exist)
+ {
+ purge_sys.resume_FTS();
+ fts_optimize_add_table(table);
+ }
+ trx->rollback();
+ if (trx->dict_operation_lock_mode)
+ row_mysql_unlock_data_dictionary(trx);
+ return err;
+ }
+ }
- DBUG_RETURN(err);
+ row_mysql_lock_data_dictionary(trx);
+ trx->op_info = "discarding tablespace";
+ trx->dict_operation= true;
+
+ /* We serialize data dictionary operations with dict_sys.latch:
+ this is to avoid deadlocks during data dictionary operations */
+
+ err= row_discard_tablespace_foreign_key_checks(trx, table);
+ if (err != DB_SUCCESS)
+ goto rollback;
+
+ /* Note: The following cannot be rolled back. Rollback would see the
+ UPDATE of SYS_INDEXES.TABLE_ID as two operations: DELETE and INSERT.
+ It would invoke btr_free_if_exists() when rolling back the INSERT,
+ effectively dropping all indexes of the table. Furthermore, calls like
+ ibuf_delete_for_discarded_space() are already discarding data
+ before the transaction is committed.
+
+ It would be better to remove the integrity-breaking
+ ALTER TABLE...DISCARD TABLESPACE operation altogether. */
+ table->file_unreadable= true;
+ table->space= nullptr;
+ table->flags2|= DICT_TF2_DISCARDED;
+ err= row_discard_tablespace(trx, table);
+ DBUG_EXECUTE_IF("ib_discard_before_commit_crash",
+ log_buffer_flush_to_disk(); DBUG_SUICIDE(););
+ /* FTS_ tables may be deleted */
+ std::vector<pfs_os_file_t> deleted;
+ trx->commit(deleted);
+ const auto space_id= table->space_id;
+ pfs_os_file_t d= fil_delete_tablespace(space_id);
+ DBUG_EXECUTE_IF("ib_discard_after_commit_crash", DBUG_SUICIDE(););
+ row_mysql_unlock_data_dictionary(trx);
+
+ if (d != OS_FILE_CLOSED)
+ os_file_close(d);
+ for (pfs_os_file_t d : deleted)
+ os_file_close(d);
+
+ if (fts_exist)
+ purge_sys.resume_FTS();
+
+ buf_flush_remove_pages(space_id);
+ trx->op_info= "";
+ return err;
}
/****************************************************************//**
@@ -4069,8 +2516,7 @@ row_delete_constraint_low(
"BEGIN\n"
"DELETE FROM SYS_FOREIGN_COLS WHERE ID = :id;\n"
"DELETE FROM SYS_FOREIGN WHERE ID = :id;\n"
- "END;\n"
- , FALSE, trx));
+ "END;\n", trx));
}
/****************************************************************//**
@@ -4115,7 +2561,6 @@ row_rename_table_for_mysql(
const char* old_name, /*!< in: old table name */
const char* new_name, /*!< in: new table name */
trx_t* trx, /*!< in/out: transaction */
- bool commit, /*!< in: whether to commit trx */
bool use_fk) /*!< in: whether to parse and enforce
FOREIGN KEY constraints */
{
@@ -4126,15 +2571,11 @@ row_rename_table_for_mysql(
ulint n_constraints_to_drop = 0;
ibool old_is_tmp, new_is_tmp;
pars_info_t* info = NULL;
- int retry;
- bool aux_fts_rename = false;
- char* is_part = NULL;
ut_a(old_name != NULL);
ut_a(new_name != NULL);
ut_ad(trx->state == TRX_STATE_ACTIVE);
- const bool dict_locked = trx->dict_operation_lock_mode == RW_X_LATCH;
- ut_ad(!commit || dict_locked);
+ ut_ad(trx->dict_operation_lock_mode);
if (high_level_read_only) {
return(DB_READ_ONLY);
@@ -4145,21 +2586,12 @@ row_rename_table_for_mysql(
old_is_tmp = dict_table_t::is_temporary_name(old_name);
new_is_tmp = dict_table_t::is_temporary_name(new_name);
- table = dict_table_open_on_name(old_name, dict_locked, FALSE,
+ table = dict_table_open_on_name(old_name, true,
DICT_ERR_IGNORE_FK_NOKEY);
- /* We look for pattern #P# to see if the table is partitioned
- MySQL table. */
-#ifdef __WIN__
- is_part = strstr((char *)old_name, (char *)"#p#");
-#else
- is_part = strstr((char *)old_name, (char *)"#P#");
-#endif /* __WIN__ */
-
- /* MySQL partition engine hard codes the file name
- separator as "#P#". The text case is fixed even if
- lower_case_table_names is set to 1 or 2. This is true
- for sub-partition names as well. InnoDB always
+ /* MariaDB partition engine hard codes the file name
+ separator as "#P#" and "#SP#". The text case is fixed even if
+ lower_case_table_names is set to 1 or 2. InnoDB always
normalises file names to lower case on Windows, this
can potentially cause problems when copying/moving
tables between platforms.
@@ -4173,11 +2605,10 @@ row_rename_table_for_mysql(
sensitive platform in Windows, we might need to
check the existence of table name without lowering
case them in the system table. */
- if (!table &&
- is_part &&
- innobase_get_lower_case_table_names() == 1) {
+ if (!table && lower_case_table_names == 1
+ && strstr(old_name, table_name_t::part_suffix)) {
char par_case_name[MAX_FULL_NAME_LEN + 1];
-#ifndef __WIN__
+#ifndef _WIN32
/* Check for the table using lower
case name, including the partition
separator "P" */
@@ -4193,16 +2624,19 @@ row_rename_table_for_mysql(
normalize_table_name_c_low(
par_case_name, old_name, FALSE);
#endif
- table = dict_table_open_on_name(par_case_name, dict_locked, FALSE,
+ table = dict_table_open_on_name(par_case_name, true,
DICT_ERR_IGNORE_FK_NOKEY);
}
if (!table) {
err = DB_TABLE_NOT_FOUND;
goto funct_exit;
+ }
+
+ ut_ad(!table->is_temporary());
- } else if (!table->is_readable() && !table->space
- && !(table->flags2 & DICT_TF2_DISCARDED)) {
+ if (!table->is_readable() && !table->space
+ && !(table->flags2 & DICT_TF2_DISCARDED)) {
err = DB_TABLE_NOT_FOUND;
@@ -4230,35 +2664,12 @@ row_rename_table_for_mysql(
}
}
- /* Is a foreign key check running on this table? */
- for (retry = 0; retry < 100
- && table->n_foreign_key_checks_running > 0; ++retry) {
- row_mysql_unlock_data_dictionary(trx);
- os_thread_yield();
- row_mysql_lock_data_dictionary(trx);
- }
+ err = trx_undo_report_rename(trx, table);
- if (table->n_foreign_key_checks_running > 0) {
- ib::error() << "In ALTER TABLE "
- << ut_get_name(trx, old_name)
- << " a FOREIGN KEY check is running. Cannot rename"
- " table.";
- err = DB_TABLE_IN_FK_CHECK;
+ if (err != DB_SUCCESS) {
goto funct_exit;
}
- if (!table->is_temporary()) {
- if (commit) {
- dict_stats_wait_bg_to_stop_using_table(table, trx);
- }
-
- err = trx_undo_report_rename(trx, table);
-
- if (err != DB_SUCCESS) {
- goto funct_exit;
- }
- }
-
/* We use the private SQL parser of Innobase to generate the query
graphs needed in updating the dictionary data from system tables. */
@@ -4273,46 +2684,12 @@ row_rename_table_for_mysql(
"UPDATE SYS_TABLES"
" SET NAME = :new_table_name\n"
" WHERE NAME = :old_table_name;\n"
- "END;\n"
- , FALSE, trx);
-
- /* Assume the caller guarantees destination name doesn't exist. */
- ut_ad(err != DB_DUPLICATE_KEY);
-
- /* SYS_TABLESPACES and SYS_DATAFILES need to be updated if
- the table is in a single-table tablespace. */
- if (err != DB_SUCCESS || !dict_table_is_file_per_table(table)) {
- } else if (table->space) {
- /* If old path and new path are the same means tablename
- has not changed and only the database name holding the table
- has changed so we need to make the complete filepath again. */
- char* new_path = dict_tables_have_same_db(old_name, new_name)
- ? os_file_make_new_pathname(
- table->space->chain.start->name, new_name)
- : fil_make_filepath(NULL, new_name, IBD, false);
+ "END;\n", trx);
- info = pars_info_create();
-
- pars_info_add_str_literal(info, "new_table_name", new_name);
- pars_info_add_str_literal(info, "new_path_name", new_path);
- pars_info_add_int4_literal(info, "space_id", table->space_id);
-
- err = que_eval_sql(info,
- "PROCEDURE RENAME_SPACE () IS\n"
- "BEGIN\n"
- "UPDATE SYS_TABLESPACES"
- " SET NAME = :new_table_name\n"
- " WHERE SPACE = :space_id;\n"
- "UPDATE SYS_DATAFILES"
- " SET PATH = :new_path_name\n"
- " WHERE SPACE = :space_id;\n"
- "END;\n"
- , FALSE, trx);
-
- ut_free(new_path);
- }
if (err != DB_SUCCESS) {
- goto err_exit;
+ // Assume the caller guarantees destination name doesn't exist.
+ ut_ad(err != DB_DUPLICATE_KEY);
+ goto rollback_and_exit;
}
if (!new_is_tmp) {
@@ -4425,8 +2802,7 @@ row_rename_table_for_mysql(
"WHERE REF_NAME = :old_table_name\n"
" AND TO_BINARY(REF_NAME)\n"
" = TO_BINARY(:old_table_name);\n"
- "END;\n"
- , FALSE, trx);
+ "END;\n", trx);
} else if (n_constraints_to_drop > 0) {
/* Drop some constraints of tmp tables. */
@@ -4451,54 +2827,29 @@ row_rename_table_for_mysql(
|| DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS_HAS_DOC_ID))
&& !dict_tables_have_same_db(old_name, new_name)) {
err = fts_rename_aux_tables(table, new_name, trx);
- if (err != DB_TABLE_NOT_FOUND) {
- aux_fts_rename = true;
- }
}
- if (err != DB_SUCCESS) {
-err_exit:
- if (err == DB_DUPLICATE_KEY) {
- ib::error() << "Possible reasons:";
- ib::error() << "(1) Table rename would cause two"
- " FOREIGN KEY constraints to have the same"
- " internal name in case-insensitive"
- " comparison.";
- ib::error() << "(2) Table "
- << ut_get_name(trx, new_name)
- << " exists in the InnoDB internal data"
- " dictionary though MySQL is trying to rename"
- " table " << ut_get_name(trx, old_name)
- << " to it. Have you deleted the .frm file and"
- " not used DROP TABLE?";
- ib::info() << TROUBLESHOOTING_MSG;
- ib::error() << "If table "
- << ut_get_name(trx, new_name)
- << " is a temporary table #sql..., then"
- " it can be that there are still queries"
- " running on the table, and it will be dropped"
- " automatically when the queries end. You can"
- " drop the orphaned table inside InnoDB by"
- " creating an InnoDB table with the same name"
- " in another database and copying the .frm file"
- " to the current database. Then MySQL thinks"
- " the table exists, and DROP TABLE will"
- " succeed.";
- }
+ switch (err) {
+ case DB_DUPLICATE_KEY:
+ ib::error() << "Table rename might cause two"
+ " FOREIGN KEY constraints to have the same"
+ " internal name in case-insensitive comparison.";
+ ib::info() << TROUBLESHOOTING_MSG;
+ /* fall through */
+ rollback_and_exit:
+ default:
trx->error_state = DB_SUCCESS;
trx->rollback();
trx->error_state = DB_SUCCESS;
- } else {
- /* The following call will also rename the .ibd data file if
- the table is stored in a single-table tablespace */
-
+ break;
+ case DB_SUCCESS:
+ DEBUG_SYNC_C("innodb_rename_in_cache");
+ /* The following call will also rename the .ibd file */
err = dict_table_rename_in_cache(
- table, new_name, !new_is_tmp);
+ table, span<const char>{new_name,strlen(new_name)},
+ false);
if (err != DB_SUCCESS) {
- trx->error_state = DB_SUCCESS;
- trx->rollback();
- trx->error_state = DB_SUCCESS;
- goto funct_exit;
+ goto rollback_and_exit;
}
/* In case of copy alter, template db_name and
@@ -4513,7 +2864,7 @@ err_exit:
dict_names_t fk_tables;
err = dict_load_foreigns(
- new_name, NULL, false,
+ new_name, nullptr, trx->id,
!old_is_tmp || trx->check_foreigns,
use_fk
? DICT_ERR_IGNORE_NONE
@@ -4521,7 +2872,6 @@ err_exit:
fk_tables);
if (err != DB_SUCCESS) {
-
if (old_is_tmp) {
/* In case of copy alter, ignore the
loading of foreign key constraint
@@ -4535,7 +2885,7 @@ err_exit:
" definition.";
if (!trx->check_foreigns) {
err = DB_SUCCESS;
- goto funct_exit;
+ break;
}
} else {
ib::error() << "In RENAME TABLE table "
@@ -4545,22 +2895,14 @@ err_exit:
" with the new table definition.";
}
- trx->error_state = DB_SUCCESS;
- trx->rollback();
- trx->error_state = DB_SUCCESS;
+ goto rollback_and_exit;
}
/* Check whether virtual column or stored column affects
the foreign key constraint of the table. */
- if (dict_foreigns_has_s_base_col(
- table->foreign_set, table)) {
+ if (dict_foreigns_has_s_base_col(table->foreign_set, table)) {
err = DB_NO_FK_ON_S_BASE_COL;
- ut_a(DB_SUCCESS == dict_table_rename_in_cache(
- table, old_name, FALSE));
- trx->error_state = DB_SUCCESS;
- trx->rollback();
- trx->error_state = DB_SUCCESS;
- goto funct_exit;
+ goto rollback_and_exit;
}
/* Fill the virtual column set in foreign when
@@ -4569,8 +2911,8 @@ err_exit:
dict_mem_table_fill_foreign_vcol_set(table);
while (!fk_tables.empty()) {
- dict_load_table(fk_tables.front(),
- DICT_ERR_IGNORE_NONE);
+ const char *f = fk_tables.front();
+ dict_sys.load_table({f, strlen(f)});
fk_tables.pop_front();
}
@@ -4578,47 +2920,8 @@ err_exit:
}
funct_exit:
- if (aux_fts_rename && err != DB_SUCCESS
- && table != NULL && (table->space != 0)) {
-
- char* orig_name = table->name.m_name;
- trx_t* trx_bg = trx_create();
-
- /* If the first fts_rename fails, the trx would
- be rolled back and committed, we can't use it any more,
- so we have to start a new background trx here. */
- ut_a(trx_state_eq(trx_bg, TRX_STATE_NOT_STARTED));
- trx_bg->op_info = "Revert the failing rename "
- "for fts aux tables";
- trx_bg->dict_operation_lock_mode = RW_X_LATCH;
- trx_start_for_ddl(trx_bg, TRX_DICT_OP_TABLE);
-
- /* If rename fails and table has its own tablespace,
- we need to call fts_rename_aux_tables again to
- revert the ibd file rename, which is not under the
- control of trx. Also notice the parent table name
- in cache is not changed yet. If the reverting fails,
- the ibd data may be left in the new database, which
- can be fixed only manually. */
- table->name.m_name = const_cast<char*>(new_name);
- fts_rename_aux_tables(table, old_name, trx_bg);
- table->name.m_name = orig_name;
-
- trx_bg->dict_operation_lock_mode = 0;
- trx_commit_for_mysql(trx_bg);
- trx_bg->free();
- }
-
- if (table != NULL) {
- if (commit && !table->is_temporary()) {
- table->stats_bg_flag &= byte(~BG_STAT_SHOULD_QUIT);
- }
- dict_table_close(table, dict_locked, FALSE);
- }
-
- if (commit) {
- DEBUG_SYNC(trx->mysql_thd, "before_rename_table_commit");
- trx_commit_for_mysql(trx);
+ if (table) {
+ table->release();
}
if (UNIV_LIKELY_NULL(heap)) {
@@ -4629,214 +2932,3 @@ funct_exit:
return(err);
}
-
-/*********************************************************************//**
-Scans an index for either COUNT(*) or CHECK TABLE.
-If CHECK TABLE; Checks that the index contains entries in an ascending order,
-unique constraint is not broken, and calculates the number of index entries
-in the read view of the current transaction.
-@return DB_SUCCESS or other error */
-dberr_t
-row_scan_index_for_mysql(
-/*=====================*/
- row_prebuilt_t* prebuilt, /*!< in: prebuilt struct
- in MySQL handle */
- const dict_index_t* index, /*!< in: index */
- ulint* n_rows) /*!< out: number of entries
- seen in the consistent read */
-{
- dtuple_t* prev_entry = NULL;
- ulint matched_fields;
- byte* buf;
- dberr_t ret;
- rec_t* rec;
- int cmp;
- ibool contains_null;
- ulint i;
- ulint cnt;
- mem_heap_t* heap = NULL;
- rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
- rec_offs* offsets;
- rec_offs_init(offsets_);
-
- *n_rows = 0;
-
- /* Don't support RTree Leaf level scan */
- ut_ad(!dict_index_is_spatial(index));
-
- if (dict_index_is_clust(index)) {
- /* The clustered index of a table is always available.
- During online ALTER TABLE that rebuilds the table, the
- clustered index in the old table will have
- index->online_log pointing to the new table. All
- indexes of the old table will remain valid and the new
- table will be unaccessible to MySQL until the
- completion of the ALTER TABLE. */
- } else if (dict_index_is_online_ddl(index)
- || (index->type & DICT_FTS)) {
- /* Full Text index are implemented by auxiliary tables,
- not the B-tree. We also skip secondary indexes that are
- being created online. */
- return(DB_SUCCESS);
- }
-
- ulint bufsize = std::max<ulint>(srv_page_size,
- prebuilt->mysql_row_len);
- buf = static_cast<byte*>(ut_malloc_nokey(bufsize));
- heap = mem_heap_create(100);
-
- cnt = 1000;
-
- ret = row_search_for_mysql(buf, PAGE_CUR_G, prebuilt, 0, 0);
-loop:
- /* Check thd->killed every 1,000 scanned rows */
- if (--cnt == 0) {
- if (trx_is_interrupted(prebuilt->trx)) {
- ret = DB_INTERRUPTED;
- goto func_exit;
- }
- cnt = 1000;
- }
-
- switch (ret) {
- case DB_SUCCESS:
- break;
- case DB_DEADLOCK:
- case DB_LOCK_TABLE_FULL:
- case DB_LOCK_WAIT_TIMEOUT:
- case DB_INTERRUPTED:
- goto func_exit;
- default:
- ib::warn() << "CHECK TABLE on index " << index->name << " of"
- " table " << index->table->name << " returned " << ret;
- /* (this error is ignored by CHECK TABLE) */
- /* fall through */
- case DB_END_OF_INDEX:
- ret = DB_SUCCESS;
-func_exit:
- ut_free(buf);
- mem_heap_free(heap);
-
- return(ret);
- }
-
- *n_rows = *n_rows + 1;
-
- /* else this code is doing handler::check() for CHECK TABLE */
-
- /* row_search... returns the index record in buf, record origin offset
- within buf stored in the first 4 bytes, because we have built a dummy
- template */
-
- rec = buf + mach_read_from_4(buf);
-
- offsets = rec_get_offsets(rec, index, offsets_, index->n_core_fields,
- ULINT_UNDEFINED, &heap);
-
- if (prev_entry != NULL) {
- matched_fields = 0;
-
- cmp = cmp_dtuple_rec_with_match(prev_entry, rec, offsets,
- &matched_fields);
- contains_null = FALSE;
-
- /* In a unique secondary index we allow equal key values if
- they contain SQL NULLs */
-
- for (i = 0;
- i < dict_index_get_n_ordering_defined_by_user(index);
- i++) {
- if (UNIV_SQL_NULL == dfield_get_len(
- dtuple_get_nth_field(prev_entry, i))) {
-
- contains_null = TRUE;
- break;
- }
- }
-
- const char* msg;
-
- if (cmp > 0) {
- ret = DB_INDEX_CORRUPT;
- msg = "index records in a wrong order in ";
-not_ok:
- ib::error()
- << msg << index->name
- << " of table " << index->table->name
- << ": " << *prev_entry << ", "
- << rec_offsets_print(rec, offsets);
- /* Continue reading */
- } else if (dict_index_is_unique(index)
- && !contains_null
- && matched_fields
- >= dict_index_get_n_ordering_defined_by_user(
- index)) {
- ret = DB_DUPLICATE_KEY;
- msg = "duplicate key in ";
- goto not_ok;
- }
- }
-
- {
- mem_heap_t* tmp_heap = NULL;
-
- /* Empty the heap on each round. But preserve offsets[]
- for the row_rec_to_index_entry() call, by copying them
- into a separate memory heap when needed. */
- if (UNIV_UNLIKELY(offsets != offsets_)) {
- ulint size = rec_offs_get_n_alloc(offsets)
- * sizeof *offsets;
-
- tmp_heap = mem_heap_create(size);
-
- offsets = static_cast<rec_offs*>(
- mem_heap_dup(tmp_heap, offsets, size));
- }
-
- mem_heap_empty(heap);
-
- prev_entry = row_rec_to_index_entry(
- rec, index, offsets, heap);
-
- if (UNIV_LIKELY_NULL(tmp_heap)) {
- mem_heap_free(tmp_heap);
- }
- }
-
- ret = row_search_for_mysql(
- buf, PAGE_CUR_G, prebuilt, 0, ROW_SEL_NEXT);
-
- goto loop;
-}
-
-/*********************************************************************//**
-Initialize this module */
-void
-row_mysql_init(void)
-/*================*/
-{
- mutex_create(LATCH_ID_ROW_DROP_LIST, &row_drop_list_mutex);
-
- UT_LIST_INIT(
- row_mysql_drop_list,
- &row_mysql_drop_t::row_mysql_drop_list);
-
- row_mysql_drop_list_inited = true;
-}
-
-void row_mysql_close()
-{
- ut_ad(!UT_LIST_GET_LEN(row_mysql_drop_list) ||
- srv_force_recovery >= SRV_FORCE_NO_BACKGROUND);
- if (row_mysql_drop_list_inited)
- {
- row_mysql_drop_list_inited= false;
- mutex_free(&row_drop_list_mutex);
-
- while (row_mysql_drop_t *drop= UT_LIST_GET_FIRST(row_mysql_drop_list))
- {
- UT_LIST_REMOVE(row_mysql_drop_list, drop);
- ut_free(drop);
- }
- }
-}
diff --git a/storage/innobase/row/row0purge.cc b/storage/innobase/row/row0purge.cc
index 74bbc61df52..753b42332fc 100644
--- a/storage/innobase/row/row0purge.cc
+++ b/storage/innobase/row/row0purge.cc
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 1997, 2017, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2017, 2022, MariaDB Corporation.
+Copyright (c) 2017, 2023, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -25,8 +25,10 @@ Created 3/14/1997 Heikki Tuuri
*******************************************************/
#include "row0purge.h"
+#include "btr0cur.h"
#include "fsp0fsp.h"
#include "mach0data.h"
+#include "dict0crea.h"
#include "dict0stats.h"
#include "trx0rseg.h"
#include "trx0trx.h"
@@ -39,13 +41,13 @@ Created 3/14/1997 Heikki Tuuri
#include "row0upd.h"
#include "row0vers.h"
#include "row0mysql.h"
-#include "row0log.h"
#include "log0log.h"
#include "srv0mon.h"
#include "srv0start.h"
#include "handler.h"
#include "ha_innodb.h"
#include "fil0fil.h"
+#include <mysql/service_thd_mdl.h>
/*************************************************************************
IMPORTANT NOTE: Any operation that generates redo MUST check that there
@@ -65,7 +67,7 @@ static
ibool
row_purge_reposition_pcur(
/*======================*/
- ulint mode, /*!< in: latching mode */
+ btr_latch_mode mode, /*!< in: latching mode */
purge_node_t* node, /*!< in: row purge node */
mtr_t* mtr) /*!< in: mtr */
{
@@ -73,7 +75,7 @@ row_purge_reposition_pcur(
ut_ad(node->validate_pcur());
node->found_clust =
- btr_pcur_restore_position(mode, &node->pcur, mtr) ==
+ node->pcur.restore_position(mode, mtr) ==
btr_pcur_t::SAME_ALL;
} else {
@@ -102,20 +104,93 @@ bool
row_purge_remove_clust_if_poss_low(
/*===============================*/
purge_node_t* node, /*!< in/out: row purge node */
- ulint mode) /*!< in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE */
+ btr_latch_mode mode) /*!< in: BTR_MODIFY_LEAF or BTR_PURGE_TREE */
{
dict_index_t* index = dict_table_get_first_index(node->table);
+ table_id_t table_id = 0;
+ index_id_t index_id = 0;
+ dict_table_t *table = nullptr;
+ pfs_os_file_t f = OS_FILE_CLOSED;
- log_free_check();
-
+ if (table_id) {
+retry:
+ purge_sys.check_stop_FTS();
+ dict_sys.lock(SRW_LOCK_CALL);
+ table = dict_sys.find_table(table_id);
+ if (!table) {
+ dict_sys.unlock();
+ } else if (table->n_rec_locks) {
+ for (dict_index_t* ind = UT_LIST_GET_FIRST(
+ table->indexes); ind;
+ ind = UT_LIST_GET_NEXT(indexes, ind)) {
+ if (ind->id == index_id) {
+ lock_discard_for_index(*ind);
+ }
+ }
+ }
+ }
mtr_t mtr;
mtr.start();
index->set_modified(mtr);
+ log_free_check();
+ bool success = true;
if (!row_purge_reposition_pcur(mode, node, &mtr)) {
/* The record was already removed. */
+removed:
mtr.commit();
- return true;
+close_and_exit:
+ if (table) {
+ dict_sys.unlock();
+ }
+ return success;
+ }
+
+ if (node->table->id == DICT_INDEXES_ID) {
+ /* If this is a record of the SYS_INDEXES table, then
+ we have to free the file segments of the index tree
+ associated with the index */
+ if (!table_id) {
+ const rec_t* rec = btr_pcur_get_rec(&node->pcur);
+
+ table_id = mach_read_from_8(rec);
+ index_id = mach_read_from_8(rec + 8);
+ if (table_id) {
+ mtr.commit();
+ goto retry;
+ }
+ ut_ad("corrupted SYS_INDEXES record" == 0);
+ }
+
+ if (const uint32_t space_id = dict_drop_index_tree(
+ &node->pcur, nullptr, &mtr)) {
+ if (table) {
+ if (table->get_ref_count() == 0) {
+ dict_sys.remove(table);
+ } else if (table->space_id == space_id) {
+ table->space = nullptr;
+ table->file_unreadable = true;
+ }
+ dict_sys.unlock();
+ table = nullptr;
+ }
+ f = fil_delete_tablespace(space_id);
+ }
+
+ mtr.commit();
+
+ if (table) {
+ dict_sys.unlock();
+ table = nullptr;
+ }
+
+ purge_sys.check_stop_SYS();
+ mtr.start();
+ index->set_modified(mtr);
+
+ if (!row_purge_reposition_pcur(mode, node, &mtr)) {
+ goto removed;
+ }
}
rec_t* rec = btr_pcur_get_rec(&node->pcur);
@@ -125,7 +200,6 @@ row_purge_remove_clust_if_poss_low(
rec_offs* offsets = rec_get_offsets(rec, index, offsets_,
index->n_core_fields,
ULINT_UNDEFINED, &heap);
- bool success = true;
if (node->roll_ptr != row_get_rec_roll_ptr(rec, index, offsets)) {
/* Someone else has modified the record later: do not remove */
@@ -138,24 +212,15 @@ row_purge_remove_clust_if_poss_low(
ut_ad(row_get_rec_trx_id(rec, index, offsets));
if (mode == BTR_MODIFY_LEAF) {
- success = btr_cur_optimistic_delete(
+ success = DB_FAIL != btr_cur_optimistic_delete(
btr_pcur_get_btr_cur(&node->pcur), 0, &mtr);
} else {
dberr_t err;
- ut_ad(mode == (BTR_MODIFY_TREE | BTR_LATCH_FOR_DELETE));
+ ut_ad(mode == BTR_PURGE_TREE);
btr_cur_pessimistic_delete(
&err, FALSE, btr_pcur_get_btr_cur(&node->pcur), 0,
false, &mtr);
-
- switch (err) {
- case DB_SUCCESS:
- break;
- case DB_OUT_OF_FILE_SPACE:
- success = false;
- break;
- default:
- ut_error;
- }
+ success = err == DB_SUCCESS;
}
func_exit:
@@ -170,7 +235,7 @@ func_exit:
mtr_commit(&mtr);
}
- return(success);
+ goto close_and_exit;
}
/***********************************************************//**
@@ -192,12 +257,11 @@ row_purge_remove_clust_if_poss(
for (ulint n_tries = 0;
n_tries < BTR_CUR_RETRY_DELETE_N_TIMES;
n_tries++) {
- if (row_purge_remove_clust_if_poss_low(
- node, BTR_MODIFY_TREE | BTR_LATCH_FOR_DELETE)) {
+ if (row_purge_remove_clust_if_poss_low(node, BTR_PURGE_TREE)) {
return(true);
}
- os_thread_sleep(BTR_CUR_RETRY_SLEEP_TIME);
+ std::this_thread::sleep_for(BTR_CUR_RETRY_SLEEP_TIME);
}
return(false);
@@ -278,39 +342,20 @@ row_purge_remove_sec_if_poss_tree(
ibool success = TRUE;
dberr_t err;
mtr_t mtr;
- enum row_search_result search_result;
log_free_check();
mtr.start();
index->set_modified(mtr);
+ pcur.btr_cur.page_cur.index = index;
- if (!index->is_committed()) {
- /* The index->online_status may change if the index is
- or was being created online, but not committed yet. It
- is protected by index->lock. */
- mtr_sx_lock_index(index, &mtr);
-
- if (dict_index_is_online_ddl(index)) {
- /* Online secondary index creation will not
- copy any delete-marked records. Therefore
- there is nothing to be purged. We must also
- skip the purge when a completed index is
- dropped by rollback_inplace_alter_table(). */
- goto func_exit_no_pcur;
+ if (index->is_spatial()) {
+ if (!rtr_search(entry, BTR_PURGE_TREE, &pcur, &mtr)) {
+ goto found;
}
- } else {
- /* For secondary indexes,
- index->online_status==ONLINE_INDEX_COMPLETE if
- index->is_committed(). */
- ut_ad(!dict_index_is_online_ddl(index));
+ goto func_exit;
}
- search_result = row_search_index_entry(
- index, entry,
- BTR_MODIFY_TREE | BTR_LATCH_FOR_DELETE,
- &pcur, &mtr);
-
- switch (search_result) {
+ switch (row_search_index_entry(entry, BTR_PURGE_TREE, &pcur, &mtr)) {
case ROW_NOT_FOUND:
/* Not found. This is a legitimate condition. In a
rollback, InnoDB will remove secondary recs that would
@@ -339,6 +384,7 @@ row_purge_remove_sec_if_poss_tree(
which cannot be purged yet, requires its existence. If some requires,
we should do nothing. */
+found:
if (row_purge_poss_sec(node, index, entry, &pcur, &mtr, true)) {
/* Remove the index record, which should have been
@@ -377,7 +423,6 @@ row_purge_remove_sec_if_poss_tree(
func_exit:
btr_pcur_close(&pcur); // FIXME: need this?
-func_exit_no_pcur:
mtr.commit();
return(success);
@@ -398,8 +443,6 @@ row_purge_remove_sec_if_poss_leaf(
{
mtr_t mtr;
btr_pcur_t pcur;
- enum btr_latch_mode mode;
- enum row_search_result search_result;
bool success = true;
log_free_check();
@@ -408,62 +451,27 @@ row_purge_remove_sec_if_poss_leaf(
mtr.start();
index->set_modified(mtr);
- if (!index->is_committed()) {
- /* For uncommitted spatial index, we also skip the purge. */
- if (dict_index_is_spatial(index)) {
- goto func_exit_no_pcur;
- }
-
- /* The index->online_status may change if the the
- index is or was being created online, but not
- committed yet. It is protected by index->lock. */
- mtr_s_lock_index(index, &mtr);
-
- if (dict_index_is_online_ddl(index)) {
- /* Online secondary index creation will not
- copy any delete-marked records. Therefore
- there is nothing to be purged. We must also
- skip the purge when a completed index is
- dropped by rollback_inplace_alter_table(). */
- goto func_exit_no_pcur;
- }
-
- mode = BTR_PURGE_LEAF_ALREADY_S_LATCHED;
- } else {
- /* For secondary indexes,
- index->online_status==ONLINE_INDEX_COMPLETE if
- index->is_committed(). */
- ut_ad(!dict_index_is_online_ddl(index));
-
- /* Change buffering is disabled for spatial index and
- virtual index. */
- mode = (dict_index_is_spatial(index)
- || dict_index_has_virtual(index))
- ? BTR_MODIFY_LEAF
- : BTR_PURGE_LEAF;
- }
+ pcur.btr_cur.page_cur.index = index;
/* Set the purge node for the call to row_purge_poss_sec(). */
pcur.btr_cur.purge_node = node;
- if (dict_index_is_spatial(index)) {
- rw_lock_sx_lock(dict_index_get_lock(index));
+ if (index->is_spatial()) {
pcur.btr_cur.thr = NULL;
- } else {
- /* Set the query thread, so that ibuf_insert_low() will be
- able to invoke thd_get_trx(). */
- pcur.btr_cur.thr = static_cast<que_thr_t*>(
- que_node_get_parent(node));
+ if (!rtr_search(entry, BTR_MODIFY_LEAF, &pcur, &mtr)) {
+ goto found;
+ }
+ goto func_exit;
}
- search_result = row_search_index_entry(
- index, entry, mode, &pcur, &mtr);
-
- if (dict_index_is_spatial(index)) {
- rw_lock_sx_unlock(dict_index_get_lock(index));
- }
+ /* Set the query thread, so that ibuf_insert_low() will be
+ able to invoke thd_get_trx(). */
+ pcur.btr_cur.thr = static_cast<que_thr_t*>(que_node_get_parent(node));
- switch (search_result) {
+ switch (row_search_index_entry(entry, index->has_virtual()
+ ? BTR_MODIFY_LEAF : BTR_PURGE_LEAF,
+ &pcur, &mtr)) {
case ROW_FOUND:
+found:
/* Before attempting to purge a record, check
if it is safe to do so. */
if (row_purge_poss_sec(node, index, entry, &pcur, &mtr, false)) {
@@ -483,11 +491,9 @@ row_purge_remove_sec_if_poss_leaf(
<< rec_index_print(
btr_cur_get_rec(btr_cur),
index);
- ut_ad(0);
-
- btr_pcur_close(&pcur);
-
- goto func_exit_no_pcur;
+ mtr.commit();
+ dict_set_corrupted(index, "purge");
+ goto cleanup;
}
if (index->is_spatial()) {
@@ -496,7 +502,7 @@ row_purge_remove_sec_if_poss_leaf(
if (block->page.id().page_no()
!= index->page
- && page_get_n_recs(block->frame) < 2
+ && page_get_n_recs(block->page.frame) < 2
&& !lock_test_prdt_page_lock(
btr_cur->rtr_info
&& btr_cur->rtr_info->thr
@@ -512,18 +518,12 @@ row_purge_remove_sec_if_poss_leaf(
"skip purging last"
" record on page "
<< block->page.id());
-
- btr_pcur_close(&pcur);
- mtr.commit();
- return(success);
+ goto func_exit;
}
}
- if (!btr_cur_optimistic_delete(btr_cur, 0, &mtr)) {
-
- /* The index entry could not be deleted. */
- success = false;
- }
+ success = btr_cur_optimistic_delete(btr_cur, 0, &mtr)
+ != DB_FAIL;
}
/* (The index entry is still needed,
@@ -535,9 +535,10 @@ row_purge_remove_sec_if_poss_leaf(
/* The deletion was buffered. */
case ROW_NOT_FOUND:
/* The index entry does not exist, nothing to do. */
- btr_pcur_close(&pcur); // FIXME: do we need these? when is btr_cur->rtr_info set?
-func_exit_no_pcur:
+func_exit:
mtr.commit();
+cleanup:
+ btr_pcur_close(&pcur); // FIXME: do we need these? when is btr_cur->rtr_info set?
return(success);
}
@@ -581,7 +582,7 @@ retry:
n_tries++;
- os_thread_sleep(BTR_CUR_RETRY_SLEEP_TIME);
+ std::this_thread::sleep_for(BTR_CUR_RETRY_SLEEP_TIME);
goto retry;
}
@@ -589,25 +590,6 @@ retry:
ut_a(success);
}
-/** Skip uncommitted virtual indexes on newly added virtual column.
-@param[in,out] index dict index object */
-static
-inline
-void
-row_purge_skip_uncommitted_virtual_index(
- dict_index_t*& index)
-{
- /* We need to skip virtual indexes which is not
- committed yet. It's safe because these indexes are
- newly created by alter table, and because we do
- not support LOCK=NONE when adding an index on newly
- added virtual column.*/
- while (index != NULL && dict_index_has_virtual(index)
- && !index->is_committed() && index->has_new_v_col()) {
- index = dict_table_get_next_index(index);
- }
-}
-
/***********************************************************//**
Purges a delete marking of a record.
@retval true if the row was not found, or it was successfully removed
@@ -619,34 +601,42 @@ row_purge_del_mark(
/*===============*/
purge_node_t* node) /*!< in/out: row purge node */
{
- mem_heap_t* heap;
-
- heap = mem_heap_create(1024);
-
- while (node->index != NULL) {
- /* skip corrupted secondary index */
- dict_table_skip_corrupt_index(node->index);
-
- row_purge_skip_uncommitted_virtual_index(node->index);
+ if (node->index)
+ {
+ mem_heap_t *heap= mem_heap_create(1024);
- if (!node->index) {
- break;
- }
+ do
+ {
+ const auto type= node->index->type;
+ if (type & (DICT_FTS | DICT_CORRUPT))
+ continue;
+ if (UNIV_UNLIKELY(DICT_VIRTUAL & type) && !node->index->is_committed() &&
+ node->index->has_new_v_col())
+ continue;
+ dtuple_t* entry= row_build_index_entry_low(node->row, nullptr,
+ node->index, heap,
+ ROW_BUILD_FOR_PURGE);
+ row_purge_remove_sec_if_poss(node, node->index, entry);
+ mem_heap_empty(heap);
+ }
+ while ((node->index= dict_table_get_next_index(node->index)));
- if (node->index->type != DICT_FTS) {
- dtuple_t* entry = row_build_index_entry_low(
- node->row, NULL, node->index,
- heap, ROW_BUILD_FOR_PURGE);
- row_purge_remove_sec_if_poss(node, node->index, entry);
- mem_heap_empty(heap);
- }
+ mem_heap_free(heap);
+ }
- node->index = dict_table_get_next_index(node->index);
- }
+ return row_purge_remove_clust_if_poss(node);
+}
- mem_heap_free(heap);
+void purge_sys_t::wait_SYS()
+{
+ while (must_wait_SYS())
+ std::this_thread::sleep_for(std::chrono::seconds(1));
+}
- return(row_purge_remove_clust_if_poss(node));
+void purge_sys_t::wait_FTS()
+{
+ while (must_wait_FTS())
+ std::this_thread::sleep_for(std::chrono::seconds(1));
}
/** Reset DB_TRX_ID, DB_ROLL_PTR of a clustered index record
@@ -655,6 +645,7 @@ whose old history can no longer be observed.
@param[in,out] mtr mini-transaction (will be started and committed) */
static void row_purge_reset_trx_id(purge_node_t* node, mtr_t* mtr)
{
+retry:
/* Reset DB_TRX_ID, DB_ROLL_PTR for old records. */
mtr->start();
@@ -690,6 +681,17 @@ static void row_purge_reset_trx_id(purge_node_t* node, mtr_t* mtr)
ut_ad(!rec_get_deleted_flag(
rec, rec_offs_comp(offsets))
|| rec_is_alter_metadata(rec, *index));
+ switch (node->table->id) {
+ case DICT_TABLES_ID:
+ case DICT_COLUMNS_ID:
+ case DICT_INDEXES_ID:
+ if (purge_sys.must_wait_SYS()) {
+ mtr->commit();
+ purge_sys.check_stop_SYS();
+ goto retry;
+ }
+ }
+
DBUG_LOG("purge", "reset DB_TRX_ID="
<< ib::hex(row_get_rec_trx_id(
rec, index, offsets)));
@@ -709,9 +711,9 @@ static void row_purge_reset_trx_id(purge_node_t* node, mtr_t* mtr)
size_t offs = page_offset(ptr);
mtr->memset(block, offs, DATA_TRX_ID_LEN, 0);
offs += DATA_TRX_ID_LEN;
- mtr->write<1,mtr_t::MAYBE_NOP>(*block,
- block->frame
- + offs, 0x80U);
+ mtr->write<1,mtr_t::MAYBE_NOP>(
+ *block, block->page.frame + offs,
+ 0x80U);
mtr->memset(block, offs + 1,
DATA_ROLL_PTR_LEN - 1, 0);
}
@@ -739,20 +741,25 @@ row_purge_upd_exist_or_extern_func(
ut_ad(!node->table->skip_alter_undo);
if (node->rec_type == TRX_UNDO_UPD_DEL_REC
- || (node->cmpl_info & UPD_NODE_NO_ORD_CHANGE)) {
+ || (node->cmpl_info & UPD_NODE_NO_ORD_CHANGE)
+ || !node->index) {
goto skip_secondaries;
}
heap = mem_heap_create(1024);
- while (node->index != NULL) {
- dict_table_skip_corrupt_index(node->index);
+ do {
+ const auto type = node->index->type;
- row_purge_skip_uncommitted_virtual_index(node->index);
+ if (type & (DICT_FTS | DICT_CORRUPT)) {
+ continue;
+ }
- if (!node->index) {
- break;
+ if (UNIV_UNLIKELY(DICT_VIRTUAL & type)
+ && !node->index->is_committed()
+ && node->index->has_new_v_col()) {
+ continue;
}
if (row_upd_changes_ord_field_binary(node->index, node->update,
@@ -767,9 +774,7 @@ row_purge_upd_exist_or_extern_func(
mem_heap_empty(heap);
}
-
- node->index = dict_table_get_next_index(node->index);
- }
+ } while ((node->index = dict_table_get_next_index(node->index)));
mem_heap_free(heap);
@@ -783,9 +788,6 @@ skip_secondaries:
= upd_get_nth_field(node->update, i);
if (dfield_is_ext(&ufield->new_val)) {
- trx_rseg_t* rseg;
- buf_block_t* block;
- byte* data_field;
bool is_insert;
ulint rseg_id;
uint32_t page_no;
@@ -808,11 +810,8 @@ skip_secondaries:
&is_insert, &rseg_id,
&page_no, &offset);
- rseg = trx_sys.rseg_array[rseg_id];
-
- ut_a(rseg != NULL);
- ut_ad(rseg->id == rseg_id);
- ut_ad(rseg->is_persistent());
+ const trx_rseg_t &rseg = trx_sys.rseg_array[rseg_id];
+ ut_ad(rseg.is_persistent());
mtr.start();
@@ -823,7 +822,7 @@ skip_secondaries:
index->set_modified(mtr);
- /* NOTE: we must also acquire an X-latch to the
+ /* NOTE: we must also acquire a U latch to the
root page of the tree. We will need it when we
free pages from the tree. If the tree is of height 1,
the tree X-latch does NOT protect the root page,
@@ -832,24 +831,26 @@ skip_secondaries:
latching order if we would only later latch the
root page of such a tree! */
- btr_root_get(index, &mtr);
-
- block = buf_page_get(
- page_id_t(rseg->space->id, page_no),
- 0, RW_X_LATCH, &mtr);
-
- buf_block_dbg_add_level(block, SYNC_TRX_UNDO_PAGE);
-
- data_field = buf_block_get_frame(block)
- + offset + internal_offset;
+ dberr_t err;
+ if (!btr_root_block_get(index, RW_SX_LATCH, &mtr,
+ &err)) {
+ } else if (buf_block_t* block =
+ buf_page_get(page_id_t(rseg.space->id,
+ page_no),
+ 0, RW_X_LATCH, &mtr)) {
+ byte* data_field = block->page.frame
+ + offset + internal_offset;
+
+ ut_a(dfield_get_len(&ufield->new_val)
+ >= BTR_EXTERN_FIELD_REF_SIZE);
+ btr_free_externally_stored_field(
+ index,
+ data_field
+ + dfield_get_len(&ufield->new_val)
+ - BTR_EXTERN_FIELD_REF_SIZE,
+ NULL, NULL, block, 0, false, &mtr);
+ }
- ut_a(dfield_get_len(&ufield->new_val)
- >= BTR_EXTERN_FIELD_REF_SIZE);
- btr_free_externally_stored_field(
- index,
- data_field + dfield_get_len(&ufield->new_val)
- - BTR_EXTERN_FIELD_REF_SIZE,
- NULL, NULL, block, 0, false, &mtr);
mtr.commit();
}
}
@@ -997,6 +998,7 @@ static byte *row_purge_get_partial(const byte *ptr, const dict_index_t &index,
return const_cast<byte*>(ptr);
}
+MY_ATTRIBUTE((nonnull,warn_unused_result))
/** Parses the row reference and other info in a modify undo log record.
@param[in] node row undo node
@param[in] undo_rec record to purge
@@ -1013,17 +1015,13 @@ row_purge_parse_undo_rec(
bool* updated_extern)
{
dict_index_t* clust_index;
- byte* ptr;
undo_no_t undo_no;
table_id_t table_id;
roll_ptr_t roll_ptr;
byte info_bits;
ulint type;
- ut_ad(node != NULL);
- ut_ad(thr != NULL);
-
- ptr = trx_undo_rec_get_pars(
+ const byte* ptr = trx_undo_rec_get_pars(
undo_rec, &type, &node->cmpl_info,
updated_extern, &undo_no, &table_id);
@@ -1032,6 +1030,7 @@ row_purge_parse_undo_rec(
switch (type) {
case TRX_UNDO_RENAME_TABLE:
return false;
+ case TRX_UNDO_EMPTY:
case TRX_UNDO_INSERT_METADATA:
case TRX_UNDO_INSERT_REC:
/* These records do not store any transaction identifier.
@@ -1065,10 +1064,17 @@ row_purge_parse_undo_rec(
}
try_again:
+ purge_sys.check_stop_FTS();
+
node->table = dict_table_open_on_id<true>(
table_id, false, DICT_TABLE_OP_NORMAL, node->purge_thd,
&node->mdl_ticket);
+ if (node->table == reinterpret_cast<dict_table_t*>(-1)) {
+ /* purge stop signal */
+ goto try_again;
+ }
+
if (!node->table) {
/* The table has been dropped: no need to do purge and
release mdl happened as a part of open process itself */
@@ -1094,7 +1100,7 @@ already_locked:
if (srv_shutdown_state > SRV_SHUTDOWN_NONE) {
return(false);
}
- os_thread_sleep(1000000);
+ std::this_thread::sleep_for(std::chrono::seconds(1));
goto try_again;
}
}
@@ -1122,6 +1128,9 @@ err_exit:
if (type == TRX_UNDO_INSERT_METADATA) {
node->ref = &trx_undo_metadata;
return(true);
+ } else if (type == TRX_UNDO_EMPTY) {
+ node->ref = nullptr;
+ return true;
}
ptr = trx_undo_rec_get_row_ref(ptr, clust_index, &(node->ref),
@@ -1164,18 +1173,18 @@ row_purge_record_func(
#endif /* UNIV_DEBUG || WITH_WSREP */
bool updated_extern)
{
- dict_index_t* clust_index;
- bool purged = true;
-
ut_ad(!node->found_clust);
ut_ad(!node->table->skip_alter_undo);
+ ut_ad(!trx_undo_roll_ptr_is_insert(node->roll_ptr));
- clust_index = dict_table_get_first_index(node->table);
+ node->index = dict_table_get_next_index(
+ dict_table_get_first_index(node->table));
- node->index = dict_table_get_next_index(clust_index);
- ut_ad(!trx_undo_roll_ptr_is_insert(node->roll_ptr));
+ bool purged = true;
switch (node->rec_type) {
+ case TRX_UNDO_EMPTY:
+ break;
case TRX_UNDO_DEL_MARK_REC:
purged = row_purge_del_mark(node);
if (purged) {
@@ -1205,8 +1214,8 @@ row_purge_record_func(
}
if (node->found_clust) {
+ node->found_clust = false;
btr_pcur_close(&node->pcur);
- node->found_clust = FALSE;
}
return(purged);
@@ -1232,7 +1241,7 @@ row_purge(
trx_undo_rec_t* undo_rec, /*!< in: record to purge */
que_thr_t* thr) /*!< in: query thread */
{
- if (undo_rec != &trx_purge_dummy_rec) {
+ if (undo_rec != reinterpret_cast<trx_undo_rec_t*>(-1)) {
bool updated_extern;
while (row_purge_parse_undo_rec(
@@ -1247,30 +1256,44 @@ row_purge(
}
/* Retry the purge in a second. */
- os_thread_sleep(1000000);
+ std::this_thread::sleep_for(std::chrono::seconds(1));
}
}
}
-/***********************************************************//**
-Reset the purge query thread. */
-UNIV_INLINE
-void
-row_purge_end(
-/*==========*/
- que_thr_t* thr) /*!< in: query thread */
+inline void purge_node_t::start()
{
- ut_ad(thr);
-
- thr->run_node = static_cast<purge_node_t*>(thr->run_node)->end();
+ ut_ad(in_progress);
+ DBUG_ASSERT(common.type == QUE_NODE_PURGE);
+
+ row= nullptr;
+ ref= nullptr;
+ index= nullptr;
+ update= nullptr;
+ found_clust= FALSE;
+ rec_type= ULINT_UNDEFINED;
+ cmpl_info= ULINT_UNDEFINED;
+ if (!purge_thd)
+ purge_thd= current_thd;
+}
- ut_a(thr->run_node != NULL);
+/** Reset the state at end
+@return the query graph parent */
+inline que_node_t *purge_node_t::end()
+{
+ DBUG_ASSERT(common.type == QUE_NODE_PURGE);
+ close_table();
+ ut_ad(undo_recs.empty());
+ ut_d(in_progress= false);
+ purge_thd= nullptr;
+ mem_heap_empty(heap);
+ return common.parent;
}
+
/***********************************************************//**
-Does the purge operation for a single undo log record. This is a high-level
-function used in an SQL execution graph.
-@return query thread to run next or NULL */
+Does the purge operation.
+@return query thread to run next */
que_thr_t*
row_purge_step(
/*===========*/
@@ -1282,22 +1305,15 @@ row_purge_step(
node->start();
- if (!node->undo_recs.empty()) {
+ while (!node->undo_recs.empty()) {
trx_purge_rec_t purge_rec = node->undo_recs.front();
node->undo_recs.pop();
node->roll_ptr = purge_rec.roll_ptr;
row_purge(node, purge_rec.undo_rec, thr);
-
- if (node->undo_recs.empty()) {
- row_purge_end(thr);
- } else {
- thr->run_node = node;
- }
- } else {
- row_purge_end(thr);
}
+ thr->run_node = node->end();
return(thr);
}
@@ -1324,11 +1340,11 @@ purge_node_t::validate_pcur()
return(true);
}
- if (!pcur.old_stored) {
+ if (!pcur.old_rec) {
return(true);
}
- dict_index_t* clust_index = pcur.btr_cur.index;
+ dict_index_t* clust_index = pcur.index();
rec_offs* offsets = rec_get_offsets(
pcur.old_rec, clust_index, NULL, pcur.old_n_core_fields,
diff --git a/storage/innobase/row/row0quiesce.cc b/storage/innobase/row/row0quiesce.cc
index 063fed764e8..a4d634f2d14 100644
--- a/storage/innobase/row/row0quiesce.cc
+++ b/storage/innobase/row/row0quiesce.cc
@@ -499,8 +499,6 @@ row_quiesce_table_has_fts_index(
{
bool exists = false;
- dict_mutex_enter_for_mysql();
-
for (const dict_index_t* index = UT_LIST_GET_FIRST(table->indexes);
index != 0;
index = UT_LIST_GET_NEXT(indexes, index)) {
@@ -511,8 +509,6 @@ row_quiesce_table_has_fts_index(
}
}
- dict_mutex_exit_for_mysql();
-
return(exists);
}
@@ -600,8 +596,7 @@ row_quiesce_table_complete(
<< " to complete";
}
- /* Sleep for a second. */
- os_thread_sleep(1000000);
+ std::this_thread::sleep_for(std::chrono::seconds(1));
++count;
}
@@ -685,15 +680,13 @@ row_quiesce_set_state(
dict_index_t* clust_index = dict_table_get_first_index(table);
- row_mysql_lock_data_dictionary(trx);
-
for (dict_index_t* index = dict_table_get_next_index(clust_index);
index != NULL;
index = dict_table_get_next_index(index)) {
- rw_lock_x_lock(&index->lock);
+ index->lock.x_lock(SRW_LOCK_CALL);
}
- rw_lock_x_lock(&clust_index->lock);
+ clust_index->lock.x_lock(SRW_LOCK_CALL);
switch (state) {
case QUIESCE_START:
@@ -713,11 +706,9 @@ row_quiesce_set_state(
for (dict_index_t* index = dict_table_get_first_index(table);
index != NULL;
index = dict_table_get_next_index(index)) {
- rw_lock_x_unlock(&index->lock);
+ index->lock.x_unlock();
}
- row_mysql_unlock_data_dictionary(trx);
-
return(DB_SUCCESS);
}
diff --git a/storage/innobase/row/row0row.cc b/storage/innobase/row/row0row.cc
index 30622d031ea..4a00b2a430e 100644
--- a/storage/innobase/row/row0row.cc
+++ b/storage/innobase/row/row0row.cc
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 1996, 2018, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2018, 2021, MariaDB Corporation.
+Copyright (c) 2018, 2023, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -531,7 +531,11 @@ row_build_low(
continue;
}
- ut_ad(ind_field < &index->fields[index->n_fields]);
+ if (UNIV_UNLIKELY(ind_field
+ >= &index->fields[index->n_fields])) {
+ ut_ad(rec_is_metadata(rec, *index));
+ continue;
+ }
const dict_col_t* col = dict_field_get_col(ind_field);
@@ -745,11 +749,15 @@ row_rec_to_index_entry_impl(
if (mblob == 2) {
ut_ad(info_bits == REC_INFO_METADATA_ALTER
|| info_bits == REC_INFO_METADATA_ADD);
- ut_ad(rec_len <= ulint(index->n_fields + got));
if (pad) {
+ ut_ad(rec_len <= ulint(index->n_fields + got));
rec_len = ulint(index->n_fields)
+ (info_bits == REC_INFO_METADATA_ALTER);
- } else if (!got && info_bits == REC_INFO_METADATA_ALTER) {
+ } else if (got) {
+ rec_len = std::min(rec_len,
+ ulint(index->n_fields + got));
+ } else if (info_bits == REC_INFO_METADATA_ALTER) {
+ ut_ad(rec_len <= index->n_fields);
rec_len++;
}
} else {
@@ -1175,32 +1183,28 @@ row_build_row_ref_in_tuple(
/***************************************************************//**
Searches the clustered index record for a row, if we have the row reference.
@return TRUE if found */
-ibool
+bool
row_search_on_row_ref(
/*==================*/
btr_pcur_t* pcur, /*!< out: persistent cursor, which must
be closed by the caller */
- ulint mode, /*!< in: BTR_MODIFY_LEAF, ... */
+ btr_latch_mode mode, /*!< in: BTR_MODIFY_LEAF, ... */
const dict_table_t* table, /*!< in: table */
const dtuple_t* ref, /*!< in: row reference */
mtr_t* mtr) /*!< in/out: mtr */
{
- ulint low_match;
- rec_t* rec;
- dict_index_t* index;
-
ut_ad(dtuple_check_typed(ref));
- index = dict_table_get_first_index(table);
+ dict_index_t *index = dict_table_get_first_index(table);
+ btr_pcur_init(pcur);
+ pcur->btr_cur.page_cur.index = index;
if (UNIV_UNLIKELY(ref->info_bits != 0)) {
ut_ad(ref->is_metadata());
ut_ad(ref->n_fields <= index->n_uniq);
- if (btr_pcur_open_at_index_side(
- true, index, mode, pcur, true, 0, mtr)
- != DB_SUCCESS
+ if (pcur->open_leaf(true, index, mode, mtr) != DB_SUCCESS
|| !btr_pcur_move_to_next_user_rec(pcur, mtr)) {
- return FALSE;
+ return false;
}
/* We do not necessarily have index->is_instant() here,
because we could be executing a rollback of an
@@ -1212,27 +1216,14 @@ row_search_on_row_ref(
& REC_INFO_MIN_REC_FLAG;
} else {
ut_a(ref->n_fields == index->n_uniq);
- if (btr_pcur_open(index, ref, PAGE_CUR_LE, mode, pcur, mtr)
+ if (btr_pcur_open(ref, PAGE_CUR_LE, mode, pcur, mtr)
!= DB_SUCCESS) {
- return FALSE;
+ return false;
}
}
- low_match = btr_pcur_get_low_match(pcur);
-
- rec = btr_pcur_get_rec(pcur);
-
- if (page_rec_is_infimum(rec)) {
-
- return(FALSE);
- }
-
- if (low_match != dtuple_get_n_fields(ref)) {
-
- return(FALSE);
- }
-
- return(TRUE);
+ return !page_rec_is_infimum(btr_pcur_get_rec(pcur))
+ && btr_pcur_get_low_match(pcur) == dtuple_get_n_fields(ref);
}
/*********************************************************************//**
@@ -1242,7 +1233,7 @@ on the secondary index record are preserved.
rec_t*
row_get_clust_rec(
/*==============*/
- ulint mode, /*!< in: BTR_MODIFY_LEAF, ... */
+ btr_latch_mode mode, /*!< in: BTR_MODIFY_LEAF, ... */
const rec_t* rec, /*!< in: record in a secondary index */
dict_index_t* index, /*!< in: secondary index */
dict_index_t** clust_index,/*!< out: clustered index */
@@ -1252,8 +1243,6 @@ row_get_clust_rec(
dtuple_t* ref;
dict_table_t* table;
btr_pcur_t pcur;
- ibool found;
- rec_t* clust_rec;
ut_ad(!dict_index_is_clust(index));
@@ -1263,17 +1252,12 @@ row_get_clust_rec(
ref = row_build_row_ref(ROW_COPY_POINTERS, index, rec, heap);
- found = row_search_on_row_ref(&pcur, mode, table, ref, mtr);
-
- clust_rec = found ? btr_pcur_get_rec(&pcur) : NULL;
+ auto found = row_search_on_row_ref(&pcur, mode, table, ref, mtr);
mem_heap_free(heap);
- btr_pcur_close(&pcur);
-
*clust_index = dict_table_get_first_index(table);
-
- return(clust_rec);
+ return found ? btr_pcur_get_rec(&pcur) : nullptr;
}
/***************************************************************//**
@@ -1282,9 +1266,8 @@ Searches an index record.
enum row_search_result
row_search_index_entry(
/*===================*/
- dict_index_t* index, /*!< in: index */
const dtuple_t* entry, /*!< in: index entry */
- ulint mode, /*!< in: BTR_MODIFY_LEAF, ... */
+ btr_latch_mode mode, /*!< in: BTR_MODIFY_LEAF, ... */
btr_pcur_t* pcur, /*!< in/out: persistent cursor, which must
be closed by the caller */
mtr_t* mtr) /*!< in: mtr */
@@ -1295,17 +1278,13 @@ row_search_index_entry(
ut_ad(dtuple_check_typed(entry));
- if (dict_index_is_spatial(index)) {
- ut_ad(mode & BTR_MODIFY_LEAF || mode & BTR_MODIFY_TREE);
- rtr_pcur_open(index, entry, PAGE_CUR_RTREE_LOCATE,
- mode, pcur, mtr);
- } else {
- btr_pcur_open(index, entry, PAGE_CUR_LE, mode, pcur, mtr);
+ if (btr_pcur_open(entry, PAGE_CUR_LE, mode, pcur, mtr) != DB_SUCCESS) {
+ return ROW_NOT_FOUND;
}
switch (btr_pcur_get_btr_cur(pcur)->flag) {
case BTR_CUR_DELETE_REF:
- ut_a(mode & BTR_DELETE && !dict_index_is_spatial(index));
+ ut_ad(!(~mode & BTR_DELETE));
return(ROW_NOT_DELETED_REF);
case BTR_CUR_DEL_MARK_IBUF:
diff --git a/storage/innobase/row/row0sel.cc b/storage/innobase/row/row0sel.cc
index 646526311da..d1d264a7e8a 100644
--- a/storage/innobase/row/row0sel.cc
+++ b/storage/innobase/row/row0sel.cc
@@ -2,7 +2,7 @@
Copyright (c) 1997, 2017, Oracle and/or its affiliates. All Rights Reserved.
Copyright (c) 2008, Google Inc.
-Copyright (c) 2015, 2021, MariaDB Corporation.
+Copyright (c) 2015, 2023, MariaDB Corporation.
Portions of this file contain modifications contributed and copyrighted by
Google, Inc. Those modifications are gratefully acknowledged and are described
@@ -36,6 +36,8 @@ Created 12/19/1997 Heikki Tuuri
#include "dict0boot.h"
#include "trx0undo.h"
#include "trx0trx.h"
+#include "trx0purge.h"
+#include "trx0rec.h"
#include "btr0btr.h"
#include "btr0cur.h"
#include "btr0sea.h"
@@ -54,6 +56,7 @@ Created 12/19/1997 Heikki Tuuri
#include "buf0lru.h"
#include "srv0srv.h"
#include "srv0mon.h"
+#include "sql_error.h"
#ifdef WITH_WSREP
#include "mysql/service_wsrep.h" /* For wsrep_thd_skip_locking */
#endif
@@ -282,7 +285,6 @@ row_sel_sec_rec_is_for_clust_rec(
rec_offs_init(clust_offsets_);
rec_offs_init(sec_offsets_);
-
ib_vcol_row vc(heap);
clust_offs = rec_get_offsets(clust_rec, clust_index, clust_offs,
@@ -947,6 +949,36 @@ row_sel_test_other_conds(
return(TRUE);
}
+/** Check that a clustered index record is visible in a consistent read view.
+@param rec clustered index record (in leaf page, or in memory)
+@param index clustered index
+@param offsets rec_get_offsets(rec, index)
+@param view consistent read view
+@retval DB_SUCCESS if rec is visible in view
+@retval DB_SUCCESS_LOCKED_REC if rec is not visible in view
+@retval DB_CORRUPTION if the DB_TRX_ID is corrupted */
+static dberr_t row_sel_clust_sees(const rec_t *rec, const dict_index_t &index,
+ const rec_offs *offsets,
+ const ReadView &view)
+{
+ ut_ad(index.is_primary());
+ ut_ad(page_rec_is_user_rec(rec));
+ ut_ad(rec_offs_validate(rec, &index, offsets));
+ ut_ad(!rec_is_metadata(rec, index));
+ ut_ad(!index.table->is_temporary());
+
+ const trx_id_t id= row_get_rec_trx_id(rec, &index, offsets);
+
+ if (view.changes_visible(id))
+ return DB_SUCCESS;
+ if (UNIV_LIKELY(id < view.low_limit_id() || id < trx_sys.get_max_trx_id()))
+ return DB_SUCCESS_LOCKED_REC;
+
+ ib::warn() << "A transaction id in a record of table " << index.table->name
+ << " is newer than the system-wide maximum.";
+ return DB_CORRUPTION;
+}
+
/*********************************************************************//**
Retrieves the clustered index record corresponding to a record in a
non-clustered index. Does the necessary locking.
@@ -970,7 +1002,6 @@ row_sel_get_clust_rec(
dict_index_t* index;
rec_t* clust_rec;
rec_t* old_vers;
- dberr_t err = DB_SUCCESS;
mem_heap_t* heap = NULL;
rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
rec_offs* offsets = offsets_;
@@ -978,18 +1009,21 @@ row_sel_get_clust_rec(
*out_rec = NULL;
- offsets = rec_get_offsets(rec,
- btr_pcur_get_btr_cur(&plan->pcur)->index,
- offsets,
- btr_pcur_get_btr_cur(&plan->pcur)->index
- ->n_core_fields, ULINT_UNDEFINED, &heap);
+ offsets = rec_get_offsets(rec, plan->pcur.index(), offsets,
+ plan->pcur.index()->n_core_fields,
+ ULINT_UNDEFINED, &heap);
row_build_row_ref_fast(plan->clust_ref, plan->clust_map, rec, offsets);
index = dict_table_get_first_index(plan->table);
-
- btr_pcur_open_with_no_init(index, plan->clust_ref, PAGE_CUR_LE,
- BTR_SEARCH_LEAF, &plan->clust_pcur, mtr);
+ plan->clust_pcur.old_rec = nullptr;
+ plan->clust_pcur.btr_cur.page_cur.index = index;
+ dberr_t err = btr_pcur_open_with_no_init(plan->clust_ref,
+ PAGE_CUR_LE, BTR_SEARCH_LEAF,
+ &plan->clust_pcur, mtr);
+ if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
+ goto err_exit;
+ }
clust_rec = btr_pcur_get_rec(&(plan->clust_pcur));
@@ -1000,9 +1034,10 @@ row_sel_get_clust_rec(
|| btr_pcur_get_low_match(&(plan->clust_pcur))
< dict_index_get_n_unique(index)) {
- ut_a(rec_get_deleted_flag(rec,
- dict_table_is_comp(plan->table)));
- ut_a(node->read_view);
+ if (!node->read_view ||
+ !rec_get_deleted_flag(rec, plan->table->not_redundant())) {
+ err = DB_CORRUPTION;
+ }
/* In a rare case it is possible that no clust rec is found
for a delete-marked secondary index record: if in row0umod.cc
@@ -1051,9 +1086,15 @@ row_sel_get_clust_rec(
old_vers = NULL;
- if (!lock_clust_rec_cons_read_sees(clust_rec, index, offsets,
- node->read_view)) {
+ err = row_sel_clust_sees(clust_rec, *index, offsets,
+ *node->read_view);
+ switch (err) {
+ default:
+ goto err_exit;
+ case DB_SUCCESS:
+ break;
+ case DB_SUCCESS_LOCKED_REC:
err = row_sel_build_prev_vers(
node->read_view, index, clust_rec,
&offsets, &heap, &plan->old_vers_heap,
@@ -1148,15 +1189,15 @@ sel_set_rtr_rec_lock(
return(DB_SUCCESS_LOCKED_REC);
}
- ut_ad(page_align(first_rec) == cur_block->frame);
+ ut_ad(page_align(first_rec) == cur_block->page.frame);
ut_ad(match->valid);
- rw_lock_x_lock(&(match->block.lock));
+ match->block.page.lock.x_lock();
retry:
cur_block = btr_pcur_get_block(pcur);
- ut_ad(rw_lock_own_flagged(&match->block.lock,
- RW_LOCK_FLAG_X | RW_LOCK_FLAG_S));
- ut_ad(page_is_leaf(buf_block_get_frame(cur_block)));
+ ut_ad(match->block.page.lock.have_x()
+ || match->block.page.lock.have_s());
+ ut_ad(page_is_leaf(cur_block->page.frame));
err = lock_sec_rec_read_check_and_lock(
0, cur_block, rec, index, my_offsets,
@@ -1166,31 +1207,29 @@ retry:
re_scan:
mtr->commit();
trx->error_state = err;
- que_thr_stop_for_mysql(thr);
thr->lock_state = QUE_THR_LOCK_ROW;
if (row_mysql_handle_errors(
&err, trx, thr, NULL)) {
thr->lock_state = QUE_THR_LOCK_NOLOCK;
mtr->start();
- mutex_enter(&match->rtr_match_mutex);
+ mysql_mutex_lock(&match->rtr_match_mutex);
if (!match->valid && match->matched_recs->empty()) {
- mutex_exit(&match->rtr_match_mutex);
+ mysql_mutex_unlock(&match->rtr_match_mutex);
err = DB_RECORD_NOT_FOUND;
goto func_end;
}
- mutex_exit(&match->rtr_match_mutex);
+ mysql_mutex_unlock(&match->rtr_match_mutex);
/* MDEV-14059 FIXME: why re-latch the block?
pcur is already positioned on it! */
- uint32_t page_no = page_get_page_no(
- btr_pcur_get_page(pcur));
-
cur_block = buf_page_get_gen(
- page_id_t(index->table->space_id, page_no),
- index->table->space->zip_size(),
- RW_X_LATCH, NULL, BUF_GET,
- __FILE__, __LINE__, mtr, &err);
+ btr_pcur_get_block(pcur)->page.id(),
+ btr_pcur_get_block(pcur)->zip_size(),
+ RW_X_LATCH, NULL, BUF_GET, mtr, &err);
+ if (!cur_block) {
+ goto func_end;
+ }
} else {
mtr->start();
goto func_end;
@@ -1207,6 +1246,7 @@ re_scan:
}
match->matched_recs->clear();
+ // FIXME: check for !cur_block
rtr_cur_search_with_match(
cur_block, index,
@@ -1271,7 +1311,7 @@ re_scan:
match->locked = true;
func_end:
- rw_lock_x_unlock(&(match->block.lock));
+ match->block.page.lock.x_unlock();
if (heap != NULL) {
mem_heap_free(heap);
}
@@ -1338,8 +1378,9 @@ sel_set_rec_lock(
/*********************************************************************//**
Opens a pcur to a table index. */
+MY_ATTRIBUTE((warn_unused_result, nonnull))
static
-void
+dberr_t
row_sel_open_pcur(
/*==============*/
plan_t* plan, /*!< in: table plan */
@@ -1351,6 +1392,10 @@ row_sel_open_pcur(
ulint n_fields;
ulint i;
+ ut_ad(!plan->n_rows_prefetched);
+ ut_ad(!plan->n_rows_fetched);
+ ut_ad(!plan->cursor_at_end);
+
index = plan->index;
/* Calculate the value of the search tuple: the exact match columns
@@ -1365,6 +1410,11 @@ row_sel_open_pcur(
cond = UT_LIST_GET_NEXT(cond_list, cond);
}
+ plan->pcur.old_rec = nullptr;
+ plan->pcur.btr_cur.page_cur.index = index;
+
+ dberr_t err;
+
if (plan->tuple) {
n_fields = dtuple_get_n_fields(plan->tuple);
@@ -1382,23 +1432,16 @@ row_sel_open_pcur(
que_node_get_val(exp));
}
- /* Open pcur to the index */
-
- btr_pcur_open_with_no_init(index, plan->tuple, plan->mode,
- BTR_SEARCH_LEAF, &plan->pcur, mtr);
+ err = btr_pcur_open_with_no_init(plan->tuple,
+ plan->mode, BTR_SEARCH_LEAF,
+ &plan->pcur, mtr);
} else {
- /* Open the cursor to the start or the end of the index
- (FALSE: no init) */
-
- btr_pcur_open_at_index_side(plan->asc, index, BTR_SEARCH_LEAF,
- &(plan->pcur), false, 0, mtr);
+ err = plan->pcur.open_leaf(plan->asc, index, BTR_SEARCH_LEAF,
+ mtr);
}
- ut_ad(plan->n_rows_prefetched == 0);
- ut_ad(plan->n_rows_fetched == 0);
- ut_ad(plan->cursor_at_end == FALSE);
-
- plan->pcur_is_open = TRUE;
+ plan->pcur_is_open = err == DB_SUCCESS;
+ return err;
}
/*********************************************************************//**
@@ -1422,7 +1465,7 @@ row_sel_restore_pcur_pos(
relative_position = btr_pcur_get_rel_pos(&(plan->pcur));
equal_position =
- btr_pcur_restore_position(BTR_SEARCH_LEAF, &plan->pcur, mtr) ==
+ plan->pcur.restore_position(BTR_SEARCH_LEAF, mtr) ==
btr_pcur_t::SAME_ALL;
/* If the cursor is traveling upwards, and relative_position is
@@ -1527,17 +1570,20 @@ row_sel_try_search_shortcut(
{
dict_index_t* index = plan->index;
+ ut_ad(!index->table->is_temporary());
ut_ad(node->read_view);
+ ut_ad(node->read_view->is_open());
ut_ad(plan->unique_search);
ut_ad(!plan->must_get_clust);
- row_sel_open_pcur(plan, mtr);
+ if (row_sel_open_pcur(plan, mtr) != DB_SUCCESS) {
+ return SEL_RETRY;
+ }
const rec_t* rec = btr_pcur_get_rec(&(plan->pcur));
if (!page_rec_is_user_rec(rec) || rec_is_metadata(rec, *index)) {
-retry:
- return(SEL_RETRY);
+ return SEL_RETRY;
}
ut_ad(plan->mode == PAGE_CUR_GE);
@@ -1547,8 +1593,14 @@ retry:
fields in the user record matched to the search tuple */
if (btr_pcur_get_up_match(&(plan->pcur)) < plan->n_exact_match) {
-exhausted:
- return(SEL_EXHAUSTED);
+ return SEL_EXHAUSTED;
+ }
+
+ if (trx_id_t bulk_trx_id = index->table->bulk_trx_id) {
+ /* See row_search_mvcc() for a comment on bulk_trx_id */
+ if (!node->read_view->changes_visible(bulk_trx_id)) {
+ return SEL_EXHAUSTED;
+ }
}
/* This is a non-locking consistent read: if necessary, fetch
@@ -1562,18 +1614,20 @@ exhausted:
ULINT_UNDEFINED, &heap);
if (dict_index_is_clust(index)) {
- if (!lock_clust_rec_cons_read_sees(rec, index, offsets,
- node->read_view)) {
- goto retry;
+ if (row_sel_clust_sees(rec, *index, offsets, *node->read_view)
+ != DB_SUCCESS) {
+ return SEL_RETRY;
+ }
+ } else if (!srv_read_only_mode) {
+ trx_id_t trx_id = page_get_max_trx_id(page_align(rec));
+ ut_ad(trx_id);
+ if (!node->read_view->sees(trx_id)) {
+ return SEL_RETRY;
}
- } else if (!srv_read_only_mode
- && !lock_sec_rec_cons_read_sees(
- rec, index, node->read_view)) {
- goto retry;
}
if (rec_get_deleted_flag(rec, dict_table_is_comp(plan->table))) {
- goto exhausted;
+ return SEL_EXHAUSTED;
}
/* Fetch the columns needed in test conditions. The index
@@ -1587,7 +1641,7 @@ exhausted:
/* Test the rest of search conditions */
if (!row_sel_test_other_conds(plan)) {
- goto exhausted;
+ return SEL_EXHAUSTED;
}
ut_ad(plan->pcur.latch_mode == BTR_SEARCH_LEAF);
@@ -1597,7 +1651,7 @@ exhausted:
if (UNIV_LIKELY_NULL(heap)) {
mem_heap_free(heap);
}
- return(SEL_FOUND);
+ return SEL_FOUND;
}
#endif /* BTR_CUR_HASH_ADAPT */
@@ -1618,7 +1672,6 @@ row_sel(
rec_t* rec;
rec_t* old_vers;
rec_t* clust_rec;
- ibool consistent_read;
/* The following flag becomes TRUE when we are doing a
consistent read from a non-clustered index and we must look
@@ -1641,21 +1694,11 @@ row_sel(
rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
rec_offs* offsets = offsets_;
rec_offs_init(offsets_);
+ const trx_t* trx = thr_get_trx(thr);
ut_ad(thr->run_node == node);
-
- if (node->read_view) {
- /* In consistent reads, we try to do with the hash index and
- not to use the buffer page get. This is to reduce memory bus
- load resulting from semaphore operations. The search latch
- will be s-locked when we access an index with a unique search
- condition, but not locked when we access an index with a
- less selective search condition. */
-
- consistent_read = TRUE;
- } else {
- consistent_read = FALSE;
- }
+ ut_ad(!node->read_view || node->read_view == &trx->read_view);
+ ut_ad(!node->read_view || node->read_view->is_open());
table_loop:
/* TABLE LOOP
@@ -1690,7 +1733,7 @@ table_loop:
mtr.start();
#ifdef BTR_CUR_HASH_ADAPT
- if (consistent_read && plan->unique_search && !plan->pcur_is_open
+ if (node->read_view && plan->unique_search && !plan->pcur_is_open
&& !plan->must_get_clust) {
switch (row_sel_try_search_shortcut(node, plan, &mtr)) {
case SEL_FOUND:
@@ -1714,7 +1757,11 @@ table_loop:
if (!plan->pcur_is_open) {
/* Evaluate the expressions to build the search tuple and
open the cursor */
- row_sel_open_pcur(plan, &mtr);
+ err = row_sel_open_pcur(plan, &mtr);
+
+ if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
+ goto mtr_commit_exit;
+ }
cursor_just_opened = TRUE;
@@ -1735,6 +1782,15 @@ table_loop:
}
}
+ if (!node->read_view
+ || trx->isolation_level == TRX_ISO_READ_UNCOMMITTED) {
+ } else if (trx_id_t bulk_trx_id = index->table->bulk_trx_id) {
+ /* See row_search_mvcc() for a comment on bulk_trx_id */
+ if (!trx->read_view.changes_visible(bulk_trx_id)) {
+ goto table_exhausted;
+ }
+ }
+
rec_loop:
/* RECORD LOOP
-----------
@@ -1766,12 +1822,13 @@ rec_loop:
and it might be that these new records should appear in the
search result set, resulting in the phantom problem. */
- if (!consistent_read) {
- rec_t* next_rec = page_rec_get_next(rec);
+ if (!node->read_view) {
+ const rec_t* next_rec = page_rec_get_next_const(rec);
+ if (UNIV_UNLIKELY(!next_rec)) {
+ err = DB_CORRUPTION;
+ goto lock_wait_or_error;
+ }
unsigned lock_type;
- trx_t* trx;
-
- trx = thr_get_trx(thr);
offsets = rec_get_offsets(next_rec, index, offsets,
index->n_core_fields,
@@ -1829,17 +1886,14 @@ skip_lock:
goto next_rec;
}
- if (!consistent_read) {
+ if (!node->read_view) {
/* Try to place a lock on the index record */
unsigned lock_type;
- trx_t* trx;
offsets = rec_get_offsets(rec, index, offsets,
index->n_core_fields,
ULINT_UNDEFINED, &heap);
- trx = thr_get_trx(thr);
-
/* At READ UNCOMMITTED or READ COMMITTED isolation level,
we lock only the record, i.e., next-key locking is
not used. */
@@ -1923,14 +1977,20 @@ skip_lock:
offsets = rec_get_offsets(rec, index, offsets, index->n_core_fields,
ULINT_UNDEFINED, &heap);
- if (consistent_read) {
+ if (node->read_view) {
/* This is a non-locking consistent read: if necessary, fetch
a previous version of the record */
if (dict_index_is_clust(index)) {
+ const trx_id_t id = row_get_rec_trx_id(
+ rec, index, offsets);
- if (!lock_clust_rec_cons_read_sees(
- rec, index, offsets, node->read_view)) {
+ if (!node->read_view->changes_visible(id)) {
+ if (id >= node->read_view->low_limit_id()
+ && id >= trx_sys.get_max_trx_id()) {
+ err = DB_CORRUPTION;
+ goto lock_wait_or_error;
+ }
err = row_sel_build_prev_vers(
node->read_view, index, rec,
@@ -1979,11 +2039,12 @@ skip_lock:
rec = old_vers;
}
- } else if (!srv_read_only_mode
- && !lock_sec_rec_cons_read_sees(
- rec, index, node->read_view)) {
-
- cons_read_requires_clust_rec = TRUE;
+ } else if (!srv_read_only_mode) {
+ trx_id_t trx_id = page_get_max_trx_id(page_align(rec));
+ ut_ad(trx_id);
+ if (!node->read_view->sees(trx_id)) {
+ cons_read_requires_clust_rec = TRUE;
+ }
}
}
@@ -2049,7 +2110,7 @@ skip_lock:
if (clust_rec == NULL) {
/* The record did not exist in the read view */
- ut_ad(consistent_read);
+ ut_ad(node->read_view);
goto next_rec;
}
@@ -2242,11 +2303,8 @@ stop_for_a_while:
plan->stored_cursor_rec_processed = FALSE;
btr_pcur_store_position(&(plan->pcur), &mtr);
- mtr.commit();
- ut_ad(!sync_check_iterate(sync_check()));
-
err = DB_SUCCESS;
- goto func_exit;
+ goto mtr_commit_exit;
commit_mtr_for_a_while:
/* Stores the cursor position and commits &mtr; this is used if
@@ -2260,7 +2318,6 @@ commit_mtr_for_a_while:
mtr.commit();
mtr_has_extra_clust_latch = FALSE;
- ut_ad(!sync_check_iterate(dict_sync_check()));
goto table_loop;
@@ -2271,12 +2328,10 @@ lock_wait_or_error:
plan->stored_cursor_rec_processed = FALSE;
btr_pcur_store_position(&(plan->pcur), &mtr);
-
+mtr_commit_exit:
mtr.commit();
func_exit:
- ut_ad(!sync_check_iterate(dict_sync_check()));
-
if (heap != NULL) {
mem_heap_free(heap);
}
@@ -2340,8 +2395,8 @@ row_sel_step(
que_node_get_next(table_node))) {
dberr_t err = lock_table(
- 0, table_node->table, i_lock_mode,
- thr);
+ table_node->table, nullptr,
+ i_lock_mode, thr);
if (err != DB_SUCCESS) {
trx_t* trx;
@@ -2702,7 +2757,7 @@ row_sel_convert_mysql_key_to_innobase(
<< ". Last data field length "
<< data_field_len << " bytes, key ptr now"
" exceeds key end by " << (key_ptr - key_end)
- << " bytes. Key value in the MySQL format:";
+ << " bytes. Key value in the MariaDB format:";
ut_print_buf(stderr, original_key_ptr, key_len);
putc('\n', stderr);
@@ -3202,6 +3257,14 @@ static bool row_sel_store_mysql_rec(
DBUG_RETURN(true);
}
+static void row_sel_reset_old_vers_heap(row_prebuilt_t *prebuilt)
+{
+ if (prebuilt->old_vers_heap)
+ mem_heap_empty(prebuilt->old_vers_heap);
+ else
+ prebuilt->old_vers_heap= mem_heap_create(200);
+}
+
/*********************************************************************//**
Builds a previous version of a clustered index record for a consistent read
@return DB_SUCCESS or error code */
@@ -3209,9 +3272,8 @@ static MY_ATTRIBUTE((warn_unused_result))
dberr_t
row_sel_build_prev_vers_for_mysql(
/*==============================*/
- ReadView* read_view, /*!< in: read view */
+ row_prebuilt_t* prebuilt, /*!< in/out: prebuilt struct */
dict_index_t* clust_index, /*!< in: clustered index */
- row_prebuilt_t* prebuilt, /*!< in: prebuilt struct */
const rec_t* rec, /*!< in: record in a clustered index */
rec_offs** offsets, /*!< in/out: offsets returned by
rec_get_offsets(rec, clust_index) */
@@ -3225,18 +3287,12 @@ row_sel_build_prev_vers_for_mysql(
column data */
mtr_t* mtr) /*!< in: mtr */
{
- dberr_t err;
+ row_sel_reset_old_vers_heap(prebuilt);
- if (prebuilt->old_vers_heap) {
- mem_heap_empty(prebuilt->old_vers_heap);
- } else {
- prebuilt->old_vers_heap = mem_heap_create(200);
- }
-
- err = row_vers_build_for_consistent_read(
- rec, mtr, clust_index, offsets, read_view, offset_heap,
+ return row_vers_build_for_consistent_read(
+ rec, mtr, clust_index, offsets,
+ &prebuilt->trx->read_view, offset_heap,
prebuilt->old_vers_heap, old_vers, vrow);
- return(err);
}
/** Helper class to cache clust_rec and old_vers */
@@ -3313,11 +3369,10 @@ Row_sel_get_clust_rec_for_mysql::operator()(
access the clustered index */
{
dict_index_t* clust_index;
- const rec_t* clust_rec;
rec_t* old_vers;
- dberr_t err;
trx_t* trx;
+ prebuilt->clust_pcur->old_rec = nullptr;
*out_rec = NULL;
trx = thr_get_trx(thr);
@@ -3328,12 +3383,16 @@ Row_sel_get_clust_rec_for_mysql::operator()(
sec_index, *offsets);
clust_index = dict_table_get_first_index(sec_index->table);
+ prebuilt->clust_pcur->btr_cur.page_cur.index = clust_index;
- btr_pcur_open_with_no_init(clust_index, prebuilt->clust_ref,
- PAGE_CUR_LE, BTR_SEARCH_LEAF,
- prebuilt->clust_pcur, mtr);
+ dberr_t err = btr_pcur_open_with_no_init(prebuilt->clust_ref,
+ PAGE_CUR_LE, BTR_SEARCH_LEAF,
+ prebuilt->clust_pcur, mtr);
+ if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
+ return err;
+ }
- clust_rec = btr_pcur_get_rec(prebuilt->clust_pcur);
+ const rec_t* clust_rec = btr_pcur_get_rec(prebuilt->clust_pcur);
prebuilt->clust_pcur->trx_if_known = trx;
@@ -3352,35 +3411,29 @@ Row_sel_get_clust_rec_for_mysql::operator()(
if (dict_index_is_spatial(sec_index)
&& btr_cur->rtr_info->matches
&& (page_align(rec)
- == btr_cur->rtr_info->matches->block.frame
+ == btr_cur->rtr_info->matches->block.page.frame
|| rec != btr_pcur_get_rec(prebuilt->pcur))) {
#ifdef UNIV_DEBUG
rtr_info_t* rtr_info = btr_cur->rtr_info;
- mutex_enter(&rtr_info->matches->rtr_match_mutex);
+ mysql_mutex_lock(&rtr_info->matches->rtr_match_mutex);
/* The page could be deallocated (by rollback etc.) */
if (!rtr_info->matches->valid) {
- mutex_exit(&rtr_info->matches->rtr_match_mutex);
+ mysql_mutex_unlock(&rtr_info->matches->rtr_match_mutex);
clust_rec = NULL;
-
- err = DB_SUCCESS;
goto func_exit;
}
- mutex_exit(&rtr_info->matches->rtr_match_mutex);
+ mysql_mutex_unlock(&rtr_info->matches->rtr_match_mutex);
if (rec_get_deleted_flag(rec,
dict_table_is_comp(sec_index->table))
&& prebuilt->select_lock_type == LOCK_NONE) {
clust_rec = NULL;
-
- err = DB_SUCCESS;
goto func_exit;
}
if (rec != btr_pcur_get_rec(prebuilt->pcur)) {
clust_rec = NULL;
-
- err = DB_SUCCESS;
goto func_exit;
}
@@ -3390,32 +3443,31 @@ Row_sel_get_clust_rec_for_mysql::operator()(
buf_block_t* block = buf_page_get_gen(
btr_pcur_get_block(prebuilt->pcur)->page.id(),
btr_pcur_get_block(prebuilt->pcur)->zip_size(),
- RW_NO_LATCH, NULL, BUF_GET,
- __FILE__, __LINE__, mtr, &err);
+ RW_NO_LATCH, NULL, BUF_GET, mtr, &err);
+ ut_ad(block); // FIXME: avoid crash
mem_heap_t* heap = mem_heap_create(256);
dtuple_t* tuple = dict_index_build_data_tuple(
rec, sec_index, true,
sec_index->n_fields, heap);
page_cur_t page_cursor;
-
- ulint low_match = page_cur_search(
- block, sec_index, tuple,
- PAGE_CUR_LE, &page_cursor);
-
+ page_cursor.block = block;
+ page_cursor.index = sec_index;
+ ulint up_match = 0, low_match = 0;
+ ut_ad(!page_cur_search_with_match(tuple, PAGE_CUR_LE,
+ &up_match,
+ &low_match,
+ &page_cursor,
+ nullptr));
ut_ad(low_match < dtuple_get_n_fields_cmp(tuple));
mem_heap_free(heap);
- clust_rec = NULL;
-
err = DB_SUCCESS;
- goto func_exit;
#endif /* UNIV_DEBUG */
} else if (!rec_get_deleted_flag(rec,
dict_table_is_comp(sec_index->table))
- || prebuilt->select_lock_type != LOCK_NONE) {
+ || prebuilt->select_lock_type != LOCK_NONE) {
/* In a rare case it is possible that no clust
rec is found for a delete-marked secondary index
- record: if in row0umod.cc in
- row_undo_mod_remove_clust_low() we have already removed
+ record: if row_undo_mod_clust() has already removed
the clust rec, while purge is still cleaning and
removing secondary index records associated with
earlier versions of the clustered index record.
@@ -3430,17 +3482,10 @@ Row_sel_get_clust_rec_for_mysql::operator()(
fputs("\n"
"InnoDB: clust index record ", stderr);
rec_print(stderr, clust_rec, clust_index);
- putc('\n', stderr);
- trx_print(stderr, trx, 600);
- fputs("\n"
- "InnoDB: Submit a detailed bug report"
- " to https://jira.mariadb.org/\n", stderr);
- ut_ad(0);
+ err = DB_CORRUPTION;
}
clust_rec = NULL;
-
- err = DB_SUCCESS;
goto func_exit;
}
@@ -3465,7 +3510,7 @@ Row_sel_get_clust_rec_for_mysql::operator()(
case DB_SUCCESS_LOCKED_REC:
break;
default:
- goto err_exit;
+ return err;
}
} else {
/* This is a non-locking consistent read: if necessary, fetch
@@ -3473,13 +3518,22 @@ Row_sel_get_clust_rec_for_mysql::operator()(
old_vers = NULL;
- /* If the isolation level allows reading of uncommitted data,
- then we never look for an earlier version */
+ if (trx->isolation_level == TRX_ISO_READ_UNCOMMITTED
+ || clust_index->table->is_temporary()) {
+ } else {
+ /* If the isolation level allows reading of
+ uncommitted data, then we never look for an
+ earlier version */
+ err = row_sel_clust_sees(clust_rec, *clust_index,
+ *offsets, trx->read_view);
+ }
- if (trx->isolation_level > TRX_ISO_READ_UNCOMMITTED
- && !lock_clust_rec_cons_read_sees(
- clust_rec, clust_index, *offsets,
- &trx->read_view)) {
+ switch (err) {
+ default:
+ return err;
+ case DB_SUCCESS:
+ break;
+ case DB_SUCCESS_LOCKED_REC:
const buf_page_t& bpage = btr_pcur_get_block(
prebuilt->clust_pcur)->page;
@@ -3492,13 +3546,12 @@ Row_sel_get_clust_rec_for_mysql::operator()(
/* The following call returns 'offsets' associated with
'old_vers' */
err = row_sel_build_prev_vers_for_mysql(
- &trx->read_view, clust_index, prebuilt,
+ prebuilt, clust_index,
clust_rec, offsets, offset_heap, &old_vers,
vrow, mtr);
- if (err != DB_SUCCESS) {
-
- goto err_exit;
+ if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
+ return err;
}
cached_lsn = lsn;
cached_page_id = bpage.id();
@@ -3522,7 +3575,7 @@ Row_sel_get_clust_rec_for_mysql::operator()(
}
if (old_vers == NULL) {
- goto err_exit;
+ return err;
}
clust_rec = old_vers;
@@ -3558,7 +3611,7 @@ Row_sel_get_clust_rec_for_mysql::operator()(
case DB_SUCCESS_LOCKED_REC:
break;
default:
- goto err_exit;
+ return err;
}
}
@@ -3575,8 +3628,7 @@ func_exit:
btr_pcur_store_position(prebuilt->clust_pcur, mtr);
}
-err_exit:
- return(err);
+ return err;
}
/** Restores cursor position after it has been stored. We have to take into
@@ -3591,10 +3643,11 @@ record with the same ordering prefix in in the B-tree index
@return true if we may need to process the record the cursor is now
positioned on (i.e. we should not go to the next record yet) */
static bool sel_restore_position_for_mysql(bool *same_user_rec,
- ulint latch_mode, btr_pcur_t *pcur,
+ btr_latch_mode latch_mode,
+ btr_pcur_t *pcur,
bool moves_up, mtr_t *mtr)
{
- auto status = btr_pcur_restore_position(latch_mode, pcur, mtr);
+ auto status = pcur->restore_position(latch_mode, mtr);
*same_user_rec = status == btr_pcur_t::SAME_ALL;
@@ -3620,7 +3673,7 @@ static bool sel_restore_position_for_mysql(bool *same_user_rec,
next:
if (btr_pcur_move_to_next(pcur, mtr)
&& rec_is_metadata(btr_pcur_get_rec(pcur),
- *pcur->btr_cur.index)) {
+ *pcur->index())) {
btr_pcur_move_to_next(pcur, mtr);
}
@@ -3636,8 +3689,10 @@ next:
prev:
if (btr_pcur_is_on_user_rec(pcur) && !moves_up
&& !rec_is_metadata(btr_pcur_get_rec(pcur),
- *pcur->btr_cur.index)) {
- btr_pcur_move_to_prev(pcur, mtr);
+ *pcur->index())) {
+ if (!btr_pcur_move_to_prev(pcur, mtr)) {
+ return true;
+ }
}
return true;
case BTR_PCUR_BEFORE:
@@ -3910,16 +3965,22 @@ row_sel_try_search_shortcut_for_mysql(
trx_t* trx = prebuilt->trx;
const rec_t* rec;
- ut_ad(dict_index_is_clust(index));
+ ut_ad(index->is_primary());
+ ut_ad(!index->table->is_temporary());
ut_ad(!prebuilt->templ_contains_blob);
+ ut_ad(trx->read_view.is_open());
+ pcur->old_rec = nullptr;
+
+ if (btr_pcur_open_with_no_init(search_tuple, PAGE_CUR_GE,
+ BTR_SEARCH_LEAF, pcur, mtr)
+ != DB_SUCCESS) {
+ return SEL_RETRY;
+ }
- btr_pcur_open_with_no_init(index, search_tuple, PAGE_CUR_GE,
- BTR_SEARCH_LEAF, pcur, mtr);
rec = btr_pcur_get_rec(pcur);
if (!page_rec_is_user_rec(rec) || rec_is_metadata(rec, *index)) {
-retry:
- return(SEL_RETRY);
+ return SEL_RETRY;
}
/* As the cursor is now placed on a user record after a search with
@@ -3927,8 +3988,15 @@ retry:
fields in the user record matched to the search tuple */
if (btr_pcur_get_up_match(pcur) < dtuple_get_n_fields(search_tuple)) {
-exhausted:
- return(SEL_EXHAUSTED);
+ return SEL_EXHAUSTED;
+ }
+
+ if (trx->isolation_level == TRX_ISO_READ_UNCOMMITTED) {
+ } else if (trx_id_t bulk_trx_id = index->table->bulk_trx_id) {
+ /* See row_search_mvcc() for a comment on bulk_trx_id */
+ if (!trx->read_view.changes_visible(bulk_trx_id)) {
+ return SEL_EXHAUSTED;
+ }
}
/* This is a non-locking consistent read: if necessary, fetch
@@ -3937,21 +4005,21 @@ exhausted:
*offsets = rec_get_offsets(rec, index, *offsets, index->n_core_fields,
ULINT_UNDEFINED, heap);
- if (!lock_clust_rec_cons_read_sees(rec, index, *offsets,
- &trx->read_view)) {
- goto retry;
+ if (row_sel_clust_sees(rec, *index, *offsets, trx->read_view)
+ != DB_SUCCESS) {
+ return SEL_RETRY;
}
if (rec_get_deleted_flag(rec, dict_table_is_comp(index->table))) {
/* In delete-marked records, DB_TRX_ID must
always refer to an existing undo log record. */
ut_ad(row_get_rec_trx_id(rec, index, *offsets));
- goto exhausted;
+ return SEL_EXHAUSTED;
}
*out_rec = rec;
- return(SEL_FOUND);
+ return SEL_FOUND;
}
#endif /* BTR_CUR_HASH_ADAPT */
@@ -4317,25 +4385,29 @@ row_search_mvcc(
DBUG_RETURN(DB_END_OF_INDEX);
}
- ut_ad(!sync_check_iterate(sync_check()));
-
if (!prebuilt->table->space) {
DBUG_RETURN(DB_TABLESPACE_DELETED);
} else if (!prebuilt->table->is_readable()) {
- DBUG_RETURN(prebuilt->table->space
- ? DB_DECRYPTION_FAILED
- : DB_TABLESPACE_NOT_FOUND);
+ if (fil_space_crypt_t* crypt_data =
+ prebuilt->table->space->crypt_data) {
+ if (crypt_data->should_encrypt()) {
+ DBUG_RETURN(DB_DECRYPTION_FAILED);
+ }
+ }
+ DBUG_RETURN(DB_CORRUPTION);
} else if (!prebuilt->index_usable) {
DBUG_RETURN(DB_MISSING_HISTORY);
} else if (prebuilt->index->is_corrupted()) {
DBUG_RETURN(DB_CORRUPTION);
}
+ pcur->btr_cur.page_cur.index = index;
+
/* We need to get the virtual column values stored in secondary
index key, if this is covered index scan or virtual key read is
requested. */
- bool need_vrow = dict_index_has_virtual(prebuilt->index)
- && prebuilt->read_just_key;
+ bool need_vrow = prebuilt->read_just_key
+ && prebuilt->index->has_virtual();
/* Reset the new record lock info if READ UNCOMMITTED or
READ COMMITED isolation level is used. Then
@@ -4472,6 +4544,7 @@ early_not_found:
&& unique_search
&& btr_search_enabled
&& dict_index_is_clust(index)
+ && !index->table->is_temporary()
&& !prebuilt->templ_contains_blob
&& !prebuilt->used_in_HANDLER
&& (prebuilt->mysql_row_len < srv_page_size / 8)) {
@@ -4549,7 +4622,6 @@ aborted:
/* NOTE that we do NOT store the cursor
position */
trx->op_info = "";
- ut_ad(!sync_check_iterate(sync_check()));
ut_ad(!did_semi_consistent_read);
if (UNIV_LIKELY_NULL(heap)) {
mem_heap_free(heap);
@@ -4575,16 +4647,22 @@ aborted:
spatial_search = dict_index_is_spatial(index)
&& mode >= PAGE_CUR_CONTAIN;
+#ifdef UNIV_DEBUG
/* The state of a running trx can only be changed by the
thread that is currently serving the transaction. Because we
are that thread, we can read trx->state without holding any
mutex. */
- ut_ad(prebuilt->sql_stat_start
- || trx->state == TRX_STATE_ACTIVE
- || (prebuilt->table->no_rollback()
- && trx->state == TRX_STATE_NOT_STARTED));
-
- ut_ad(!trx_is_started(trx) || trx->state == TRX_STATE_ACTIVE);
+ switch (trx->state) {
+ case TRX_STATE_ACTIVE:
+ break;
+ case TRX_STATE_NOT_STARTED:
+ ut_ad(prebuilt->sql_stat_start
+ || prebuilt->table->no_rollback());
+ break;
+ default:
+ ut_ad("invalid trx->state" == 0);
+ }
+#endif
ut_ad(prebuilt->sql_stat_start
|| prebuilt->select_lock_type != LOCK_NONE
@@ -4617,8 +4695,6 @@ aborted:
thr = que_fork_get_first_thr(prebuilt->sel_graph);
- thr->start_running();
-
clust_index = dict_table_get_first_index(prebuilt->table);
dberr_t err = DB_SUCCESS;
@@ -4641,7 +4717,7 @@ aborted:
trx->read_view.open(trx);
} else {
wait_table_again:
- err = lock_table(0, prebuilt->table,
+ err = lock_table(prebuilt->table, nullptr,
prebuilt->select_lock_type == LOCK_S
? LOCK_IS : LOCK_IX, thr);
@@ -4667,6 +4743,15 @@ wait_table_again:
pcur, moves_up, &mtr);
if (UNIV_UNLIKELY(need_to_process)) {
+ if (UNIV_UNLIKELY(!btr_pcur_get_rec(pcur))) {
+ mtr.commit();
+ trx->op_info = "";
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+ return DB_CORRUPTION;
+ }
+
if (UNIV_UNLIKELY(prebuilt->row_read_type
== ROW_READ_DID_SEMI_CONSISTENT)) {
/* We did a semi-consistent read,
@@ -4684,13 +4769,14 @@ wait_table_again:
pessimistic locking read, the record
cannot be skipped. */
- goto next_rec;
+ goto next_rec_after_check;
}
} else if (dtuple_get_n_fields(search_tuple) > 0) {
pcur->btr_cur.thr = thr;
+ pcur->old_rec = nullptr;
- if (dict_index_is_spatial(index)) {
+ if (index->is_spatial()) {
if (!prebuilt->rtr_info) {
prebuilt->rtr_info = rtr_create_rtr_info(
set_also_gap_locks, true,
@@ -4706,12 +4792,16 @@ wait_table_again:
prebuilt->rtr_info->search_tuple = search_tuple;
prebuilt->rtr_info->search_mode = mode;
}
- }
- err = btr_pcur_open_with_no_init(index, search_tuple, mode,
- BTR_SEARCH_LEAF, pcur, &mtr);
+ err = rtr_search_leaf(pcur, search_tuple, mode, &mtr);
+ } else {
+ err = btr_pcur_open_with_no_init(search_tuple, mode,
+ BTR_SEARCH_LEAF,
+ pcur, &mtr);
+ }
if (err != DB_SUCCESS) {
+page_corrupted:
rec = NULL;
goto page_read_error;
}
@@ -4729,6 +4819,10 @@ wait_table_again:
/* Try to place a gap lock on the next index record
to prevent phantoms in ORDER BY ... DESC queries */
const rec_t* next_rec = page_rec_get_next_const(rec);
+ if (UNIV_UNLIKELY(!next_rec)) {
+ err = DB_CORRUPTION;
+ goto page_corrupted;
+ }
offsets = rec_get_offsets(next_rec, index, offsets,
index->n_core_fields,
@@ -4749,25 +4843,66 @@ wait_table_again:
}
}
} else if (mode == PAGE_CUR_G || mode == PAGE_CUR_L) {
- err = btr_pcur_open_at_index_side(
- mode == PAGE_CUR_G, index, BTR_SEARCH_LEAF,
- pcur, false, 0, &mtr);
+ err = pcur->open_leaf(mode == PAGE_CUR_G, index,
+ BTR_SEARCH_LEAF, &mtr);
if (err != DB_SUCCESS) {
if (err == DB_DECRYPTION_FAILED) {
- ib_push_warning(trx->mysql_thd,
- DB_DECRYPTION_FAILED,
- "Table %s is encrypted but encryption service or"
- " used key_id is not available. "
- " Can't continue reading table.",
- prebuilt->table->name.m_name);
- index->table->file_unreadable = true;
+ btr_decryption_failed(*index);
}
rec = NULL;
goto page_read_error;
}
}
+ /* Check if the table is supposed to be empty for our read view.
+
+ If we read bulk_trx_id as an older transaction ID, it is not
+ incorrect to check here whether that transaction should be
+ visible to us. If bulk_trx_id is not visible to us, the table
+ must have been empty at an earlier point of time, also in our
+ read view.
+
+ An INSERT would only update bulk_trx_id in
+ row_ins_clust_index_entry_low() if the table really was empty
+ (everything had been purged), when holding a leaf page latch
+ in the clustered index (actually, the root page is the only
+ leaf page in that case).
+
+ We are already holding a leaf page latch here, either
+ in a secondary index or in a clustered index.
+
+ If we are holding a clustered index page latch, there clearly
+ is no potential for race condition with a concurrent INSERT:
+ such INSERT would be blocked by us.
+
+ If we are holding a secondary index page latch, then we are
+ not directly blocking a concurrent INSERT that might update
+ bulk_trx_id to something that does not exist in our read view.
+ But, in that case, the entire table (all indexes) must have
+ been empty. So, even if our read below missed the update of
+ index->table->bulk_trx_id, we can safely proceed to reading
+ the empty secondary index page. Our latch will prevent the
+ INSERT from proceeding to that page. It will first modify
+ the clustered index. Also, we may only look up something in
+ the clustered index if the secondary index page is not empty
+ to begin with. So, only if the table is corrupted
+ (the clustered index is empty but the secondary index is not)
+ we could return corrupted results. */
+ if (trx->isolation_level == TRX_ISO_READ_UNCOMMITTED
+ || !trx->read_view.is_open()) {
+ } else if (trx_id_t bulk_trx_id = index->table->bulk_trx_id) {
+ /* InnoDB should allow the transaction to read all
+ the rows when InnoDB intends to do any locking
+ on the record */
+ if (prebuilt->select_lock_type == LOCK_NONE
+ && !trx->read_view.changes_visible(bulk_trx_id)) {
+ trx->op_info = "";
+ err = DB_END_OF_INDEX;
+ goto normal_return;
+ }
+ }
+
rec_loop:
DEBUG_SYNC_C("row_search_rec_loop");
if (trx_is_interrupted(trx)) {
@@ -4783,11 +4918,6 @@ rec_loop:
rec = btr_pcur_get_rec(pcur);
- if (!index->table->is_readable()) {
- err = DB_DECRYPTION_FAILED;
- goto page_read_error;
- }
-
ut_ad(!!page_rec_is_comp(rec) == comp);
ut_ad(page_rec_is_leaf(rec));
@@ -4904,7 +5034,7 @@ wrong_offs:
page_cur_set_after_last(btr_pcur_get_block(pcur),
btr_pcur_get_page_cur(pcur));
- pcur->old_stored = false;
+ pcur->old_rec = nullptr;
goto next_rec;
}
}
@@ -5135,17 +5265,18 @@ no_gap_lock:
!= ROW_READ_TRY_SEMI_CONSISTENT)
|| unique_search
|| index != clust_index) {
-
- goto lock_wait_or_error;
+ if (!prebuilt->skip_locked) {
+ goto lock_wait_or_error;
+ }
+ } else {
+ /* The following call returns 'offsets'
+ associated with 'old_vers' */
+ row_sel_build_committed_vers_for_mysql(
+ clust_index, prebuilt, rec,
+ &offsets, &heap, &old_vers,
+ need_vrow ? &vrow : NULL, &mtr);
}
- /* The following call returns 'offsets'
- associated with 'old_vers' */
- row_sel_build_committed_vers_for_mysql(
- clust_index, prebuilt, rec,
- &offsets, &heap, &old_vers, need_vrow ? &vrow : NULL,
- &mtr);
-
/* Check whether it was a deadlock or not, if not
a deadlock and the transaction had to wait then
release the lock it is waiting on. */
@@ -5154,6 +5285,8 @@ no_gap_lock:
switch (err) {
case DB_SUCCESS:
+ ut_ad(
+ !trx->lock.was_chosen_as_deadlock_victim);
/* The lock was granted while we were
searching for the last committed version.
Do a normal locking read. */
@@ -5168,7 +5301,16 @@ no_gap_lock:
case DB_LOCK_WAIT:
ut_ad(!dict_index_is_spatial(index));
err = DB_SUCCESS;
+ if (prebuilt->skip_locked) {
+ goto next_rec;
+ }
break;
+ case DB_LOCK_WAIT_TIMEOUT:
+ if (prebuilt->skip_locked) {
+ err = DB_SUCCESS;
+ goto next_rec;
+ }
+ /* fall through */
default:
ut_error;
}
@@ -5188,7 +5330,13 @@ no_gap_lock:
} else {
goto lock_wait_or_error;
}
-
+ break;
+ case DB_LOCK_WAIT_TIMEOUT:
+ if (prebuilt->skip_locked) {
+ err = DB_SUCCESS;
+ goto next_rec;
+ }
+ /* fall through */
default:
goto lock_wait_or_error;
@@ -5198,6 +5346,7 @@ no_gap_lock:
a previous version of the record */
if (trx->isolation_level == TRX_ISO_READ_UNCOMMITTED
+ || prebuilt->table->is_temporary()
|| prebuilt->table->no_rollback()) {
/* Do nothing: we let a non-locking SELECT read the
@@ -5210,18 +5359,24 @@ no_gap_lock:
high force recovery level set, we try to avoid crashes
by skipping this lookup */
- if (!lock_clust_rec_cons_read_sees(
- rec, index, offsets, &trx->read_view)) {
+ err = row_sel_clust_sees(rec, *index, offsets,
+ trx->read_view);
+
+ switch (err) {
+ default:
+ goto lock_wait_or_error;
+ case DB_SUCCESS:
+ break;
+ case DB_SUCCESS_LOCKED_REC:
ut_ad(srv_force_recovery
< SRV_FORCE_NO_UNDO_LOG_SCAN);
rec_t* old_vers;
/* The following call returns 'offsets'
associated with 'old_vers' */
err = row_sel_build_prev_vers_for_mysql(
- &trx->read_view, clust_index,
- prebuilt, rec, &offsets, &heap,
- &old_vers, need_vrow ? &vrow : NULL,
- &mtr);
+ prebuilt, clust_index,
+ rec, &offsets, &heap, &old_vers,
+ need_vrow ? &vrow : nullptr, &mtr);
if (err != DB_SUCCESS) {
@@ -5246,9 +5401,13 @@ no_gap_lock:
ut_ad(!dict_index_is_clust(index));
- if (!srv_read_only_mode
- && !lock_sec_rec_cons_read_sees(
- rec, index, &trx->read_view)) {
+ if (!srv_read_only_mode) {
+ trx_id_t trx_id = page_get_max_trx_id(
+ page_align(rec));
+ ut_ad(trx_id);
+ if (trx->read_view.sees(trx_id)) {
+ goto locks_ok;
+ }
/* We should look at the clustered index.
However, as this is a non-locking read,
we can skip the clustered index lookup if
@@ -5358,13 +5517,15 @@ requires_clust_rec:
&offsets, &heap,
need_vrow ? &vrow : NULL,
&mtr);
+ if (err == DB_LOCK_WAIT && prebuilt->skip_locked) {
+ err = lock_trx_handle_wait(trx);
+ }
switch (err) {
case DB_SUCCESS:
if (clust_rec == NULL) {
/* The record did not exist in the read view */
ut_ad(prebuilt->select_lock_type == LOCK_NONE
|| dict_index_is_spatial(index));
-
goto next_rec;
}
break;
@@ -5377,6 +5538,13 @@ requires_clust_rec:
}
err = DB_SUCCESS;
break;
+ case DB_LOCK_WAIT_TIMEOUT:
+ case DB_LOCK_WAIT:
+ if (prebuilt->skip_locked) {
+ err = DB_SUCCESS;
+ goto next_rec;
+ }
+ /* fall through */
default:
vrow = NULL;
goto lock_wait_or_error;
@@ -5452,9 +5620,7 @@ use_covering_index:
&& !prebuilt->templ_contains_blob
&& !prebuilt->clust_index_was_generated
&& !prebuilt->used_in_HANDLER
- && prebuilt->template_type != ROW_MYSQL_DUMMY_TEMPLATE
&& !prebuilt->in_fts_query) {
-
/* Inside an update, for example, we do not cache rows,
since we may use the cursor position to do the actual
update, that is why we require ...lock_type == LOCK_NONE.
@@ -5519,29 +5685,8 @@ use_covering_index:
if (prebuilt->n_fetch_cached < MYSQL_FETCH_CACHE_SIZE) {
goto next_rec;
}
-
} else {
- if (UNIV_UNLIKELY
- (prebuilt->template_type == ROW_MYSQL_DUMMY_TEMPLATE)) {
- /* CHECK TABLE: fetch the row */
-
- if (result_rec != rec
- && !prebuilt->need_to_access_clustered) {
- /* We used 'offsets' for the clust
- rec, recalculate them for 'rec' */
- offsets = rec_get_offsets(rec, index, offsets,
- index->n_core_fields,
- ULINT_UNDEFINED,
- &heap);
- result_rec = rec;
- }
-
- memcpy(buf + 4, result_rec
- - rec_offs_extra_size(offsets),
- rec_offs_size(offsets));
- mach_write_to_4(buf,
- rec_offs_extra_size(offsets) + 4);
- } else if (!prebuilt->pk_filter && !prebuilt->idx_cond) {
+ if (!prebuilt->pk_filter && !prebuilt->idx_cond) {
/* The record was not yet converted to MySQL format. */
if (!row_sel_store_mysql_rec(
buf, prebuilt, result_rec, vrow,
@@ -5609,6 +5754,7 @@ next_rec:
== ROW_READ_DID_SEMI_CONSISTENT)) {
prebuilt->row_read_type = ROW_READ_TRY_SEMI_CONSISTENT;
}
+next_rec_after_check:
did_semi_consistent_read = false;
prebuilt->new_rec_locks = 0;
vrow = NULL;
@@ -5632,9 +5778,7 @@ next_rec:
if (spatial_search) {
/* No need to do store restore for R-tree */
- mtr.commit();
- mtr.start();
- mtr_extra_clust_savepoint = 0;
+ mtr.rollback_to_savepoint(0);
} else if (mtr_extra_clust_savepoint) {
/* We must release any clustered index latches
if we are moving to the next non-clustered
@@ -5642,9 +5786,10 @@ next_rec:
order if we would access a different clustered
index page right away without releasing the previous. */
mtr.rollback_to_savepoint(mtr_extra_clust_savepoint);
- mtr_extra_clust_savepoint = 0;
}
+ mtr_extra_clust_savepoint = 0;
+
if (moves_up) {
if (UNIV_UNLIKELY(spatial_search)) {
if (rtr_pcur_move_to_next(
@@ -5652,24 +5797,20 @@ next_rec:
goto rec_loop;
}
} else {
- const buf_block_t* block = btr_pcur_get_block(pcur);
- /* This is based on btr_pcur_move_to_next(),
- but avoids infinite read loop of a corrupted page. */
+ /* This is based on btr_pcur_move_to_next() */
ut_ad(pcur->pos_state == BTR_PCUR_IS_POSITIONED);
ut_ad(pcur->latch_mode != BTR_NO_LATCHES);
- pcur->old_stored = false;
+ pcur->old_rec = nullptr;
if (btr_pcur_is_after_last_on_page(pcur)) {
if (btr_pcur_is_after_last_in_tree(pcur)) {
goto not_moved;
}
- btr_pcur_move_to_next_page(pcur, &mtr);
- if (UNIV_UNLIKELY(btr_pcur_get_block(pcur)
- == block)) {
- err = DB_CORRUPTION;
+ err = btr_pcur_move_to_next_page(pcur, &mtr);
+ if (err != DB_SUCCESS) {
goto lock_wait_or_error;
}
- } else {
- btr_pcur_move_to_next_on_page(pcur);
+ } else if (!btr_pcur_move_to_next_on_page(pcur)) {
+ goto corrupted;
}
goto rec_loop;
@@ -5678,6 +5819,11 @@ next_rec:
if (btr_pcur_move_to_prev(pcur, &mtr)) {
goto rec_loop;
}
+ if (UNIV_UNLIKELY(!btr_pcur_get_rec(pcur))) {
+corrupted:
+ err = DB_CORRUPTION;
+ goto normal_return;
+ }
}
not_moved:
@@ -5705,13 +5851,6 @@ lock_table_wait:
mtr_extra_clust_savepoint = 0;
trx->error_state = err;
-
- /* The following is a patch for MySQL */
-
- if (thr->is_active) {
- que_thr_stop_for_mysql(thr);
- }
-
thr->lock_state = QUE_THR_LOCK_ROW;
if (row_mysql_handle_errors(&err, trx, thr, NULL)) {
@@ -5766,16 +5905,6 @@ lock_table_wait:
goto func_exit;
normal_return:
- /*-------------------------------------------------------------*/
- {
- /* handler_index_cond_check() may pull TR_table search
- which initates another row_search_mvcc(). */
- ut_d(ulint n_active_thrs= trx->lock.n_active_thrs);
- ut_d(trx->lock.n_active_thrs= 1);
- thr->stop_no_error();
- ut_d(trx->lock.n_active_thrs= n_active_thrs - 1);
- }
-
mtr.commit();
DEBUG_SYNC_C("row_search_for_mysql_before_return");
@@ -5830,8 +5959,6 @@ func_exit:
}
}
- ut_ad(!sync_check_iterate(sync_check()));
-
DEBUG_SYNC_C("innodb_row_search_for_mysql_exit");
DBUG_RETURN(err);
@@ -5914,18 +6041,11 @@ row_count_rtree_recs(
prebuilt->mysql_row_len);
buf = static_cast<byte*>(ut_malloc_nokey(bufsize));
- ulint cnt = 1000;
+ ulint direction = 0;
- ret = row_search_for_mysql(buf, PAGE_CUR_WITHIN, prebuilt, 0, 0);
loop:
- /* Check thd->killed every 1,000 scanned rows */
- if (--cnt == 0) {
- if (trx_is_interrupted(prebuilt->trx)) {
- ret = DB_INTERRUPTED;
- goto func_exit;
- }
- cnt = 1000;
- }
+ ret = row_search_mvcc(buf, PAGE_CUR_WITHIN, prebuilt, 0, direction);
+ direction = ROW_SEL_NEXT;
switch (ret) {
case DB_SUCCESS:
@@ -5947,12 +6067,774 @@ func_exit:
return(ret);
}
- *n_rows = *n_rows + 1;
+ ++*n_rows;
+ goto loop;
+}
- ret = row_search_for_mysql(
- buf, PAGE_CUR_WITHIN, prebuilt, 0, ROW_SEL_NEXT);
+/** Check if a version of a clustered index record and a secondary
+index record match.
+
+@param prebuilt index and transaction
+@param clust_rec a version of a clustered index record
+@param clust_index clustered index
+@param clust_offsets rec_get_offsets(clust_rec, clust_index)
+@param rec secondary index leaf page record
+@param offsets rec_get_offsets(rec, index)
+@return an error code
+@retval DB_SUCCESS if rec matches clust_rec
+@retval DB_SUCCESS_LOCKED_REC if rec does not match clust_rec
+*/
+static dberr_t row_check_index_match(row_prebuilt_t *prebuilt,
+ const rec_t *clust_rec,
+ const dict_index_t *clust_index,
+ const rec_offs *clust_offsets,
+ const rec_t *rec,
+ const dict_index_t *index,
+ const rec_offs *offsets)
+{
+ ut_ad(index == prebuilt->index);
- goto loop;
+ ib_vcol_row vc(index->has_virtual() ? mem_heap_create(256) : nullptr);
+
+ const uint16_t n= index->n_user_defined_cols;
+
+ for (uint16_t i= 0; i < n; i++)
+ {
+ ulint pos= 0;
+ ulint len, sec_len;
+
+ const dict_field_t &ifield= index->fields[i];
+ const byte *sec_field= rec_get_nth_field(rec, offsets, i, &sec_len);
+ const byte *field;
+
+ if (ifield.col->is_virtual())
+ {
+ /* Virtual column values must be reconstructed from the base columns. */
+ row_ext_t *ext;
+ byte *record= vc.record(prebuilt->trx->mysql_thd, clust_index,
+ &prebuilt->m_mysql_table);
+ const dict_v_col_t *v_col= reinterpret_cast<const dict_v_col_t*>
+ (ifield.col);
+ dtuple_t *row= row_build(ROW_COPY_POINTERS,
+ clust_index, clust_rec, clust_offsets,
+ nullptr, nullptr, nullptr, &ext, vc.heap);
+ if (dfield_t *vfield=
+ innobase_get_computed_value(row, v_col, clust_index, &vc.heap,
+ nullptr, nullptr,
+ prebuilt->trx->mysql_thd,
+ prebuilt->m_mysql_table,
+ record, nullptr, nullptr))
+ {
+ len= vfield->len;
+ field= static_cast<byte*>(vfield->data);
+ }
+ else
+ {
+ innobase_report_computed_value_failed(row);
+ return DB_COMPUTE_VALUE_FAILED;
+ }
+ }
+ else
+ {
+ pos= dict_col_get_clust_pos(ifield.col, clust_index);
+ field= rec_get_nth_cfield(clust_rec, clust_index, clust_offsets, pos,
+ &len);
+ if (len == UNIV_SQL_NULL)
+ {
+ if (sec_len == UNIV_SQL_NULL)
+ continue;
+ return DB_SUCCESS_LOCKED_REC;
+ }
+ if (sec_len == UNIV_SQL_NULL)
+ return DB_SUCCESS_LOCKED_REC;
+
+ if (rec_offs_nth_extern(clust_offsets, pos))
+ {
+ if (len == BTR_EXTERN_FIELD_REF_SIZE)
+ goto compare_blobs;
+ len-= BTR_EXTERN_FIELD_REF_SIZE;
+ }
+
+ if (ifield.prefix_len)
+ {
+ len=
+ dtype_get_at_most_n_mbchars(ifield.col->prtype, ifield.col->mbminlen,
+ ifield.col->mbmaxlen,
+ ifield.prefix_len, len,
+ reinterpret_cast<const char*>(field));
+ if (len < sec_len)
+ goto check_for_blob;
+ }
+ else
+ {
+check_for_blob:
+ if (rec_offs_nth_extern(clust_offsets, pos))
+ {
+compare_blobs:
+ if (!row_sel_sec_rec_is_for_blob(ifield.col->mtype,
+ ifield.col->prtype,
+ ifield.col->mbminlen,
+ ifield.col->mbmaxlen,
+ field, len, sec_field, sec_len,
+ ifield.prefix_len,
+ clust_index->table))
+ return DB_SUCCESS_LOCKED_REC;
+ continue;
+ }
+ }
+ }
+
+ if (cmp_data_data(ifield.col->mtype, ifield.col->prtype,
+ field, len, sec_field, sec_len))
+ return DB_SUCCESS_LOCKED_REC;
+ }
+
+ return DB_SUCCESS;
+}
+
+/**
+Check the index records in CHECK TABLE.
+The index must contain entries in an ascending order,
+unique constraint must not be violated by duplicated keys,
+and the number of index entries is counted in according to the
+current read view.
+
+@param prebuilt index and transaction
+@param n_rows number of records counted
+
+@return error code
+@retval DB_SUCCESS if no error was found */
+dberr_t row_check_index(row_prebuilt_t *prebuilt, ulint *n_rows)
+{
+ rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
+ rec_offs_init(offsets_);
+
+ *n_rows= 0;
+ dict_index_t *const index= prebuilt->index;
+
+ if (!index->is_btree())
+ return DB_CORRUPTION;
+
+ mem_heap_t *heap= mem_heap_create(100);
+
+ dtuple_t *prev_entry= nullptr;
+ mtr_t mtr;
+ mtr.start();
+
+ dict_index_t *clust_index= dict_table_get_first_index(prebuilt->table);
+ prebuilt->clust_pcur->btr_cur.page_cur.index = clust_index;
+ dberr_t err= prebuilt->pcur->open_leaf(true, index, BTR_SEARCH_LEAF, &mtr);
+ if (UNIV_UNLIKELY(err != DB_SUCCESS))
+ {
+func_exit:
+ mtr.commit();
+ mem_heap_free(heap);
+ return err;
+ }
+
+ if (const trx_id_t bulk_trx_id= index->table->bulk_trx_id)
+ if (!prebuilt->trx->read_view.changes_visible(bulk_trx_id))
+ goto func_exit;
+
+ ReadView check_table_extended_view;
+ ReadView &view=
+ prebuilt->need_to_access_clustered &&
+ !prebuilt->table->is_temporary() &&
+ prebuilt->trx->isolation_level != TRX_ISO_READ_UNCOMMITTED
+ ? check_table_extended_view : prebuilt->trx->read_view;
+ if (&view == &check_table_extended_view)
+ check_table_extended_view.set_creator_trx_id(prebuilt->trx->id);
+
+page_loop:
+ if (&view == &check_table_extended_view)
+ /* In CHECK TABLE...EXTENDED, we make a copy of purge_sys.end_view
+ while holding a shared latch on the index leaf page.
+ Should a currently active purge batch desire to remove any further
+ records from this page, it would be blocked by our page latch.
+
+ We will consult check_table_extended_view to determine if a
+ clustered index record corresponding to a secondary index record
+ is visible to the current purge batch. Right after we have made our
+ copy, purge_sys.end_view is free to be changed again.
+
+ If we have an orphan secondary index record, we may attempt to
+ request a clustered index record version that cannot be retrieved
+ any more because the undo log records may have been freed
+ (according to the purge_sys.end_view). In such a case,
+ trx_undo_get_undo_rec() would cause
+ trx_undo_prev_version_build() and trx_undo_prev_version_build()
+ to return DB_MISSING_HISTORY. */
+ static_cast<ReadViewBase&>(check_table_extended_view)=
+ purge_sys_t::end_view_guard{}.view();
+
+rec_loop:
+ ut_ad(err == DB_SUCCESS);
+
+ if (!btr_pcur_move_to_next_on_page(prebuilt->pcur))
+ {
+ err= DB_CORRUPTION;
+ goto func_exit;
+ }
+
+ const rec_t *rec= btr_pcur_get_rec(prebuilt->pcur);
+ rec_offs *offsets= offsets_;
+
+ if (page_rec_is_supremum(rec))
+ {
+ next_page:
+ if (btr_pcur_is_after_last_in_tree(prebuilt->pcur))
+ goto func_exit;
+ err= btr_pcur_move_to_next_page(prebuilt->pcur, &mtr);
+ if (err == DB_SUCCESS && trx_is_interrupted(prebuilt->trx))
+ err= DB_INTERRUPTED;
+ if (UNIV_UNLIKELY(err != DB_SUCCESS))
+ goto func_exit;
+ goto page_loop;
+ }
+
+ offsets= rec_get_offsets(rec, index, offsets, index->n_core_fields,
+ ULINT_UNDEFINED, &heap);
+
+ const auto info_bits=
+ rec_get_info_bits(rec, prebuilt->table->not_redundant());
+ const bool rec_deleted= info_bits & REC_INFO_DELETED_FLAG;
+
+ if (UNIV_UNLIKELY(info_bits & REC_INFO_MIN_REC_FLAG))
+ {
+ if (*n_rows || !index->is_instant())
+ {
+ push_warning_printf(prebuilt->trx->mysql_thd,
+ Sql_condition::WARN_LEVEL_WARN, ER_NOT_KEYFILE,
+ "InnoDB: invalid record encountered");
+ prebuilt->autoinc_error= DB_INDEX_CORRUPT;
+ }
+ goto next_rec;
+ }
+
+ if (prebuilt->table->is_temporary())
+ {
+ count_or_not:
+ if (rec_deleted)
+ goto next_rec;
+ }
+ else if (index->is_clust())
+ {
+ if (prebuilt->trx->isolation_level == TRX_ISO_READ_UNCOMMITTED)
+ goto count_or_not;
+
+ trx_id_t rec_trx_id= row_get_rec_trx_id(rec, index, offsets);
+
+ if (rec_trx_id >= prebuilt->trx->read_view.low_limit_id() &&
+ UNIV_UNLIKELY(rec_trx_id >= trx_sys.get_max_trx_id()))
+ {
+ invalid_trx_id:
+ if (prebuilt->autoinc_error == DB_SUCCESS)
+ push_warning_printf(prebuilt->trx->mysql_thd,
+ Sql_condition::WARN_LEVEL_WARN,
+ ER_NOT_KEYFILE,
+ "InnoDB: DB_TRX_ID=" TRX_ID_FMT
+ " exceeds the system-wide maximum",
+ rec_trx_id);
+ prebuilt->autoinc_error= DB_CORRUPTION;
+ goto next_rec;
+ }
+
+ if (!prebuilt->trx->read_view.changes_visible(rec_trx_id))
+ {
+ ut_ad(srv_force_recovery < SRV_FORCE_NO_UNDO_LOG_SCAN);
+ rec_t *old_vers;
+ /* The following call returns 'offsets' associated with 'old_vers' */
+ err= row_sel_build_prev_vers_for_mysql(prebuilt, index, rec, &offsets,
+ &heap, &old_vers, nullptr, &mtr);
+
+ if (err != DB_SUCCESS)
+ goto func_exit;
+
+ if (old_vers)
+ {
+ rec= old_vers;
+ rec_trx_id= row_get_rec_trx_id(rec, index, offsets);
+
+ if (rec_trx_id >= prebuilt->trx->read_view.low_limit_id() &&
+ UNIV_UNLIKELY(rec_trx_id >= trx_sys.get_max_trx_id()))
+ goto invalid_trx_id;
+
+ if (!rec_get_deleted_flag(rec, prebuilt->table->not_redundant()))
+ goto count_row;
+ }
+ else
+ offsets= rec_get_offsets(rec, index, offsets, index->n_core_fields,
+ ULINT_UNDEFINED, &heap);
+ goto next_rec;
+ }
+ else if (!rec_deleted && !rec_trx_id);
+ else if (!check_table_extended_view.changes_visible(rec_trx_id));
+ else if (prebuilt->autoinc_error == DB_SUCCESS)
+ {
+ const char *msg= rec_deleted
+ ? "Unpurged clustered index record"
+ : "Clustered index record with stale history";
+
+ ib::warn w;
+ w << msg << " in table " << index->table->name << ": "
+ << rec_offsets_print(rec, offsets);
+ prebuilt->autoinc_error= DB_MISSING_HISTORY;
+ push_warning_printf(prebuilt->trx->mysql_thd,
+ Sql_condition::WARN_LEVEL_WARN,
+ ER_NOT_KEYFILE, "InnoDB: %s", w.m_oss.str().c_str());
+ }
+
+ goto count_or_not;
+ }
+ else if (const trx_id_t page_trx_id= page_get_max_trx_id(page_align(rec)))
+ {
+ if (page_trx_id >= trx_sys.get_max_trx_id())
+ goto invalid_PAGE_MAX_TRX_ID;
+ if (prebuilt->trx->isolation_level == TRX_ISO_READ_UNCOMMITTED);
+ else if (&view == &check_table_extended_view || rec_deleted ||
+ !view.sees(page_trx_id))
+ {
+ bool got_extended_match= &view == &check_table_extended_view;
+ const auto savepoint= mtr.get_savepoint();
+
+ row_build_row_ref_in_tuple(prebuilt->clust_ref, rec, index, offsets);
+ err= btr_pcur_open_with_no_init(prebuilt->clust_ref,
+ PAGE_CUR_LE, BTR_SEARCH_LEAF,
+ prebuilt->clust_pcur, &mtr);
+ if (err != DB_SUCCESS)
+ goto func_exit;
+
+ const rec_t *clust_rec= btr_pcur_get_rec(prebuilt->clust_pcur);
+
+ /* Note: only if the search ends up on a non-infimum record is the
+ low_match value the real match to the search tuple */
+
+ if (!page_rec_is_user_rec(clust_rec) ||
+ btr_pcur_get_low_match(prebuilt->clust_pcur) < clust_index->n_uniq)
+ {
+ if (!rec_deleted)
+ {
+ not_found:
+ /* MDEV-29823 FIXME: There is a race condition between
+ rollback, purge, and possibly other SQL connections that
+ are creating and releasing read views. At the time
+ row_undo_mod_del_mark_or_remove_sec_low() is executing
+ rollback on a secondary index record, purge_sys.view
+ may not allow it to delete the record, and it will be
+ delete-marked. Eventually purge_sys.view would advance,
+ but the delete-marked record could never be removed,
+ because no undo log record was ever added to
+ the purge queue by trx_purge_add_undo_to_history().
+
+ For now, we will not flag an error about orphan secondary index
+ records that are delete-marked; we will only warn about them. */
+
+ if (!rec_deleted || prebuilt->autoinc_error == DB_SUCCESS)
+ {
+ ib::error_or_warn w(!rec_deleted);
+ w << "Clustered index record not found for index "
+ << index->name << " of table " << index->table->name
+ << ": " << rec_offsets_print(rec, offsets);
+ push_warning_printf(prebuilt->trx->mysql_thd,
+ Sql_condition::WARN_LEVEL_WARN,
+ ER_NOT_KEYFILE, "InnoDB: %s",
+ w.m_oss.str().c_str());
+ }
+
+ if (prebuilt->autoinc_error == DB_SUCCESS)
+ prebuilt->autoinc_error= rec_deleted
+ ? DB_MISSING_HISTORY
+ : DB_CORRUPTION;
+ }
+ else if (&view == &check_table_extended_view)
+ extended_not_found:
+ if (view.changes_visible(page_trx_id))
+ goto not_found;
+ did_not_find:
+ mtr.rollback_to_savepoint(savepoint);
+ goto next_rec;
+ }
+
+ rec_offs *clust_offsets;
+ trx_id_t rec_trx_id;
+ rec_t *old_vers= nullptr;
+
+ bool found_in_view= false;
+ trx_id_t visible_trx_id= ~0ULL;
+
+ if (ulint trx_id_offset= clust_index->trx_id_offset)
+ {
+ clust_offsets= nullptr;
+ read_trx_id:
+ rec_trx_id= trx_read_trx_id(clust_rec + trx_id_offset);
+
+ if (clust_rec[trx_id_offset + DATA_TRX_ID_LEN] & 0x80)
+ {
+ if (UNIV_UNLIKELY
+ (rec_get_deleted_flag(clust_rec,
+ prebuilt->table->not_redundant())))
+ {
+ err= DB_CORRUPTION;
+ goto func_exit;
+ }
+
+ /* This is the oldest available record version (fresh insert). */
+ if (!view.changes_visible(rec_trx_id))
+ {
+ if (rec_trx_id >= view.low_limit_id() &&
+ UNIV_UNLIKELY(rec_trx_id >= trx_sys.get_max_trx_id()))
+ goto invalid_rec_trx_id;
+ if (got_extended_match)
+ goto check_latest_version;
+ goto did_not_find;
+ }
+ }
+ }
+ else
+ {
+ clust_offsets= rec_get_offsets(clust_rec, clust_index, nullptr,
+ clust_index->n_core_fields,
+ ULINT_UNDEFINED, &heap);
+ ulint trx_id_pos= clust_index->n_uniq ? clust_index->n_uniq : 1;
+ ulint len;
+ trx_id_offset= rec_get_nth_field_offs(clust_offsets, trx_id_pos, &len);
+ ut_ad(len == DATA_TRX_ID_LEN);
+ goto read_trx_id;
+ }
+
+ if (got_extended_match)
+ {
+ check_latest_version:
+ /* In CHECK TABLE...EXTENDED, always check if the secondary
+ index record matches the latest clustered index record
+ version, no matter if it is visible in our own read view.
+
+ If the latest clustered index version is delete-marked and
+ purgeable, it is not safe to fetch any BLOBs for column prefix
+ indexes because they may already have been freed. */
+ if (rec_trx_id &&
+ rec_get_deleted_flag(clust_rec,
+ prebuilt->table->not_redundant()) &&
+ purge_sys.is_purgeable(rec_trx_id))
+ goto did_not_find;
+
+ if (!clust_offsets)
+ clust_offsets= rec_get_offsets(clust_rec, clust_index, nullptr,
+ clust_index->n_core_fields,
+ ULINT_UNDEFINED, &heap);
+ err= row_check_index_match(prebuilt,
+ clust_rec, clust_index, clust_offsets,
+ rec, index, offsets);
+
+ switch (err) {
+ default:
+ goto func_exit;
+ case DB_SUCCESS_LOCKED_REC:
+ case DB_SUCCESS:
+ break;
+ }
+
+ got_extended_match= err == DB_SUCCESS;
+ err= DB_SUCCESS;
+
+ if (!prebuilt->trx->read_view.changes_visible(rec_trx_id))
+ /* While CHECK TABLE ... EXTENDED checks for a matching
+ clustered index record version for each secondary index
+ record, it must count only those records that belong to its
+ own read view.
+
+ If the latest version of clust_rec matches rec but is not
+ in our read view, there may still be an older version of
+ clust_rec that not only matches rec but is in our view.
+ We must evaluate old versions before deciding whether rec
+ should be counted. */
+ goto check_old_vers;
+
+ /* Remember that this is the visible clust_rec for rec,
+ and whether it matches rec. */
+ visible_trx_id= rec_trx_id;
+ found_in_view= got_extended_match &&
+ !rec_get_deleted_flag(clust_rec,
+ prebuilt->table->not_redundant());
+
+ if (!got_extended_match)
+ goto check_old_vers;
+
+ if (!found_in_view)
+ goto did_not_find;
+
+ found_match:
+ mtr.rollback_to_savepoint(savepoint);
+ goto count_row;
+ }
+ else if (!view.changes_visible(rec_trx_id))
+ {
+ check_old_vers:
+ if (rec_trx_id >= view.low_limit_id() &&
+ UNIV_UNLIKELY(rec_trx_id >= trx_sys.get_max_trx_id()))
+ {
+ invalid_rec_trx_id:
+ if (prebuilt->autoinc_error == DB_SUCCESS)
+ push_warning_printf(prebuilt->trx->mysql_thd,
+ Sql_condition::WARN_LEVEL_WARN,
+ ER_NOT_KEYFILE,
+ "InnoDB: DB_TRX_ID=" TRX_ID_FMT
+ " exceeds the system-wide maximum",
+ rec_trx_id);
+ goto not_found;
+ }
+
+ if (!clust_offsets)
+ clust_offsets= rec_get_offsets(clust_rec, clust_index, nullptr,
+ clust_index->n_core_fields,
+ ULINT_UNDEFINED, &heap);
+
+ row_sel_reset_old_vers_heap(prebuilt);
+ /* The following is adapted from row_vers_build_for_consistent_read()
+ because when using check_table_extended_view, we must
+ consider every available version of the clustered index record. */
+ mem_heap_t *vers_heap= nullptr;
+
+ for (;;)
+ {
+ mem_heap_t *prev_heap= vers_heap;
+ vers_heap= mem_heap_create(1024);
+ err= trx_undo_prev_version_build(clust_rec,
+ clust_index, clust_offsets,
+ vers_heap, &old_vers,
+ nullptr, nullptr, 0);
+ if (prev_heap)
+ mem_heap_free(prev_heap);
+ if (err != DB_SUCCESS)
+ {
+ old_vers_err:
+ mem_heap_free(vers_heap);
+ if (err == DB_MISSING_HISTORY)
+ {
+ err= DB_SUCCESS;
+ if (got_extended_match)
+ goto did_not_find;
+ goto not_found;
+ }
+ goto func_exit;
+ }
+
+ if (UNIV_UNLIKELY(!old_vers))
+ {
+ mem_heap_free(vers_heap);
+ /* We did not find a matching clustered index record version
+ for the secondary index record. Normal CHECK TABLE will simply
+ not count the secondary index record; CHECK TABLE ... EXTENDED
+ will flag such orphan records if appropriate.
+
+ A secondary index record may may be "temporarily orphan"
+ if purge is in progress. We will only flag them if
+ everything up to PAGE_MAX_TRX_ID has been fully purged.
+
+ "Temporary orphans" may be produced when
+ row_undo_mod_clust() resets the DB_TRX_ID of the latest
+ clust_rec version or when trx_undo_prev_version_build()
+ encounters a BLOB that may have been freed according to
+ purge_sys.view (not purge_sys.end_view). */
+ if (&view == &check_table_extended_view && !got_extended_match)
+ goto extended_not_found;
+ goto did_not_find;
+ }
+
+ clust_rec= old_vers;
+ clust_offsets= rec_get_offsets(clust_rec, clust_index, clust_offsets,
+ clust_index->n_core_fields,
+ ULINT_UNDEFINED, &heap);
+
+ rec_trx_id= row_get_rec_trx_id(clust_rec, clust_index,
+ clust_offsets);
+
+ if (UNIV_UNLIKELY(rec_trx_id >=
+ prebuilt->trx->read_view.low_limit_id() &&
+ rec_trx_id >= trx_sys.get_max_trx_id()))
+ {
+ mem_heap_free(vers_heap);
+ goto invalid_rec_trx_id;
+ }
+
+ const bool rec_visible=
+ prebuilt->trx->read_view.changes_visible(rec_trx_id);
+ const bool clust_rec_deleted=
+ rec_get_deleted_flag(clust_rec, prebuilt->table->not_redundant());
+
+ if (&view != &prebuilt->trx->read_view)
+ {
+ /* It is not safe to fetch BLOBs of committed delete-marked
+ records that may have been freed in purge. */
+ err= clust_rec_deleted && rec_trx_id &&
+ purge_sys.is_purgeable(rec_trx_id)
+ ? DB_SUCCESS_LOCKED_REC
+ : row_check_index_match(prebuilt,
+ clust_rec, clust_index, clust_offsets,
+ rec, index, offsets);
+
+ switch (err) {
+ default:
+ goto old_vers_err;
+ case DB_SUCCESS_LOCKED_REC:
+ if (rec_visible && !~visible_trx_id)
+ visible_trx_id= rec_trx_id;
+ continue;
+ case DB_SUCCESS:
+ got_extended_match= true;
+ if (!rec_visible)
+ continue;
+ if (!~visible_trx_id)
+ {
+ visible_trx_id= rec_trx_id;
+ found_in_view= !clust_rec_deleted;
+ }
+ mem_heap_free(vers_heap);
+ if (!found_in_view)
+ goto did_not_find;
+ goto found_match;
+ }
+ }
+ else if (rec_visible)
+ {
+ if (!clust_rec_deleted)
+ {
+ clust_rec= rec_copy(mem_heap_alloc(heap,
+ rec_offs_size(clust_offsets)),
+ clust_rec, clust_offsets);
+ rec_offs_make_valid(clust_rec, clust_index, true, clust_offsets);
+ }
+ mem_heap_free(vers_heap);
+ if (clust_rec_deleted)
+ goto did_not_find;
+ goto check_match;
+ }
+ }
+ }
+ else if (rec_get_deleted_flag(clust_rec,
+ prebuilt->table->not_redundant()))
+ goto did_not_find;
+
+ ut_ad(clust_rec);
+ ut_ad(&view != &check_table_extended_view);
+
+ /* If we had to go to an earlier version of row or the secondary
+ index record is delete marked, then it may be that the secondary
+ index record corresponding to clust_rec (or old_vers) is not
+ rec; in that case we must ignore such row because in our
+ snapshot rec would not have existed. Remember that from rec we
+ cannot see directly which transaction id corresponds to it: we
+ have to go to the clustered index record. A query where we want
+ to fetch all rows where the secondary index value is in some
+ interval would return a wrong result if we would not drop rows
+ which we come to visit through secondary index records that
+ would not really exist in our snapshot. */
+
+ if (rec_deleted)
+ {
+ if (!clust_offsets)
+ clust_offsets= rec_get_offsets(clust_rec, clust_index, nullptr,
+ clust_index->n_core_fields,
+ ULINT_UNDEFINED, &heap);
+ check_match:
+ /* This clustered index record version exists in
+ prebuilt->trx->read_view and is not delete-marked.
+ By design, any BLOBs in it are not allowed to be
+ freed in the purge of committed transaction history. */
+ err= row_check_index_match(prebuilt, clust_rec, clust_index,
+ clust_offsets, rec, index, offsets);
+ switch (err) {
+ case DB_SUCCESS:
+ break;
+ case DB_SUCCESS_LOCKED_REC:
+ err= DB_SUCCESS;
+ goto did_not_find;
+ default:
+ goto func_exit;
+ }
+ }
+
+ mtr.rollback_to_savepoint(savepoint);
+ }
+ }
+ else
+ {
+ invalid_PAGE_MAX_TRX_ID:
+ if (UNIV_LIKELY(srv_force_recovery < SRV_FORCE_NO_UNDO_LOG_SCAN))
+ {
+ push_warning_printf(prebuilt->trx->mysql_thd,
+ Sql_condition::WARN_LEVEL_WARN, ER_NOT_KEYFILE,
+ "InnoDB: Invalid PAGE_MAX_TRX_ID=%llu"
+ " in index '%-.200s'",
+ page_trx_id, index->name());
+ prebuilt->autoinc_error= DB_INDEX_CORRUPT;
+ }
+ goto next_rec;
+ }
+
+count_row:
+ ++*n_rows;
+
+ if (prev_entry)
+ {
+ ulint matched_fields= 0;
+ int cmp= cmp_dtuple_rec_with_match(prev_entry, rec, offsets,
+ &matched_fields);
+ const char* msg;
+
+ if (UNIV_LIKELY(cmp < 0));
+ else if (cmp > 0)
+ {
+ prebuilt->autoinc_error= DB_INDEX_CORRUPT;
+ msg= "index records in a wrong order in ";
+not_ok:
+ ib::error() << msg << index->name << " of table " << index->table->name
+ << ": " << *prev_entry << ", "
+ << rec_offsets_print(rec, offsets);
+ }
+ else if (index->is_unique() && matched_fields >=
+ dict_index_get_n_ordering_defined_by_user(index))
+ {
+ /* NULL values in unique indexes are considered not to be duplicates */
+ for (ulint i= 0; i < dict_index_get_n_ordering_defined_by_user(index);
+ i++)
+ if (dfield_is_null(dtuple_get_nth_field(prev_entry, i)))
+ goto next_rec;
+
+ if (prebuilt->autoinc_error == DB_SUCCESS)
+ prebuilt->autoinc_error= DB_DUPLICATE_KEY;
+ msg= "duplicate key in ";
+ goto not_ok;
+ }
+ }
+
+next_rec:
+ ut_ad(err == DB_SUCCESS);
+
+ {
+ mem_heap_t *tmp_heap= nullptr;
+
+ /* Empty the heap on each round. But preserve offsets[]
+ for the row_rec_to_index_entry() call, by copying them
+ into a separate memory heap when needed. */
+ if (UNIV_UNLIKELY(offsets != offsets_))
+ {
+ ulint size= rec_offs_get_n_alloc(offsets) * sizeof *offsets;
+ tmp_heap= mem_heap_create(size);
+ offsets= static_cast<rec_offs*>(mem_heap_dup(tmp_heap, offsets, size));
+ }
+
+ mem_heap_empty(heap);
+ prev_entry= row_rec_to_index_entry(rec, index, offsets, heap);
+
+ if (UNIV_LIKELY_NULL(tmp_heap))
+ mem_heap_free(tmp_heap);
+ }
+
+ if (btr_pcur_is_after_last_on_page(prebuilt->pcur))
+ goto next_page;
+
+ goto rec_loop;
}
/*******************************************************************//**
@@ -6014,8 +6896,9 @@ row_search_get_max_rec(
btr_pcur_t pcur;
const rec_t* rec;
/* Open at the high/right end (false), and init cursor */
- btr_pcur_open_at_index_side(
- false, index, BTR_SEARCH_LEAF, &pcur, true, 0, mtr);
+ if (pcur.open_leaf(false, index, BTR_SEARCH_LEAF, mtr) != DB_SUCCESS) {
+ return nullptr;
+ }
do {
const page_t* page;
@@ -6031,8 +6914,6 @@ row_search_get_max_rec(
btr_pcur_move_before_first_on_page(&pcur);
} while (btr_pcur_move_to_prev(&pcur, mtr));
- btr_pcur_close(&pcur);
-
ut_ad(!rec
|| !(rec_get_info_bits(rec, dict_table_is_comp(index->table))
& (REC_INFO_MIN_REC_FLAG | REC_INFO_DELETED_FLAG)));
diff --git a/storage/innobase/row/row0uins.cc b/storage/innobase/row/row0uins.cc
index 82c880a5920..50196e78092 100644
--- a/storage/innobase/row/row0uins.cc
+++ b/storage/innobase/row/row0uins.cc
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 1997, 2017, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2017, 2021, MariaDB Corporation.
+Copyright (c) 2017, 2023, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -35,7 +35,6 @@ Created 2/25/1997 Heikki Tuuri
#include "mach0data.h"
#include "row0undo.h"
#include "row0vers.h"
-#include "row0log.h"
#include "trx0trx.h"
#include "trx0rec.h"
#include "row0row.h"
@@ -44,6 +43,7 @@ Created 2/25/1997 Heikki Tuuri
#include "ibuf0ibuf.h"
#include "log0log.h"
#include "fil0fil.h"
+#include <mysql/service_thd_mdl.h>
/*************************************************************************
IMPORTANT NOTE: Any operation that generates redo MUST check that there
@@ -68,8 +68,18 @@ row_undo_ins_remove_clust_rec(
dberr_t err;
ulint n_tries = 0;
mtr_t mtr;
- dict_index_t* index = node->pcur.btr_cur.index;
- bool online;
+ dict_index_t* index = node->pcur.index();
+ table_id_t table_id = 0;
+ const bool dict_locked = node->trx->dict_operation_lock_mode;
+restart:
+ MDL_ticket* mdl_ticket = nullptr;
+ ut_ad(!table_id || dict_locked
+ || !node->trx->dict_operation_lock_mode);
+ dict_table_t *table = table_id
+ ? dict_table_open_on_id(table_id, dict_locked,
+ DICT_TABLE_OP_OPEN_ONLY_IF_CACHED,
+ node->trx->mysql_thd, &mdl_ticket)
+ : nullptr;
ut_ad(index->is_primary());
ut_ad(node->trx->in_rollback);
@@ -78,21 +88,10 @@ row_undo_ins_remove_clust_rec(
if (index->table->is_temporary()) {
ut_ad(node->rec_type == TRX_UNDO_INSERT_REC);
mtr.set_log_mode(MTR_LOG_NO_REDO);
- ut_ad(!dict_index_is_online_ddl(index));
ut_ad(index->table->id >= DICT_HDR_FIRST_ID);
- online = false;
} else {
index->set_modified(mtr);
ut_ad(lock_table_has_locks(index->table));
- online = dict_index_is_online_ddl(index);
- if (online) {
- ut_ad(node->rec_type == TRX_UNDO_INSERT_REC);
- ut_ad(node->trx->dict_operation_lock_mode
- != RW_X_LATCH);
- ut_ad(node->table->id != DICT_INDEXES_ID);
- ut_ad(node->table->id != DICT_COLUMNS_ID);
- mtr_s_lock_index(index, &mtr);
- }
}
/* This is similar to row_undo_mod_clust(). The DDL thread may
@@ -100,13 +99,11 @@ row_undo_ins_remove_clust_rec(
We must log the removal, so that the row will be correctly
purged. However, we can log the removal out of sync with the
B-tree modification. */
- ut_a(btr_pcur_restore_position(
- online ? BTR_MODIFY_LEAF | BTR_ALREADY_S_LATCHED
- : (node->rec_type == TRX_UNDO_INSERT_METADATA)
- ? BTR_MODIFY_TREE
- : BTR_MODIFY_LEAF,
- &node->pcur, &mtr) == btr_pcur_t::SAME_ALL);
-
+ ut_a(node->pcur.restore_position(
+ (node->rec_type == TRX_UNDO_INSERT_METADATA)
+ ? BTR_MODIFY_TREE
+ : BTR_MODIFY_LEAF,
+ &mtr) == btr_pcur_t::SAME_ALL);
rec_t* rec = btr_pcur_get_rec(&node->pcur);
ut_ad(rec_get_trx_id(rec, index) == node->trx->id
@@ -116,55 +113,88 @@ row_undo_ins_remove_clust_rec(
ut_ad(rec_is_metadata(rec, index->table->not_redundant())
== (node->rec_type == TRX_UNDO_INSERT_METADATA));
- if (online && dict_index_is_online_ddl(index)) {
- mem_heap_t* heap = NULL;
- const rec_offs* offsets = rec_get_offsets(
- rec, index, NULL, index->n_core_fields,
- ULINT_UNDEFINED, &heap);
- row_log_table_delete(rec, index, offsets, NULL);
- mem_heap_free(heap);
- } else {
- switch (node->table->id) {
- case DICT_INDEXES_ID:
- ut_ad(!online);
- ut_ad(node->trx->dict_operation_lock_mode
- == RW_X_LATCH);
- ut_ad(node->rec_type == TRX_UNDO_INSERT_REC);
-
- dict_drop_index_tree(&node->pcur, node->trx, &mtr);
- mtr.commit();
-
- mtr.start();
- ut_a(btr_pcur_restore_position(BTR_MODIFY_LEAF,
- &node->pcur, &mtr)== btr_pcur_t::SAME_ALL);
+ switch (node->table->id) {
+ case DICT_COLUMNS_ID:
+ /* This is rolling back an INSERT into SYS_COLUMNS.
+ If it was part of an instant ALTER TABLE operation, we
+ must evict the table definition, so that it can be
+ reloaded after the dictionary operation has been
+ completed. At this point, any corresponding operation
+ to the metadata record will have been rolled back. */
+ ut_ad(node->trx->dict_operation_lock_mode);
+ ut_ad(node->rec_type == TRX_UNDO_INSERT_REC);
+ if (rec_get_n_fields_old(rec)
+ != DICT_NUM_FIELDS__SYS_COLUMNS
+ || (rec_get_1byte_offs_flag(rec)
+ ? rec_1_get_field_end_info(rec, 0) != 8
+ : rec_2_get_field_end_info(rec, 0) != 8)) {
break;
- case DICT_COLUMNS_ID:
- /* This is rolling back an INSERT into SYS_COLUMNS.
- If it was part of an instant ALTER TABLE operation, we
- must evict the table definition, so that it can be
- reloaded after the dictionary operation has been
- completed. At this point, any corresponding operation
- to the metadata record will have been rolled back. */
- ut_ad(!online);
- ut_ad(node->trx->dict_operation_lock_mode
- == RW_X_LATCH);
- ut_ad(node->rec_type == TRX_UNDO_INSERT_REC);
- if (rec_get_n_fields_old(rec)
- != DICT_NUM_FIELDS__SYS_COLUMNS) {
- break;
+ }
+ static_assert(!DICT_FLD__SYS_COLUMNS__TABLE_ID, "");
+ node->trx->evict_table(mach_read_from_8(rec));
+ break;
+ case DICT_INDEXES_ID:
+ ut_ad(node->trx->dict_operation_lock_mode);
+ ut_ad(node->rec_type == TRX_UNDO_INSERT_REC);
+ if (!table_id) {
+ table_id = mach_read_from_8(rec);
+ if (table_id) {
+ mtr.commit();
+ goto restart;
}
- ulint len;
- const byte* data = rec_get_nth_field_old(
- rec, DICT_FLD__SYS_COLUMNS__TABLE_ID, &len);
- if (len != 8) {
- break;
+ ut_ad("corrupted SYS_INDEXES record" == 0);
+ }
+
+ pfs_os_file_t d = OS_FILE_CLOSED;
+
+ if (const uint32_t space_id = dict_drop_index_tree(
+ &node->pcur, node->trx, &mtr)) {
+ if (table) {
+ lock_release_on_rollback(node->trx,
+ table);
+ if (!dict_locked) {
+ dict_sys.lock(SRW_LOCK_CALL);
+ }
+ if (table->release()) {
+ dict_sys.remove(table);
+ } else if (table->space_id
+ == space_id) {
+ table->space = nullptr;
+ table->file_unreadable = true;
+ }
+ if (!dict_locked) {
+ dict_sys.unlock();
+ }
+ table = nullptr;
+ if (!mdl_ticket);
+ else if (MDL_context* mdl_context =
+ static_cast<MDL_context*>(
+ thd_mdl_context(
+ node->trx->
+ mysql_thd))) {
+ mdl_context->release_lock(
+ mdl_ticket);
+ mdl_ticket = nullptr;
+ }
}
- node->trx->evict_table(mach_read_from_8(data));
+
+ d = fil_delete_tablespace(space_id);
+ }
+
+ mtr.commit();
+
+ if (d != OS_FILE_CLOSED) {
+ os_file_close(d);
}
+
+ mtr.start();
+ ut_a(node->pcur.restore_position(
+ BTR_MODIFY_LEAF, &mtr) == btr_pcur_t::SAME_ALL);
}
- if (btr_cur_optimistic_delete(&node->pcur.btr_cur, 0, &mtr)) {
- err = DB_SUCCESS;
+ err = btr_cur_optimistic_delete(&node->pcur.btr_cur, 0, &mtr);
+
+ if (err != DB_FAIL) {
goto func_exit;
}
@@ -177,8 +207,8 @@ retry:
} else {
index->set_modified(mtr);
}
- ut_a(btr_pcur_restore_position(BTR_MODIFY_TREE | BTR_LATCH_FOR_DELETE,
- &node->pcur, &mtr) == btr_pcur_t::SAME_ALL);
+ ut_a(node->pcur.restore_position(BTR_PURGE_TREE, &mtr)
+ == btr_pcur_t::SAME_ALL);
btr_cur_pessimistic_delete(&err, FALSE, &node->pcur.btr_cur, 0, true,
&mtr);
@@ -194,7 +224,7 @@ retry:
n_tries++;
- os_thread_sleep(BTR_CUR_RETRY_SLEEP_TIME);
+ std::this_thread::sleep_for(BTR_CUR_RETRY_SLEEP_TIME);
goto retry;
}
@@ -207,6 +237,12 @@ func_exit:
}
btr_pcur_commit_specify_mtr(&node->pcur, &mtr);
+
+ if (UNIV_LIKELY_NULL(table)) {
+ dict_table_close(table, dict_locked,
+ node->trx->mysql_thd, mdl_ticket);
+ }
+
return(err);
}
@@ -217,7 +253,7 @@ static MY_ATTRIBUTE((nonnull, warn_unused_result))
dberr_t
row_undo_ins_remove_sec_low(
/*========================*/
- ulint mode, /*!< in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE,
+ btr_latch_mode mode, /*!< in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE,
depending on whether we wish optimistic or
pessimistic descent down the index tree */
dict_index_t* index, /*!< in: index */
@@ -229,29 +265,38 @@ row_undo_ins_remove_sec_low(
mtr_t mtr;
const bool modify_leaf = mode == BTR_MODIFY_LEAF;
+ pcur.btr_cur.page_cur.index = index;
row_mtr_start(&mtr, index, !modify_leaf);
- if (modify_leaf) {
- mode = BTR_MODIFY_LEAF | BTR_ALREADY_S_LATCHED;
- mtr_s_lock_index(index, &mtr);
- } else {
- ut_ad(mode == (BTR_MODIFY_TREE | BTR_LATCH_FOR_DELETE));
- mtr_sx_lock_index(index, &mtr);
- }
-
- if (row_log_online_op_try(index, entry, 0)) {
- goto func_exit_no_pcur;
- }
+ if (index->is_spatial()) {
+ mode = modify_leaf
+ ? btr_latch_mode(BTR_MODIFY_LEAF
+ | BTR_RTREE_DELETE_MARK
+ | BTR_RTREE_UNDO_INS)
+ : btr_latch_mode(BTR_PURGE_TREE | BTR_RTREE_UNDO_INS);
+ btr_pcur_get_btr_cur(&pcur)->thr = thr;
+ if (rtr_search(entry, mode, &pcur, &mtr)) {
+ goto func_exit;
+ }
- if (dict_index_is_spatial(index)) {
- if (modify_leaf) {
- mode |= BTR_RTREE_DELETE_MARK;
+ if (rec_get_deleted_flag(
+ btr_pcur_get_rec(&pcur),
+ dict_table_is_comp(index->table))) {
+ ib::error() << "Record found in index " << index->name
+ << " is deleted marked on insert rollback.";
+ ut_ad(0);
}
- btr_pcur_get_btr_cur(&pcur)->thr = thr;
- mode |= BTR_RTREE_UNDO_INS;
+ goto found;
+ } else if (modify_leaf) {
+ mode = BTR_MODIFY_LEAF_ALREADY_LATCHED;
+ mtr_s_lock_index(index, &mtr);
+ } else {
+ ut_ad(mode == BTR_PURGE_TREE);
+ mode = BTR_PURGE_TREE_ALREADY_LATCHED;
+ mtr_x_lock_index(index, &mtr);
}
- switch (row_search_index_entry(index, entry, mode, &pcur, &mtr)) {
+ switch (row_search_index_entry(entry, mode, &pcur, &mtr)) {
case ROW_BUFFERED:
case ROW_NOT_DELETED_REF:
/* These are invalid outcomes, because the mode passed
@@ -261,20 +306,11 @@ row_undo_ins_remove_sec_low(
case ROW_NOT_FOUND:
break;
case ROW_FOUND:
- if (dict_index_is_spatial(index)
- && rec_get_deleted_flag(
- btr_pcur_get_rec(&pcur),
- dict_table_is_comp(index->table))) {
- ib::error() << "Record found in index " << index->name
- << " is deleted marked on insert rollback.";
- ut_ad(0);
- }
-
+ found:
btr_cur_t* btr_cur = btr_pcur_get_btr_cur(&pcur);
if (modify_leaf) {
- err = btr_cur_optimistic_delete(btr_cur, 0, &mtr)
- ? DB_SUCCESS : DB_FAIL;
+ err = btr_cur_optimistic_delete(btr_cur, 0, &mtr);
} else {
/* Passing rollback=false here, because we are
deleting a secondary index record: the distinction
@@ -285,8 +321,8 @@ row_undo_ins_remove_sec_low(
}
}
+func_exit:
btr_pcur_close(&pcur);
-func_exit_no_pcur:
mtr_commit(&mtr);
return(err);
@@ -318,9 +354,7 @@ row_undo_ins_remove_sec(
/* Try then pessimistic descent to the B-tree */
retry:
- err = row_undo_ins_remove_sec_low(
- BTR_MODIFY_TREE | BTR_LATCH_FOR_DELETE,
- index, entry, thr);
+ err = row_undo_ins_remove_sec_low(BTR_PURGE_TREE, index, entry, thr);
/* The delete operation may fail if we have little
file space left: TODO: easiest to crash the database
@@ -330,7 +364,7 @@ retry:
n_tries++;
- os_thread_sleep(BTR_CUR_RETRY_SLEEP_TIME);
+ std::this_thread::sleep_for(BTR_CUR_RETRY_SLEEP_TIME);
goto retry;
}
@@ -344,7 +378,7 @@ retry:
static bool row_undo_ins_parse_undo_rec(undo_node_t* node, bool dict_locked)
{
dict_index_t* clust_index;
- byte* ptr;
+ const byte* ptr;
undo_no_t undo_no;
table_id_t table_id;
ulint dummy;
@@ -363,11 +397,11 @@ static bool row_undo_ins_parse_undo_rec(undo_node_t* node, bool dict_locked)
node->table = dict_table_open_on_id(table_id, dict_locked,
DICT_TABLE_OP_NORMAL);
} else if (!dict_locked) {
- mutex_enter(&dict_sys.mutex);
- node->table = dict_sys.get_temporary_table(table_id);
- mutex_exit(&dict_sys.mutex);
+ dict_sys.freeze(SRW_LOCK_CALL);
+ node->table = dict_sys.acquire_temporary_table(table_id);
+ dict_sys.unfreeze();
} else {
- node->table = dict_sys.get_temporary_table(table_id);
+ node->table = dict_sys.acquire_temporary_table(table_id);
}
if (!node->table) {
@@ -380,19 +414,26 @@ static bool row_undo_ins_parse_undo_rec(undo_node_t* node, bool dict_locked)
goto close_table;
case TRX_UNDO_INSERT_METADATA:
case TRX_UNDO_INSERT_REC:
+ case TRX_UNDO_EMPTY:
break;
case TRX_UNDO_RENAME_TABLE:
dict_table_t* table = node->table;
ut_ad(!table->is_temporary());
- ut_ad(dict_table_is_file_per_table(table)
+ ut_ad(table->file_unreadable
+ || dict_table_is_file_per_table(table)
== !is_system_tablespace(table->space_id));
size_t len = mach_read_from_2(node->undo_rec)
+ size_t(node->undo_rec - ptr) - 2;
- ptr[len] = 0;
- const char* name = reinterpret_cast<char*>(ptr);
- if (strcmp(table->name.m_name, name)) {
- dict_table_rename_in_cache(table, name, false,
- table_id != 0);
+ const span<const char> name(reinterpret_cast<const char*>(ptr),
+ len);
+ if (strlen(table->name.m_name) != len
+ || memcmp(table->name.m_name, ptr, len)) {
+ dict_table_rename_in_cache(table, name, true);
+ } else if (table->space && table->space->id) {
+ const auto s = table->space->name();
+ if (len != s.size() || memcmp(ptr, s.data(), len)) {
+ table->rename_tablespace(name, true);
+ }
}
goto close_table;
}
@@ -408,7 +449,7 @@ close_table:
would probably be better to just drop all temporary
tables (and temporary undo log records) of the current
connection, instead of doing this rollback. */
- dict_table_close(node->table, dict_locked, FALSE);
+ dict_table_close(node->table, dict_locked);
node->table = NULL;
return false;
} else {
@@ -416,11 +457,16 @@ close_table:
clust_index = dict_table_get_first_index(node->table);
if (clust_index != NULL) {
- if (node->rec_type == TRX_UNDO_INSERT_REC) {
+ switch (node->rec_type) {
+ case TRX_UNDO_INSERT_REC:
ptr = trx_undo_rec_get_row_ref(
ptr, clust_index, &node->ref,
node->heap);
- } else {
+ break;
+ case TRX_UNDO_EMPTY:
+ node->ref = nullptr;
+ return true;
+ default:
node->ref = &trx_undo_metadata;
if (!row_undo_search_clust_to_pcur(node)) {
/* An error probably occurred during
@@ -464,16 +510,15 @@ row_undo_ins_remove_sec_rec(
que_thr_t* thr) /*!< in: query thread */
{
dberr_t err = DB_SUCCESS;
- dict_index_t* index = node->index;
+ dict_index_t* index;
mem_heap_t* heap;
heap = mem_heap_create(1024);
- while (index != NULL) {
- dtuple_t* entry;
-
- if (index->type & DICT_FTS) {
- dict_table_next_uncorrupted_index(index);
+ for (index = node->index; index;
+ index = dict_table_get_next_index(index)) {
+ if (index->type & (DICT_FTS | DICT_CORRUPT)
+ || !index->is_committed()) {
continue;
}
@@ -481,7 +526,7 @@ row_undo_ins_remove_sec_rec(
always contain all fields of the index. It does not
matter if any indexes were created afterwards; all
index entries can be reconstructed from the row. */
- entry = row_build_index_entry(
+ dtuple_t* entry = row_build_index_entry(
node->row, node->ext, index, heap);
if (UNIV_UNLIKELY(!entry)) {
/* The database must have crashed after
@@ -504,7 +549,6 @@ row_undo_ins_remove_sec_rec(
}
mem_heap_empty(heap);
- dict_table_next_uncorrupted_index(index);
}
func_exit:
@@ -527,12 +571,15 @@ row_undo_ins(
que_thr_t* thr) /*!< in: query thread */
{
dberr_t err;
- bool dict_locked = node->trx->dict_operation_lock_mode == RW_X_LATCH;
+ const bool dict_locked = node->trx->dict_operation_lock_mode;
if (!row_undo_ins_parse_undo_rec(node, dict_locked)) {
return DB_SUCCESS;
}
+ ut_ad(node->table->is_temporary()
+ || lock_table_has_locks(node->table));
+
/* Iterate over all the indexes and undo the insert.*/
node->index = dict_table_get_first_index(node->table);
@@ -546,8 +593,6 @@ row_undo_ins(
/* Skip the clustered index (the first index) */
node->index = dict_table_get_next_index(node->index);
- dict_table_skip_corrupt_index(node->index);
-
err = row_undo_ins_remove_sec_rec(node, thr);
if (err != DB_SUCCESS) {
@@ -556,21 +601,19 @@ row_undo_ins(
log_free_check();
- if (node->table->id == DICT_INDEXES_ID) {
- ut_ad(!node->table->is_temporary());
- if (!dict_locked) {
- mutex_enter(&dict_sys.mutex);
- }
+ if (!dict_locked && node->table->id == DICT_INDEXES_ID) {
+ dict_sys.lock(SRW_LOCK_CALL);
err = row_undo_ins_remove_clust_rec(node);
- if (!dict_locked) {
- mutex_exit(&dict_sys.mutex);
- }
+ dict_sys.unlock();
} else {
+ ut_ad(node->table->id != DICT_INDEXES_ID
+ || !node->table->is_temporary());
err = row_undo_ins_remove_clust_rec(node);
}
if (err == DB_SUCCESS && node->table->stat_initialized) {
- /* Not protected by dict_sys.mutex for
+ /* Not protected by dict_sys.latch
+ or table->stats_mutex_lock() for
performance reasons, we would rather get garbage
in stat_n_rows (which is just an estimate anyway)
than protecting the following code with a latch. */
@@ -579,7 +622,7 @@ row_undo_ins(
/* Do not attempt to update statistics when
executing ROLLBACK in the InnoDB SQL
interpreter, because in that case we would
- already be holding dict_sys.mutex, which
+ already be holding dict_sys.latch, which
would be acquired when updating statistics. */
if (!dict_locked) {
dict_stats_update_if_needed(node->table,
@@ -592,9 +635,13 @@ row_undo_ins(
log_free_check();
ut_ad(!node->table->is_temporary());
err = row_undo_ins_remove_clust_rec(node);
+ break;
+ case TRX_UNDO_EMPTY:
+ err = node->table->clear(thr);
+ break;
}
- dict_table_close(node->table, dict_locked, FALSE);
+ dict_table_close(node->table, dict_locked);
node->table = NULL;
diff --git a/storage/innobase/row/row0umod.cc b/storage/innobase/row/row0umod.cc
index bea2baa3cd6..50e15e03cc9 100644
--- a/storage/innobase/row/row0umod.cc
+++ b/storage/innobase/row/row0umod.cc
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 1997, 2017, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2017, 2021, MariaDB Corporation.
+Copyright (c) 2017, 2023, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -36,7 +36,6 @@ Created 2/27/1997 Heikki Tuuri
#include "ibuf0ibuf.h"
#include "row0undo.h"
#include "row0vers.h"
-#include "row0log.h"
#include "trx0trx.h"
#include "trx0rec.h"
#include "row0row.h"
@@ -80,17 +79,12 @@ row_undo_mod_clust_low(
mem_heap_t** offsets_heap,
/*!< in/out: memory heap that can be emptied */
mem_heap_t* heap, /*!< in/out: memory heap */
- const dtuple_t**rebuilt_old_pk,
- /*!< out: row_log_table_get_pk()
- before the update, or NULL if
- the table is not being rebuilt online or
- the PRIMARY KEY definition does not change */
byte* sys, /*!< out: DB_TRX_ID, DB_ROLL_PTR
for row_log_table_delete() */
que_thr_t* thr, /*!< in: query thread */
mtr_t* mtr, /*!< in: mtr; must be committed before
latching any further pages */
- ulint mode) /*!< in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE */
+ btr_latch_mode mode) /*!< in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE */
{
btr_pcur_t* pcur;
btr_cur_t* btr_cur;
@@ -99,10 +93,10 @@ row_undo_mod_clust_low(
pcur = &node->pcur;
btr_cur = btr_pcur_get_btr_cur(pcur);
- ut_d(auto pcur_restore_result =)
- btr_pcur_restore_position(mode, pcur, mtr);
+ if (pcur->restore_position(mode, mtr) != btr_pcur_t::SAME_ALL) {
+ return DB_CORRUPTION;
+ }
- ut_ad(pcur_restore_result == btr_pcur_t::SAME_ALL);
ut_ad(rec_get_trx_id(btr_cur_get_rec(btr_cur),
btr_cur_get_index(btr_cur))
== thr_get_trx(thr)->id
@@ -111,18 +105,9 @@ row_undo_mod_clust_low(
|| node->update->info_bits == REC_INFO_METADATA_ADD
|| node->update->info_bits == REC_INFO_METADATA_ALTER);
- if (mode != BTR_MODIFY_LEAF
- && dict_index_is_online_ddl(btr_cur_get_index(btr_cur))) {
- *rebuilt_old_pk = row_log_table_get_pk(
- btr_cur_get_rec(btr_cur),
- btr_cur_get_index(btr_cur), NULL, sys, &heap);
- } else {
- *rebuilt_old_pk = NULL;
- }
-
if (mode != BTR_MODIFY_TREE) {
- ut_ad((mode & ulint(~BTR_ALREADY_S_LATCHED))
- == BTR_MODIFY_LEAF);
+ ut_ad(mode == BTR_MODIFY_LEAF
+ || mode == BTR_MODIFY_LEAF_ALREADY_LATCHED);
err = btr_cur_optimistic_update(
BTR_NO_LOCKING_FLAG | BTR_NO_UNDO_LOG_FLAG
@@ -148,26 +133,57 @@ row_undo_mod_clust_low(
&& node->ref == &trx_undo_metadata
&& btr_cur_get_index(btr_cur)->table->instant
&& node->update->info_bits == REC_INFO_METADATA_ADD) {
- btr_reset_instant(*btr_cur_get_index(btr_cur), false,
- mtr);
+ btr_reset_instant(*btr_cur->index(), false, mtr);
}
}
- if (err == DB_SUCCESS
- && btr_cur_get_index(btr_cur)->table->id == DICT_COLUMNS_ID) {
+ if (err != DB_SUCCESS) {
+ return err;
+ }
+
+ switch (const auto id = btr_cur_get_index(btr_cur)->table->id) {
+ unsigned c;
+ case DICT_TABLES_ID:
+ if (node->trx != trx_roll_crash_recv_trx) {
+ break;
+ }
+ c = DICT_COL__SYS_TABLES__ID;
+ goto evict;
+ case DICT_INDEXES_ID:
+ if (node->trx != trx_roll_crash_recv_trx) {
+ break;
+ } else if (node->rec_type == TRX_UNDO_DEL_MARK_REC
+ && btr_cur_get_rec(btr_cur)
+ [8 + 8 + DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN]
+ == static_cast<byte>(*TEMP_INDEX_PREFIX_STR)) {
+ /* We are rolling back the DELETE of metadata
+ for a failed ADD INDEX operation. This does
+ not affect any cached table definition,
+ because we are filtering out such indexes in
+ dict_load_indexes(). */
+ break;
+ }
+ /* fall through */
+ case DICT_COLUMNS_ID:
+ static_assert(!DICT_COL__SYS_INDEXES__TABLE_ID, "");
+ static_assert(!DICT_COL__SYS_COLUMNS__TABLE_ID, "");
+ c = DICT_COL__SYS_COLUMNS__TABLE_ID;
/* This is rolling back an UPDATE or DELETE on SYS_COLUMNS.
If it was part of an instant ALTER TABLE operation, we
must evict the table definition, so that it can be
reloaded after the dictionary operation has been
completed. At this point, any corresponding operation
to the metadata record will have been rolled back. */
- const dfield_t& table_id = *dtuple_get_nth_field(node->row, 0);
+ evict:
+ const dfield_t& table_id = *dtuple_get_nth_field(node->row, c);
ut_ad(dfield_get_len(&table_id) == 8);
- node->trx->evict_table(mach_read_from_8(static_cast<byte*>(
- table_id.data)));
+ node->trx->evict_table(mach_read_from_8(
+ static_cast<byte*>(
+ table_id.data)),
+ id == DICT_COLUMNS_ID);
}
- return(err);
+ return DB_SUCCESS;
}
/** Get the byte offset of the DB_TRX_ID column
@@ -199,28 +215,23 @@ static ulint row_trx_id_offset(const rec_t* rec, const dict_index_t* index)
}
/** Determine if rollback must execute a purge-like operation.
-@param[in,out] node row undo
-@param[in,out] mtr mini-transaction
+@param node row undo
@return whether the record should be purged */
-static bool row_undo_mod_must_purge(undo_node_t* node, mtr_t* mtr)
+static bool row_undo_mod_must_purge(const undo_node_t &node)
{
- ut_ad(node->rec_type == TRX_UNDO_UPD_DEL_REC);
- ut_ad(!node->table->is_temporary());
-
- btr_cur_t* btr_cur = btr_pcur_get_btr_cur(&node->pcur);
- ut_ad(btr_cur->index->is_primary());
- DEBUG_SYNC_C("rollback_purge_clust");
+ ut_ad(node.rec_type == TRX_UNDO_UPD_DEL_REC);
+ ut_ad(!node.table->is_temporary());
- mtr->s_lock(&purge_sys.latch, __FILE__, __LINE__);
-
- if (!purge_sys.changes_visible(node->new_trx_id, node->table->name)) {
- return false;
- }
+ const btr_cur_t &btr_cur= node.pcur.btr_cur;
+ ut_ad(btr_cur.index()->is_primary());
+ DEBUG_SYNC_C("rollback_purge_clust");
- const rec_t* rec = btr_cur_get_rec(btr_cur);
+ if (!purge_sys.is_purgeable(node.new_trx_id))
+ return false;
- return trx_read_trx_id(rec + row_trx_id_offset(rec, btr_cur->index))
- == node->new_trx_id;
+ const rec_t *rec= btr_cur_get_rec(&btr_cur);
+ return trx_read_trx_id(rec + row_trx_id_offset(rec, btr_cur.index())) ==
+ node.new_trx_id;
}
/***********************************************************//**
@@ -238,13 +249,9 @@ row_undo_mod_clust(
mtr_t mtr;
dberr_t err;
dict_index_t* index;
- bool online;
ut_ad(thr_get_trx(thr) == node->trx);
- ut_ad(node->trx->dict_operation_lock_mode);
ut_ad(node->trx->in_rollback);
- ut_ad(rw_lock_own_flagged(&dict_sys.latch,
- RW_LOCK_FLAG_X | RW_LOCK_FLAG_S));
log_free_check();
pcur = &node->pcur;
@@ -259,26 +266,16 @@ row_undo_mod_clust(
ut_ad(lock_table_has_locks(index->table));
}
- online = dict_index_is_online_ddl(index);
- if (online) {
- ut_ad(node->trx->dict_operation_lock_mode != RW_X_LATCH);
- mtr_s_lock_index(index, &mtr);
- }
-
mem_heap_t* heap = mem_heap_create(1024);
mem_heap_t* offsets_heap = NULL;
rec_offs* offsets = NULL;
- const dtuple_t* rebuilt_old_pk;
byte sys[DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN];
/* Try optimistic processing of the record, keeping changes within
the index page */
err = row_undo_mod_clust_low(node, &offsets, &offsets_heap,
- heap, &rebuilt_old_pk, sys,
- thr, &mtr, online
- ? BTR_MODIFY_LEAF | BTR_ALREADY_S_LATCHED
- : BTR_MODIFY_LEAF);
+ heap, sys, thr, &mtr, BTR_MODIFY_LEAF);
if (err != DB_SUCCESS) {
btr_pcur_commit_specify_mtr(pcur, &mtr);
@@ -293,44 +290,12 @@ row_undo_mod_clust(
index->set_modified(mtr);
}
- err = row_undo_mod_clust_low(
- node, &offsets, &offsets_heap,
- heap, &rebuilt_old_pk, sys,
- thr, &mtr, BTR_MODIFY_TREE);
+ err = row_undo_mod_clust_low(node, &offsets, &offsets_heap,
+ heap, sys, thr, &mtr,
+ BTR_MODIFY_TREE);
ut_ad(err == DB_SUCCESS || err == DB_OUT_OF_FILE_SPACE);
}
- /* Online rebuild cannot be initiated while we are holding
- dict_sys.latch and index->lock. (It can be aborted.) */
- ut_ad(online || !dict_index_is_online_ddl(index));
-
- if (err == DB_SUCCESS && online) {
-
- ut_ad(rw_lock_own_flagged(
- &index->lock,
- RW_LOCK_FLAG_S | RW_LOCK_FLAG_X
- | RW_LOCK_FLAG_SX));
-
- switch (node->rec_type) {
- case TRX_UNDO_DEL_MARK_REC:
- row_log_table_insert(
- btr_pcur_get_rec(pcur), index, offsets);
- break;
- case TRX_UNDO_UPD_EXIST_REC:
- row_log_table_update(
- btr_pcur_get_rec(pcur), index, offsets,
- rebuilt_old_pk);
- break;
- case TRX_UNDO_UPD_DEL_REC:
- row_log_table_delete(
- btr_pcur_get_rec(pcur), index, offsets, sys);
- break;
- default:
- ut_ad(0);
- break;
- }
- }
-
/**
* when scrubbing, and records gets cleared,
* the transaction id is not present afterwards.
@@ -358,47 +323,55 @@ row_undo_mod_clust(
ut_ad(node->new_trx_id);
mtr.start();
- if (btr_pcur_restore_position(BTR_MODIFY_LEAF, pcur, &mtr) !=
+ if (pcur->restore_position(BTR_MODIFY_LEAF, &mtr) !=
btr_pcur_t::SAME_ALL) {
goto mtr_commit_exit;
}
+ ut_ad(rec_get_deleted_flag(btr_pcur_get_rec(pcur),
+ dict_table_is_comp(node->table)));
+
if (index->table->is_temporary()) {
mtr.set_log_mode(MTR_LOG_NO_REDO);
- } else {
- if (!row_undo_mod_must_purge(node, &mtr)) {
+ err = btr_cur_optimistic_delete(&pcur->btr_cur, 0,
+ &mtr);
+ if (err != DB_FAIL) {
goto mtr_commit_exit;
}
+ err = DB_SUCCESS;
+ btr_pcur_commit_specify_mtr(pcur, &mtr);
+ } else {
index->set_modified(mtr);
+ if (!row_undo_mod_must_purge(*node)) {
+ goto mtr_commit_exit;
+ }
+ err = btr_cur_optimistic_delete(&pcur->btr_cur, 0,
+ &mtr);
+ if (err != DB_FAIL) {
+ goto mtr_commit_exit;
+ }
+ err = DB_SUCCESS;
+ btr_pcur_commit_specify_mtr(pcur, &mtr);
}
- ut_ad(rec_get_deleted_flag(btr_pcur_get_rec(pcur),
- dict_table_is_comp(node->table)));
- if (btr_cur_optimistic_delete(&pcur->btr_cur, 0, &mtr)) {
- goto mtr_commit_exit;
- }
-
- btr_pcur_commit_specify_mtr(pcur, &mtr);
-
mtr.start();
- if (btr_pcur_restore_position(
- BTR_MODIFY_TREE | BTR_LATCH_FOR_DELETE,
- pcur, &mtr) != btr_pcur_t::SAME_ALL) {
+ if (pcur->restore_position(BTR_PURGE_TREE, &mtr) !=
+ btr_pcur_t::SAME_ALL) {
goto mtr_commit_exit;
}
+ ut_ad(rec_get_deleted_flag(btr_pcur_get_rec(pcur),
+ dict_table_is_comp(node->table)));
+
if (index->table->is_temporary()) {
mtr.set_log_mode(MTR_LOG_NO_REDO);
} else {
- if (!row_undo_mod_must_purge(node, &mtr)) {
+ if (!row_undo_mod_must_purge(*node)) {
goto mtr_commit_exit;
}
index->set_modified(mtr);
}
- ut_ad(rec_get_deleted_flag(btr_pcur_get_rec(pcur),
- dict_table_is_comp(node->table)));
-
/* This operation is analogous to purge, we can free
also inherited externally stored fields. We can also
assume that the record was complete (including BLOBs),
@@ -407,25 +380,20 @@ row_undo_mod_clust(
rollback=false, just like purge does. */
btr_cur_pessimistic_delete(&err, FALSE, &pcur->btr_cur, 0,
false, &mtr);
- ut_ad(err == DB_SUCCESS
- || err == DB_OUT_OF_FILE_SPACE);
+ ut_ad(err == DB_SUCCESS || err == DB_OUT_OF_FILE_SPACE);
} else if (!index->table->is_temporary() && node->new_trx_id) {
/* We rolled back a record so that it still exists.
We must reset the DB_TRX_ID if the history is no
longer accessible by any active read view. */
mtr.start();
- if (btr_pcur_restore_position(BTR_MODIFY_LEAF, pcur, &mtr)
- != btr_pcur_t::SAME_ALL) {
- goto mtr_commit_exit;
- }
- rec_t* rec = btr_pcur_get_rec(pcur);
- mtr.s_lock(&purge_sys.latch, __FILE__, __LINE__);
- if (!purge_sys.changes_visible(node->new_trx_id,
- node->table->name)) {
+ if (pcur->restore_position(BTR_MODIFY_LEAF, &mtr)
+ != btr_pcur_t::SAME_ALL
+ || !purge_sys.is_purgeable(node->new_trx_id)) {
goto mtr_commit_exit;
}
+ rec_t* rec = btr_pcur_get_rec(pcur);
ulint trx_id_offset = index->trx_id_offset;
ulint trx_id_pos = index->n_uniq ? index->n_uniq : 1;
/* Reserve enough offsets for the PRIMARY KEY and
@@ -481,7 +449,7 @@ row_undo_mod_clust(
mtr.memset(block, offs, DATA_TRX_ID_LEN, 0);
offs += DATA_TRX_ID_LEN;
mtr.write<1,mtr_t::MAYBE_NOP>(*block,
- block->frame
+ block->page.frame
+ offs, 0x80U);
mtr.memset(block, offs + 1,
DATA_ROLL_PTR_LEN - 1, 0);
@@ -513,7 +481,7 @@ row_undo_mod_del_mark_or_remove_sec_low(
que_thr_t* thr, /*!< in: query thread */
dict_index_t* index, /*!< in: index */
dtuple_t* entry, /*!< in: index entry */
- ulint mode) /*!< in: latch mode BTR_MODIFY_LEAF or
+ btr_latch_mode mode) /*!< in: latch mode BTR_MODIFY_LEAF or
BTR_MODIFY_TREE */
{
btr_pcur_t pcur;
@@ -521,25 +489,36 @@ row_undo_mod_del_mark_or_remove_sec_low(
dberr_t err = DB_SUCCESS;
mtr_t mtr;
mtr_t mtr_vers;
- row_search_result search_result;
const bool modify_leaf = mode == BTR_MODIFY_LEAF;
row_mtr_start(&mtr, index, !modify_leaf);
- if (!index->is_committed()) {
+ pcur.btr_cur.page_cur.index = index;
+ btr_cur = btr_pcur_get_btr_cur(&pcur);
+
+ if (index->is_spatial()) {
+ mode = modify_leaf
+ ? btr_latch_mode(BTR_MODIFY_LEAF
+ | BTR_RTREE_DELETE_MARK
+ | BTR_RTREE_UNDO_INS)
+ : btr_latch_mode(BTR_PURGE_TREE | BTR_RTREE_UNDO_INS);
+ btr_cur->thr = thr;
+ if (UNIV_LIKELY(!rtr_search(entry, mode, &pcur, &mtr))) {
+ goto found;
+ } else {
+ goto func_exit;
+ }
+ } else if (!index->is_committed()) {
/* The index->online_status may change if the index is
or was being created online, but not committed yet. It
is protected by index->lock. */
if (modify_leaf) {
- mode = BTR_MODIFY_LEAF | BTR_ALREADY_S_LATCHED;
+ mode = BTR_MODIFY_LEAF_ALREADY_LATCHED;
mtr_s_lock_index(index, &mtr);
} else {
- ut_ad(mode == (BTR_MODIFY_TREE | BTR_LATCH_FOR_DELETE));
- mtr_sx_lock_index(index, &mtr);
- }
-
- if (row_log_online_op_try(index, entry, 0)) {
- goto func_exit_no_pcur;
+ ut_ad(mode == BTR_PURGE_TREE);
+ mode = BTR_PURGE_TREE_ALREADY_LATCHED;
+ mtr_x_lock_index(index, &mtr);
}
} else {
/* For secondary indexes,
@@ -548,20 +527,8 @@ row_undo_mod_del_mark_or_remove_sec_low(
ut_ad(!dict_index_is_online_ddl(index));
}
- btr_cur = btr_pcur_get_btr_cur(&pcur);
-
- if (dict_index_is_spatial(index)) {
- if (modify_leaf) {
- btr_cur->thr = thr;
- mode |= BTR_RTREE_DELETE_MARK;
- }
- mode |= BTR_RTREE_UNDO_INS;
- }
-
- search_result = row_search_index_entry(index, entry, mode,
- &pcur, &mtr);
-
- switch (UNIV_EXPECT(search_result, ROW_FOUND)) {
+ switch (UNIV_EXPECT(row_search_index_entry(entry, mode, &pcur, &mtr),
+ ROW_FOUND)) {
case ROW_NOT_FOUND:
/* In crash recovery, the secondary index record may
be missing if the UPDATE did not have time to insert
@@ -583,14 +550,15 @@ row_undo_mod_del_mark_or_remove_sec_low(
ut_error;
}
+found:
/* We should remove the index record if no prior version of the row,
which cannot be purged yet, requires its existence. If some requires,
we should delete mark the record. */
mtr_vers.start();
- ut_a(btr_pcur_restore_position(BTR_SEARCH_LEAF, &node->pcur, &mtr_vers)
- == btr_pcur_t::SAME_ALL);
+ ut_a(node->pcur.restore_position(BTR_SEARCH_LEAF, &mtr_vers) ==
+ btr_pcur_t::SAME_ALL);
/* For temporary table, we can skip to check older version of
clustered index entry, because there is no MVCC or purge. */
@@ -615,8 +583,7 @@ row_undo_mod_del_mark_or_remove_sec_low(
}
if (modify_leaf) {
- err = btr_cur_optimistic_delete(btr_cur, 0, &mtr)
- ? DB_SUCCESS : DB_FAIL;
+ err = btr_cur_optimistic_delete(btr_cur, 0, &mtr);
} else {
/* Passing rollback=false,
because we are deleting a secondary index record:
@@ -636,7 +603,6 @@ row_undo_mod_del_mark_or_remove_sec_low(
func_exit:
btr_pcur_close(&pcur);
-func_exit_no_pcur:
mtr_commit(&mtr);
return(err);
@@ -670,7 +636,7 @@ row_undo_mod_del_mark_or_remove_sec(
}
err = row_undo_mod_del_mark_or_remove_sec_low(node, thr, index,
- entry, BTR_MODIFY_TREE | BTR_LATCH_FOR_DELETE);
+ entry, BTR_PURGE_TREE);
return(err);
}
@@ -688,7 +654,7 @@ static MY_ATTRIBUTE((nonnull, warn_unused_result))
dberr_t
row_undo_mod_del_unmark_sec_and_undo_update(
/*========================================*/
- ulint mode, /*!< in: search mode: BTR_MODIFY_LEAF or
+ btr_latch_mode mode, /*!< in: search mode: BTR_MODIFY_LEAF or
BTR_MODIFY_TREE */
que_thr_t* thr, /*!< in: query thread */
dict_index_t* index, /*!< in: index */
@@ -703,51 +669,42 @@ row_undo_mod_del_unmark_sec_and_undo_update(
trx_t* trx = thr_get_trx(thr);
const ulint flags
= BTR_KEEP_SYS_FLAG | BTR_NO_LOCKING_FLAG;
- row_search_result search_result;
- ulint orig_mode = mode;
+ const auto orig_mode = mode;
+ pcur.btr_cur.page_cur.index = index;
ut_ad(trx->id != 0);
- if (dict_index_is_spatial(index)) {
+ if (index->is_spatial()) {
/* FIXME: Currently we do a 2-pass search for the undo
due to avoid undel-mark a wrong rec in rolling back in
partial update. Later, we could log some info in
secondary index updates to avoid this. */
- ut_ad(mode & BTR_MODIFY_LEAF);
- mode |= BTR_RTREE_DELETE_MARK;
+ static_assert(BTR_MODIFY_TREE == (8 | BTR_MODIFY_LEAF), "");
+ ut_ad(!(mode & 8));
+ mode = btr_latch_mode(mode | BTR_RTREE_DELETE_MARK);
}
try_again:
- row_mtr_start(&mtr, index, !(mode & BTR_MODIFY_LEAF));
+ row_mtr_start(&mtr, index, mode & 8);
- if (!index->is_committed()) {
- /* The index->online_status may change if the index is
- or was being created online, but not committed yet. It
- is protected by index->lock. */
- if (mode == BTR_MODIFY_LEAF) {
- mode = BTR_MODIFY_LEAF | BTR_ALREADY_S_LATCHED;
- mtr_s_lock_index(index, &mtr);
- } else {
- ut_ad(mode == BTR_MODIFY_TREE);
- mtr_sx_lock_index(index, &mtr);
- }
+ btr_cur->thr = thr;
- if (row_log_online_op_try(index, entry, trx->id)) {
- goto func_exit_no_pcur;
+ if (index->is_spatial()) {
+ if (!rtr_search(entry, mode, &pcur, &mtr)) {
+ goto found;
}
- } else {
- /* For secondary indexes,
- index->online_status==ONLINE_INDEX_COMPLETE if
- index->is_committed(). */
- ut_ad(!dict_index_is_online_ddl(index));
- }
- btr_cur->thr = thr;
+ if (mode != orig_mode && btr_cur->rtr_info->fd_del) {
+ mode = orig_mode;
+ btr_pcur_close(&pcur);
+ mtr.commit();
+ goto try_again;
+ }
- search_result = row_search_index_entry(index, entry, mode,
- &pcur, &mtr);
+ goto not_found;
+ }
- switch (search_result) {
+ switch (row_search_index_entry(entry, mode, &pcur, &mtr)) {
mem_heap_t* heap;
mem_heap_t* offsets_heap;
rec_offs* offsets;
@@ -758,45 +715,27 @@ try_again:
flags BTR_INSERT, BTR_DELETE, or BTR_DELETE_MARK. */
ut_error;
case ROW_NOT_FOUND:
- /* For spatial index, if first search didn't find an
- undel-marked rec, try to find a del-marked rec. */
- if (dict_index_is_spatial(index) && btr_cur->rtr_info->fd_del) {
- if (mode != orig_mode) {
- mode = orig_mode;
- btr_pcur_close(&pcur);
- mtr_commit(&mtr);
- goto try_again;
- }
- }
-
- if (index->is_committed()) {
- /* During online secondary index creation, it
- is possible that MySQL is waiting for a
- meta-data lock upgrade before invoking
- ha_innobase::commit_inplace_alter_table()
- while this ROLLBACK is executing. InnoDB has
- finished building the index, but it does not
- yet exist in MySQL. In this case, we suppress
- the printout to the error log. */
+not_found:
+ if (btr_cur->up_match >= dict_index_get_n_unique(index)
+ || btr_cur->low_match >= dict_index_get_n_unique(index)) {
ib::warn() << "Record in index " << index->name
<< " of table " << index->table->name
- << " was not found on rollback, trying to"
- " insert: " << *entry
+ << " was not found on rollback, and"
+ " a duplicate exists: "
+ << *entry
<< " at: " << rec_index_print(
btr_cur_get_rec(btr_cur), index);
- }
-
- if (btr_cur->up_match >= dict_index_get_n_unique(index)
- || btr_cur->low_match >= dict_index_get_n_unique(index)) {
- if (index->is_committed()) {
- ib::warn() << "Record in index " << index->name
- << " was not found on rollback, and"
- " a duplicate exists";
- }
err = DB_DUPLICATE_KEY;
break;
}
+ ib::warn() << "Record in index " << index->name
+ << " of table " << index->table->name
+ << " was not found on rollback, trying to insert: "
+ << *entry
+ << " at: " << rec_index_print(
+ btr_cur_get_rec(btr_cur), index);
+
/* Insert the missing record that we were trying to
delete-unmark. */
big_rec_t* big_rec;
@@ -834,6 +773,7 @@ try_again:
break;
case ROW_FOUND:
+found:
btr_rec_set_deleted<false>(btr_cur_get_block(btr_cur),
btr_cur_get_rec(btr_cur), &mtr);
heap = mem_heap_create(
@@ -879,44 +819,12 @@ try_again:
}
btr_pcur_close(&pcur);
-func_exit_no_pcur:
mtr_commit(&mtr);
return(err);
}
/***********************************************************//**
-Flags a secondary index corrupted. */
-static MY_ATTRIBUTE((nonnull))
-void
-row_undo_mod_sec_flag_corrupted(
-/*============================*/
- trx_t* trx, /*!< in/out: transaction */
- dict_index_t* index) /*!< in: secondary index */
-{
- ut_ad(!dict_index_is_clust(index));
-
- switch (trx->dict_operation_lock_mode) {
- case RW_S_LATCH:
- /* Because row_undo() is holding an S-latch
- on the data dictionary during normal rollback,
- we can only mark the index corrupted in the
- data dictionary cache. TODO: fix this somehow.*/
- mutex_enter(&dict_sys.mutex);
- dict_set_corrupted_index_cache_only(index);
- mutex_exit(&dict_sys.mutex);
- break;
- default:
- ut_ad(0);
- /* fall through */
- case RW_X_LATCH:
- /* This should be the rollback of a data dictionary
- transaction. */
- dict_set_corrupted(index, trx, "rollback");
- }
-}
-
-/***********************************************************//**
Undoes a modify in secondary indexes when undo record type is UPD_DEL.
@return DB_SUCCESS or DB_OUT_OF_FILE_SPACE */
static MY_ATTRIBUTE((nonnull, warn_unused_result))
@@ -934,12 +842,11 @@ row_undo_mod_upd_del_sec(
heap = mem_heap_create(1024);
- while (node->index != NULL) {
- dict_index_t* index = node->index;
- dtuple_t* entry;
+ do {
+ dict_index_t* index = node->index;
- if (index->type & DICT_FTS) {
- dict_table_next_uncorrupted_index(node->index);
+ if (index->type & (DICT_FTS | DICT_CORRUPT)
+ || !index->is_committed()) {
continue;
}
@@ -950,7 +857,7 @@ row_undo_mod_upd_del_sec(
time when the undo log record was written. When we get
to roll back an undo log entry TRX_UNDO_DEL_MARK_REC,
it should always cover all affected indexes. */
- entry = row_build_index_entry(
+ dtuple_t* entry = row_build_index_entry(
node->row, node->ext, index, heap);
if (UNIV_UNLIKELY(!entry)) {
@@ -975,8 +882,7 @@ row_undo_mod_upd_del_sec(
}
mem_heap_empty(heap);
- dict_table_next_uncorrupted_index(node->index);
- }
+ } while ((node->index = dict_table_get_next_index(node->index)));
mem_heap_free(heap);
@@ -1000,12 +906,11 @@ row_undo_mod_del_mark_sec(
heap = mem_heap_create(1024);
- while (node->index != NULL) {
- dict_index_t* index = node->index;
- dtuple_t* entry;
+ do {
+ dict_index_t* index = node->index;
- if (index->type == DICT_FTS) {
- dict_table_next_uncorrupted_index(node->index);
+ if (index->type & (DICT_FTS | DICT_CORRUPT)
+ || !index->is_committed()) {
continue;
}
@@ -1016,7 +921,7 @@ row_undo_mod_del_mark_sec(
time when the undo log record was written. When we get
to roll back an undo log entry TRX_UNDO_DEL_MARK_REC,
it should always cover all affected indexes. */
- entry = row_build_index_entry(
+ dtuple_t* entry = row_build_index_entry(
node->row, node->ext, index, heap);
ut_a(entry);
@@ -1029,8 +934,7 @@ row_undo_mod_del_mark_sec(
}
if (err == DB_DUPLICATE_KEY) {
- row_undo_mod_sec_flag_corrupted(
- thr_get_trx(thr), index);
+ index->type |= DICT_CORRUPT;
err = DB_SUCCESS;
/* Do not return any error to the caller. The
duplicate will be reported by ALTER TABLE or
@@ -1043,8 +947,7 @@ row_undo_mod_del_mark_sec(
}
mem_heap_empty(heap);
- dict_table_next_uncorrupted_index(node->index);
- }
+ } while ((node->index = dict_table_get_next_index(node->index)));
mem_heap_free(heap);
@@ -1061,48 +964,33 @@ row_undo_mod_upd_exist_sec(
undo_node_t* node, /*!< in: row undo node */
que_thr_t* thr) /*!< in: query thread */
{
- mem_heap_t* heap;
- dberr_t err = DB_SUCCESS;
-
- if (node->index == NULL
- || ((node->cmpl_info & UPD_NODE_NO_ORD_CHANGE))) {
- /* No change in secondary indexes */
-
- return(err);
+ if (node->cmpl_info & UPD_NODE_NO_ORD_CHANGE) {
+ return DB_SUCCESS;
}
- heap = mem_heap_create(1024);
+ mem_heap_t* heap = mem_heap_create(1024);
+ dberr_t err = DB_SUCCESS;
+ do {
+ dict_index_t* index = node->index;
- while (node->index != NULL) {
- dict_index_t* index = node->index;
- dtuple_t* entry;
+ if (index->type & (DICT_FTS | DICT_CORRUPT)
+ || !index->is_committed()) {
+ continue;
+ }
- if (dict_index_is_spatial(index)) {
- if (!row_upd_changes_ord_field_binary_func(
- index, node->update,
+ if (!row_upd_changes_ord_field_binary_func(
+ index, node->update,
#ifdef UNIV_DEBUG
- thr,
+ thr,
#endif /* UNIV_DEBUG */
- node->row,
- node->ext, ROW_BUILD_FOR_UNDO)) {
- dict_table_next_uncorrupted_index(node->index);
- continue;
- }
- } else {
- if (index->type == DICT_FTS
- || !row_upd_changes_ord_field_binary(index,
- node->update,
- thr, node->row,
- node->ext)) {
- dict_table_next_uncorrupted_index(node->index);
- continue;
- }
+ node->row, node->ext, ROW_BUILD_FOR_UNDO)) {
+ continue;
}
/* Build the newest version of the index entry */
- entry = row_build_index_entry(node->row, node->ext,
- index, heap);
+ dtuple_t* entry = row_build_index_entry(
+ node->row, node->ext, index, heap);
if (UNIV_UNLIKELY(!entry)) {
/* The server must have crashed in
row_upd_clust_rec_by_insert() before
@@ -1154,17 +1042,10 @@ row_undo_mod_upd_exist_sec(
the secondary index record if we updated its fields
but alphabetically they stayed the same, e.g.,
'abc' -> 'aBc'. */
- if (dict_index_is_spatial(index)) {
- entry = row_build_index_entry_low(node->undo_row,
- node->undo_ext,
- index, heap,
- ROW_BUILD_FOR_UNDO);
- } else {
- entry = row_build_index_entry(node->undo_row,
- node->undo_ext,
- index, heap);
- }
-
+ entry = row_build_index_entry_low(node->undo_row,
+ node->undo_ext,
+ index, heap,
+ ROW_BUILD_FOR_UNDO);
ut_a(entry);
err = row_undo_mod_del_unmark_sec_and_undo_update(
@@ -1175,16 +1056,14 @@ row_undo_mod_upd_exist_sec(
}
if (err == DB_DUPLICATE_KEY) {
- row_undo_mod_sec_flag_corrupted(
- thr_get_trx(thr), index);
+ index->type |= DICT_CORRUPT;
err = DB_SUCCESS;
} else if (err != DB_SUCCESS) {
break;
}
mem_heap_empty(heap);
- dict_table_next_uncorrupted_index(node->index);
- }
+ } while ((node->index = dict_table_get_next_index(node->index)));
mem_heap_free(heap);
@@ -1197,7 +1076,6 @@ row_undo_mod_upd_exist_sec(
static bool row_undo_mod_parse_undo_rec(undo_node_t* node, bool dict_locked)
{
dict_index_t* clust_index;
- byte* ptr;
undo_no_t undo_no;
table_id_t table_id;
trx_id_t trx_id;
@@ -1212,19 +1090,20 @@ static bool row_undo_mod_parse_undo_rec(undo_node_t* node, bool dict_locked)
ut_ad(node->trx->in_rollback);
ut_ad(!trx_undo_roll_ptr_is_insert(node->roll_ptr));
- ptr = trx_undo_rec_get_pars(node->undo_rec, &type, &cmpl_info,
- &dummy_extern, &undo_no, &table_id);
+ const byte *ptr = trx_undo_rec_get_pars(
+ node->undo_rec, &type, &cmpl_info,
+ &dummy_extern, &undo_no, &table_id);
node->rec_type = type;
if (node->state == UNDO_UPDATE_PERSISTENT) {
node->table = dict_table_open_on_id(table_id, dict_locked,
DICT_TABLE_OP_NORMAL);
} else if (!dict_locked) {
- mutex_enter(&dict_sys.mutex);
- node->table = dict_sys.get_temporary_table(table_id);
- mutex_exit(&dict_sys.mutex);
+ dict_sys.freeze(SRW_LOCK_CALL);
+ node->table = dict_sys.acquire_temporary_table(table_id);
+ dict_sys.unfreeze();
} else {
- node->table = dict_sys.get_temporary_table(table_id);
+ node->table = dict_sys.acquire_temporary_table(table_id);
}
if (!node->table) {
@@ -1244,7 +1123,7 @@ close_table:
would probably be better to just drop all temporary
tables (and temporary undo log records) of the current
connection, instead of doing this rollback. */
- dict_table_close(node->table, dict_locked, FALSE);
+ dict_table_close(node->table, dict_locked);
node->table = NULL;
return false;
}
@@ -1330,15 +1209,16 @@ row_undo_mod(
undo_node_t* node, /*!< in: row undo node */
que_thr_t* thr) /*!< in: query thread */
{
- dberr_t err;
+ dberr_t err = DB_SUCCESS;
ut_ad(thr_get_trx(thr) == node->trx);
- const bool dict_locked = node->trx->dict_operation_lock_mode
- == RW_X_LATCH;
+ const bool dict_locked = node->trx->dict_operation_lock_mode;
if (!row_undo_mod_parse_undo_rec(node, dict_locked)) {
return DB_SUCCESS;
}
+ ut_ad(node->table->is_temporary()
+ || lock_table_has_locks(node->table));
node->index = dict_table_get_first_index(node->table);
ut_ad(dict_index_is_clust(node->index));
@@ -1349,23 +1229,20 @@ row_undo_mod(
/* Skip the clustered index (the first index) */
node->index = dict_table_get_next_index(node->index);
-
- /* Skip all corrupted secondary index */
- dict_table_skip_corrupt_index(node->index);
-
- switch (node->rec_type) {
- case TRX_UNDO_UPD_EXIST_REC:
- err = row_undo_mod_upd_exist_sec(node, thr);
- break;
- case TRX_UNDO_DEL_MARK_REC:
- err = row_undo_mod_del_mark_sec(node, thr);
- break;
- case TRX_UNDO_UPD_DEL_REC:
- err = row_undo_mod_upd_del_sec(node, thr);
- break;
- default:
- ut_error;
- err = DB_ERROR;
+ if (node->index) {
+ switch (node->rec_type) {
+ case TRX_UNDO_UPD_EXIST_REC:
+ err = row_undo_mod_upd_exist_sec(node, thr);
+ break;
+ case TRX_UNDO_DEL_MARK_REC:
+ err = row_undo_mod_del_mark_sec(node, thr);
+ break;
+ case TRX_UNDO_UPD_DEL_REC:
+ err = row_undo_mod_upd_del_sec(node, thr);
+ break;
+ default:
+ MY_ASSERT_UNREACHABLE();
+ }
}
if (err == DB_SUCCESS) {
@@ -1394,7 +1271,7 @@ rollback_clust:
/* Do not attempt to update statistics when
executing ROLLBACK in the InnoDB SQL
interpreter, because in that case we would
- already be holding dict_sys.mutex, which
+ already be holding dict_sys.latch, which
would be acquired when updating statistics. */
if (update_statistics && !dict_locked) {
dict_stats_update_if_needed(node->table,
@@ -1405,7 +1282,7 @@ rollback_clust:
}
}
- dict_table_close(node->table, dict_locked, FALSE);
+ dict_table_close(node->table, dict_locked);
node->table = NULL;
diff --git a/storage/innobase/row/row0undo.cc b/storage/innobase/row/row0undo.cc
index 3ac8e434f35..4d6d779eee6 100644
--- a/storage/innobase/row/row0undo.cc
+++ b/storage/innobase/row/row0undo.cc
@@ -256,21 +256,6 @@ func_exit:
return(found);
}
-/** Try to truncate the undo logs.
-@param[in,out] trx transaction */
-static void row_undo_try_truncate(trx_t* trx)
-{
- if (trx_undo_t* undo = trx->rsegs.m_redo.undo) {
- ut_ad(undo->rseg == trx->rsegs.m_redo.rseg);
- trx_undo_truncate_end(*undo, trx->undo_no, false);
- }
-
- if (trx_undo_t* undo = trx->rsegs.m_noredo.undo) {
- ut_ad(undo->rseg == trx->rsegs.m_noredo.rseg);
- trx_undo_truncate_end(*undo, trx->undo_no, true);
- }
-}
-
/** Get the latest undo log record for rollback.
@param[in,out] node rollback context
@return whether an undo log record was fetched */
@@ -280,13 +265,14 @@ static bool row_undo_rec_get(undo_node_t* node)
if (trx->pages_undone) {
trx->pages_undone = 0;
- row_undo_try_truncate(trx);
+ trx_undo_try_truncate(*trx);
}
trx_undo_t* undo = NULL;
trx_undo_t* update = trx->rsegs.m_redo.undo;
trx_undo_t* temp = trx->rsegs.m_noredo.undo;
const undo_no_t limit = trx->roll_limit;
+ bool is_temp = false;
ut_ad(!update || !temp || update->empty() || temp->empty()
|| update->top_undo_no != temp->top_undo_no);
@@ -300,15 +286,14 @@ static bool row_undo_rec_get(undo_node_t* node)
}
if (temp && !temp->empty() && temp->top_undo_no >= limit) {
- if (!undo) {
- undo = temp;
- } else if (undo->top_undo_no < temp->top_undo_no) {
+ if (!undo || undo->top_undo_no < temp->top_undo_no) {
undo = temp;
+ is_temp = true;
}
}
if (undo == NULL) {
- row_undo_try_truncate(trx);
+ trx_undo_try_truncate(*trx);
/* Mark any ROLLBACK TO SAVEPOINT completed, so that
if the transaction object is committed and reused
later, we will default to a full ROLLBACK. */
@@ -321,13 +306,18 @@ static bool row_undo_rec_get(undo_node_t* node)
ut_ad(limit <= undo->top_undo_no);
node->roll_ptr = trx_undo_build_roll_ptr(
- false, undo->rseg->id, undo->top_page_no, undo->top_offset);
+ false, trx_sys.rseg_id(undo->rseg, !is_temp),
+ undo->top_page_no, undo->top_offset);
mtr_t mtr;
mtr.start();
- buf_block_t* undo_page = trx_undo_page_get_s_latched(
- page_id_t(undo->rseg->space->id, undo->top_page_no), &mtr);
+ buf_block_t* undo_page = buf_page_get(
+ page_id_t(undo->rseg->space->id, undo->top_page_no),
+ 0, RW_S_LATCH, &mtr);
+ if (!undo_page) {
+ return false;
+ }
uint16_t offset = undo->top_offset;
@@ -348,11 +338,15 @@ static bool row_undo_rec_get(undo_node_t* node)
ut_ad(undo->empty());
}
- node->undo_rec = trx_undo_rec_copy(undo_page->frame + offset,
+ node->undo_rec = trx_undo_rec_copy(undo_page->page.frame + offset,
node->heap);
mtr.commit();
- switch (trx_undo_rec_get_type(node->undo_rec)) {
+ if (UNIV_UNLIKELY(!node->undo_rec)) {
+ return false;
+ }
+
+ switch (node->undo_rec[2] & (TRX_UNDO_CMPL_INFO_MULT - 1)) {
case TRX_UNDO_INSERT_METADATA:
/* This record type was introduced in MDEV-11369
instant ADD COLUMN, which was implemented after
@@ -364,14 +358,14 @@ static bool row_undo_rec_get(undo_node_t* node)
ut_ad(undo == update);
/* fall through */
case TRX_UNDO_INSERT_REC:
+ case TRX_UNDO_EMPTY:
node->roll_ptr |= 1ULL << ROLL_PTR_INSERT_FLAG_POS;
- node->state = undo == temp
+ node->state = is_temp
? UNDO_INSERT_TEMPORARY : UNDO_INSERT_PERSISTENT;
break;
default:
- node->state = undo == temp
+ node->state = is_temp
? UNDO_UPDATE_TEMPORARY : UNDO_UPDATE_PERSISTENT;
- break;
}
trx->undo_no = node->undo_no = trx_undo_rec_get_undo_no(
@@ -399,19 +393,6 @@ row_undo(
return DB_SUCCESS;
}
- /* Prevent prepare_inplace_alter_table_dict() from adding
- dict_table_t::indexes while we are processing the record.
- Recovered transactions are not protected by MDL, and the
- secondary index creation is not protected by table locks
- for online operation. (A table lock would only be acquired
- when committing the ALTER TABLE operation.) */
- trx_t* trx = node->trx;
- const bool locked_data_dict = !trx->dict_operation_lock_mode;
-
- if (UNIV_UNLIKELY(locked_data_dict)) {
- row_mysql_freeze_data_dictionary(trx);
- }
-
dberr_t err;
switch (node->state) {
@@ -428,11 +409,6 @@ row_undo(
err = DB_CORRUPTION;
}
- if (locked_data_dict) {
-
- row_mysql_unfreeze_data_dictionary(trx);
- }
-
node->state = UNDO_NODE_FETCH_NEXT;
btr_pcur_close(&(node->pcur));
@@ -460,7 +436,7 @@ row_undo_step(
ut_ad(que_node_get_type(node) == QUE_NODE_UNDO);
- if (UNIV_UNLIKELY(trx_get_dict_operation(trx) == TRX_DICT_OP_NONE
+ if (UNIV_UNLIKELY(!trx->dict_operation
&& !srv_undo_sources
&& srv_shutdown_state != SRV_SHUTDOWN_NONE)
&& (srv_fast_shutdown == 3 || trx == trx_roll_crash_recv_trx)) {
diff --git a/storage/innobase/row/row0upd.cc b/storage/innobase/row/row0upd.cc
index 066e3d43d27..fe88fce58a2 100644
--- a/storage/innobase/row/row0upd.cc
+++ b/storage/innobase/row/row0upd.cc
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 1996, 2017, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2015, 2021, MariaDB Corporation.
+Copyright (c) 2015, 2023, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -127,10 +127,6 @@ row_upd_changes_first_fields_binary(
Checks if index currently is mentioned as a referenced index in a foreign
key constraint.
-NOTE that since we do not hold dict_sys.latch when leaving the
-function, it may be that the referencing table has been dropped when
-we leave this function: this function is only for heuristic use!
-
@return true if referenced */
static
bool
@@ -139,64 +135,44 @@ row_upd_index_is_referenced(
dict_index_t* index, /*!< in: index */
trx_t* trx) /*!< in: transaction */
{
- dict_table_t* table = index->table;
-
- if (table->referenced_set.empty()) {
- return false;
- }
-
- const bool froze_data_dict = !trx->dict_operation_lock_mode;
- if (froze_data_dict) {
- row_mysql_freeze_data_dictionary(trx);
- }
-
- dict_foreign_set::iterator it
- = std::find_if(table->referenced_set.begin(),
- table->referenced_set.end(),
- dict_foreign_with_index(index));
-
- const bool is_referenced = (it != table->referenced_set.end());
-
- if (froze_data_dict) {
- row_mysql_unfreeze_data_dictionary(trx);
- }
-
- return is_referenced;
+ dict_table_t *table= index->table;
+ /* The pointers in table->referenced_set are safe to dereference
+ thanks to the SQL layer having acquired MDL on all (grand)parent tables. */
+ dict_foreign_set::iterator end= table->referenced_set.end();
+ return end != std::find_if(table->referenced_set.begin(), end,
+ dict_foreign_with_index(index));
}
#ifdef WITH_WSREP
static
-ibool
+bool
wsrep_row_upd_index_is_foreign(
/*========================*/
dict_index_t* index, /*!< in: index */
trx_t* trx) /*!< in: transaction */
{
- dict_table_t* table = index->table;
- ibool froze_data_dict = FALSE;
- ibool is_referenced = FALSE;
+ if (!trx->is_wsrep())
+ return false;
- if (table->foreign_set.empty()) {
- return(FALSE);
- }
-
- if (trx->dict_operation_lock_mode == 0) {
- row_mysql_freeze_data_dictionary(trx);
- froze_data_dict = TRUE;
- }
+ dict_table_t *table= index->table;
- dict_foreign_set::iterator it
- = std::find_if(table->foreign_set.begin(),
- table->foreign_set.end(),
- dict_foreign_with_foreign_index(index));
+ if (table->foreign_set.empty())
+ return false;
- is_referenced = (it != table->foreign_set.end());
+ /* No MDL protects dereferencing the members of table->foreign_set. */
+ const bool no_lock= !trx->dict_operation_lock_mode;
+ if (no_lock)
+ dict_sys.freeze(SRW_LOCK_CALL);
- if (froze_data_dict) {
- row_mysql_unfreeze_data_dictionary(trx);
- }
+ auto end= table->foreign_set.end();
+ const bool is_referenced= end !=
+ std::find_if(table->foreign_set.begin(), end,
+ [index](const dict_foreign_t* f)
+ {return f->foreign_index == index;});
+ if (no_lock)
+ dict_sys.unfreeze();
- return(is_referenced);
+ return is_referenced;
}
#endif /* WITH_WSREP */
@@ -224,10 +200,8 @@ row_upd_check_references_constraints(
dict_foreign_t* foreign;
mem_heap_t* heap;
dtuple_t* entry;
- trx_t* trx;
const rec_t* rec;
dberr_t err;
- ibool got_s_lock = FALSE;
DBUG_ENTER("row_upd_check_references_constraints");
@@ -235,8 +209,6 @@ row_upd_check_references_constraints(
DBUG_RETURN(DB_SUCCESS);
}
- trx = thr_get_trx(thr);
-
rec = btr_pcur_get_rec(pcur);
ut_ad(rec_offs_validate(rec, index, offsets));
@@ -250,12 +222,6 @@ row_upd_check_references_constraints(
mtr->start();
- if (trx->dict_operation_lock_mode == 0) {
- got_s_lock = TRUE;
-
- row_mysql_freeze_data_dictionary(trx);
- }
-
DEBUG_SYNC_C_IF_THD(thr_get_trx(thr)->mysql_thd,
"foreign_constraint_check_for_insert");
@@ -275,34 +241,19 @@ row_upd_check_references_constraints(
|| row_upd_changes_first_fields_binary(
entry, index, node->update,
foreign->n_fields))) {
- dict_table_t* foreign_table = foreign->foreign_table;
-
- dict_table_t* ref_table = NULL;
-
- if (foreign_table == NULL) {
+ dict_table_t* ref_table = nullptr;
+ if (!foreign->foreign_table) {
ref_table = dict_table_open_on_name(
foreign->foreign_table_name_lookup,
- FALSE, FALSE, DICT_ERR_IGNORE_NONE);
+ false, DICT_ERR_IGNORE_NONE);
}
- if (foreign_table) {
- foreign_table->inc_fk_checks();
- }
-
- /* NOTE that if the thread ends up waiting for a lock
- we will release dict_sys.latch temporarily!
- But the inc_fk_checks() protects foreign_table from
- being dropped while the check is running. */
-
err = row_ins_check_foreign_constraint(
FALSE, foreign, table, entry, thr);
- if (foreign_table) {
- foreign_table->dec_fk_checks();
- }
- if (ref_table != NULL) {
- dict_table_close(ref_table, FALSE, FALSE);
+ if (ref_table) {
+ dict_table_close(ref_table);
}
if (err != DB_SUCCESS) {
@@ -314,10 +265,6 @@ row_upd_check_references_constraints(
err = DB_SUCCESS;
func_exit:
- if (got_s_lock) {
- row_mysql_unfreeze_data_dictionary(trx);
- }
-
mem_heap_free(heap);
DEBUG_SYNC_C("foreign_constraint_check_for_update_done");
@@ -341,18 +288,13 @@ wsrep_row_upd_check_foreign_constraints(
dict_foreign_t* foreign;
mem_heap_t* heap;
dtuple_t* entry;
- trx_t* trx;
const rec_t* rec;
dberr_t err;
- ibool got_s_lock = FALSE;
- ibool opened = FALSE;
if (table->foreign_set.empty()) {
return(DB_SUCCESS);
}
- trx = thr_get_trx(thr);
-
/* TODO: make native slave thread bail out here */
rec = btr_pcur_get_rec(pcur);
@@ -366,12 +308,6 @@ wsrep_row_upd_check_foreign_constraints(
mtr_start(mtr);
- if (trx->dict_operation_lock_mode == 0) {
- got_s_lock = TRUE;
-
- row_mysql_freeze_data_dictionary(trx);
- }
-
for (dict_foreign_set::iterator it = table->foreign_set.begin();
it != table->foreign_set.end();
++it) {
@@ -388,27 +324,21 @@ wsrep_row_upd_check_foreign_constraints(
entry, index, node->update,
foreign->n_fields))) {
- if (foreign->referenced_table == NULL) {
+ dict_table_t *opened = nullptr;
+
+ if (!foreign->referenced_table) {
foreign->referenced_table =
dict_table_open_on_name(
foreign->referenced_table_name_lookup,
- FALSE, FALSE, DICT_ERR_IGNORE_NONE);
- opened = (foreign->referenced_table) ? TRUE : FALSE;
+ false, DICT_ERR_IGNORE_NONE);
+ opened = foreign->referenced_table;
}
- /* NOTE that if the thread ends up waiting for a lock
- we will release dict_sys.latch temporarily!
- But the counter on the table protects 'foreign' from
- being dropped while the check is running. */
-
err = row_ins_check_foreign_constraint(
TRUE, foreign, table, entry, thr);
- if (foreign->referenced_table) {
- if (opened == TRUE) {
- dict_table_close(foreign->referenced_table, FALSE, FALSE);
- opened = FALSE;
- }
+ if (opened) {
+ dict_table_close(opened);
}
if (err != DB_SUCCESS) {
@@ -419,10 +349,6 @@ wsrep_row_upd_check_foreign_constraints(
err = DB_SUCCESS;
func_exit:
- if (got_s_lock) {
- row_mysql_unfreeze_data_dictionary(trx);
- }
-
mem_heap_free(heap);
return(err);
@@ -543,46 +469,6 @@ row_upd_changes_field_size_or_external(
return(FALSE);
}
-/***********************************************************//**
-Returns true if row update contains disowned external fields.
-@return true if the update contains disowned external fields. */
-bool
-row_upd_changes_disowned_external(
-/*==============================*/
- const upd_t* update) /*!< in: update vector */
-{
- const upd_field_t* upd_field;
- const dfield_t* new_val;
- ulint new_len;
- ulint n_fields;
- ulint i;
-
- n_fields = upd_get_n_fields(update);
-
- for (i = 0; i < n_fields; i++) {
- const byte* field_ref;
-
- upd_field = upd_get_nth_field(update, i);
- new_val = &(upd_field->new_val);
- new_len = dfield_get_len(new_val);
-
- if (!dfield_is_ext(new_val)) {
- continue;
- }
-
- ut_ad(new_len >= BTR_EXTERN_FIELD_REF_SIZE);
-
- field_ref = static_cast<const byte*>(dfield_get_data(new_val))
- + new_len - BTR_EXTERN_FIELD_REF_SIZE;
-
- if (field_ref[BTR_EXTERN_LEN] & BTR_EXTERN_OWNER_FLAG) {
- return(true);
- }
- }
-
- return(false);
-}
-
/***************************************************************//**
Builds an update vector from those fields which in a secondary index entry
differ from a record that has the equal ordering fields. NOTE: we compare
@@ -1146,16 +1032,7 @@ row_upd_replace_vcol(
/* If there is no index on the column, do not bother for
value update */
if (!col->m_col.ord_part) {
- dict_index_t* clust_index
- = dict_table_get_first_index(table);
-
- /* Skip the column if there is no online alter
- table in progress or it is not being indexed
- in new table */
- if (!dict_index_is_online_ddl(clust_index)
- || !row_log_col_is_indexed(clust_index, col_no)) {
- continue;
- }
+ continue;
}
dfield = dtuple_get_nth_v_field(row, col_no);
@@ -1345,9 +1222,6 @@ row_upd_changes_ord_field_binary_func(
ulint i;
const dict_index_t* clust_index;
- ut_ad(thr);
- ut_ad(thr->graph);
- ut_ad(thr->graph->trx);
ut_ad(!index->table->skip_alter_undo);
n_unique = dict_index_get_n_unique(index);
@@ -1547,9 +1421,11 @@ row_upd_changes_ord_field_binary_func(
trx_rollback_recovered()
when the server had crashed before
storing the field. */
- ut_ad(thr->graph->trx->is_recovered);
- ut_ad(thr->graph->trx
- == trx_roll_crash_recv_trx);
+ ut_ad(!thr
+ || thr->graph->trx->is_recovered);
+ ut_ad(!thr
+ || thr->graph->trx
+ == trx_roll_crash_recv_trx);
return(TRUE);
}
@@ -1956,25 +1832,28 @@ row_upd_sec_index_entry(
que_thr_t* thr) /*!< in: query thread */
{
mtr_t mtr;
- const rec_t* rec;
btr_pcur_t pcur;
mem_heap_t* heap;
dtuple_t* entry;
dict_index_t* index;
- btr_cur_t* btr_cur;
dberr_t err = DB_SUCCESS;
trx_t* trx = thr_get_trx(thr);
- ulint mode;
+ btr_latch_mode mode;
ulint flags;
enum row_search_result search_result;
ut_ad(trx->id != 0);
index = node->index;
+ ut_ad(index->is_committed());
+
+ /* For secondary indexes, index->online_status==ONLINE_INDEX_COMPLETE
+ if index->is_committed(). */
+ ut_ad(!dict_index_is_online_ddl(index));
const bool referenced = row_upd_index_is_referenced(index, trx);
#ifdef WITH_WSREP
- bool foreign = wsrep_row_upd_index_is_foreign(index, trx);
+ const bool foreign = wsrep_row_upd_index_is_foreign(index, trx);
#endif /* WITH_WSREP */
heap = mem_heap_create(1024);
@@ -1989,6 +1868,7 @@ row_upd_sec_index_entry(
"before_row_upd_sec_index_entry");
mtr.start();
+ mode = BTR_MODIFY_LEAF;
switch (index->table->space_id) {
case SRV_TMP_SPACE_ID:
@@ -2000,83 +1880,37 @@ row_upd_sec_index_entry(
/* fall through */
case IBUF_SPACE_ID:
flags = index->table->no_rollback() ? BTR_NO_ROLLBACK : 0;
+ /* We can only buffer delete-mark operations if there
+ are no foreign key constraints referring to the index. */
+ if (!referenced) {
+ mode = BTR_DELETE_MARK_LEAF;
+ }
break;
}
- bool uncommitted = !index->is_committed();
-
- if (uncommitted) {
- /* The index->online_status may change if the index is
- or was being created online, but not committed yet. It
- is protected by index->lock. */
-
- mtr_s_lock_index(index, &mtr);
+ /* Set the query thread, so that ibuf_insert_low() will be
+ able to invoke thd_get_trx(). */
+ pcur.btr_cur.thr = thr;
+ pcur.btr_cur.page_cur.index = index;
- switch (dict_index_get_online_status(index)) {
- case ONLINE_INDEX_COMPLETE:
- /* This is a normal index. Do not log anything.
- Perform the update on the index tree directly. */
- break;
- case ONLINE_INDEX_CREATION:
- /* Log a DELETE and optionally INSERT. */
- row_log_online_op(index, entry, 0);
-
- if (!node->is_delete) {
- mem_heap_empty(heap);
- entry = row_build_index_entry(
- node->upd_row, node->upd_ext,
- index, heap);
- ut_a(entry);
- row_log_online_op(index, entry, trx->id);
- }
- /* fall through */
- case ONLINE_INDEX_ABORTED:
- case ONLINE_INDEX_ABORTED_DROPPED:
- mtr_commit(&mtr);
- goto func_exit;
+ if (index->is_spatial()) {
+ mode = btr_latch_mode(BTR_MODIFY_LEAF | BTR_RTREE_DELETE_MARK);
+ if (UNIV_LIKELY(!rtr_search(entry, mode, &pcur, &mtr))) {
+ goto found;
}
- /* We can only buffer delete-mark operations if there
- are no foreign key constraints referring to the index.
- Change buffering is disabled for temporary tables and
- spatial index. */
- mode = (referenced || index->table->is_temporary()
- || dict_index_is_spatial(index))
- ? BTR_MODIFY_LEAF_ALREADY_S_LATCHED
- : BTR_DELETE_MARK_LEAF_ALREADY_S_LATCHED;
- } else {
- /* For secondary indexes,
- index->online_status==ONLINE_INDEX_COMPLETE if
- index->is_committed(). */
- ut_ad(!dict_index_is_online_ddl(index));
-
- /* We can only buffer delete-mark operations if there
- are no foreign key constraints referring to the index.
- Change buffering is disabled for temporary tables and
- spatial index. */
- mode = (referenced || index->table->is_temporary()
- || dict_index_is_spatial(index))
- ? BTR_MODIFY_LEAF
- : BTR_DELETE_MARK_LEAF;
- }
+ if (pcur.btr_cur.rtr_info->fd_del) {
+ /* We found the record, but a delete marked */
+ goto close;
+ }
- if (dict_index_is_spatial(index)) {
- ut_ad(mode & BTR_MODIFY_LEAF);
- mode |= BTR_RTREE_DELETE_MARK;
+ goto not_found;
}
- /* Set the query thread, so that ibuf_insert_low() will be
- able to invoke thd_get_trx(). */
- btr_pcur_get_btr_cur(&pcur)->thr = thr;
-
- search_result = row_search_index_entry(index, entry, mode,
- &pcur, &mtr);
-
- btr_cur = btr_pcur_get_btr_cur(&pcur);
-
- rec = btr_cur_get_rec(btr_cur);
+ search_result = row_search_index_entry(entry, mode, &pcur, &mtr);
switch (search_result) {
+ const rec_t* rec;
case ROW_NOT_DELETED_REF: /* should only occur for BTR_DELETE */
ut_error;
break;
@@ -2085,24 +1919,8 @@ row_upd_sec_index_entry(
break;
case ROW_NOT_FOUND:
- if (!index->is_committed()) {
- /* When online CREATE INDEX copied the update
- that we already made to the clustered index,
- and completed the secondary index creation
- before we got here, the old secondary index
- record would not exist. The CREATE INDEX
- should be waiting for a MySQL meta-data lock
- upgrade at least until this UPDATE returns.
- After that point, set_committed(true) would be
- invoked by commit_inplace_alter_table(). */
- break;
- }
-
- if (dict_index_is_spatial(index) && btr_cur->rtr_info->fd_del) {
- /* We found the record, but a delete marked */
- break;
- }
-
+not_found:
+ rec = btr_pcur_get_rec(&pcur);
ib::error()
<< "Record in index " << index->name
<< " of table " << index->table->name
@@ -2116,7 +1934,9 @@ row_upd_sec_index_entry(
#endif /* UNIV_DEBUG */
break;
case ROW_FOUND:
+found:
ut_ad(err == DB_SUCCESS);
+ rec = btr_pcur_get_rec(&pcur);
/* Delete mark the old index record; it can already be
delete marked if we return after a lock wait in
@@ -2125,14 +1945,14 @@ row_upd_sec_index_entry(
rec, dict_table_is_comp(index->table))) {
err = lock_sec_rec_modify_check_and_lock(
flags,
- btr_cur_get_block(btr_cur),
- btr_cur_get_rec(btr_cur), index, thr, &mtr);
+ btr_pcur_get_block(&pcur),
+ btr_pcur_get_rec(&pcur), index, thr, &mtr);
if (err != DB_SUCCESS) {
break;
}
- btr_rec_set_deleted<true>(btr_cur_get_block(btr_cur),
- btr_cur_get_rec(btr_cur),
+ btr_rec_set_deleted<true>(btr_pcur_get_block(&pcur),
+ btr_pcur_get_rec(&pcur),
&mtr);
#ifdef WITH_WSREP
if (!referenced && foreign
@@ -2191,6 +2011,7 @@ row_upd_sec_index_entry(
}
}
+close:
btr_pcur_close(&pcur);
mtr_commit(&mtr);
@@ -2204,35 +2025,11 @@ row_upd_sec_index_entry(
DEBUG_SYNC_C_IF_THD(trx->mysql_thd,
"before_row_upd_sec_new_index_entry");
- uncommitted = !index->is_committed();
- if (uncommitted) {
- mtr.start();
- /* The index->online_status may change if the index is
- being rollbacked. It is protected by index->lock. */
-
- mtr_s_lock_index(index, &mtr);
-
- switch (dict_index_get_online_status(index)) {
- case ONLINE_INDEX_COMPLETE:
- case ONLINE_INDEX_CREATION:
- break;
- case ONLINE_INDEX_ABORTED:
- case ONLINE_INDEX_ABORTED_DROPPED:
- mtr_commit(&mtr);
- goto func_exit;
- }
-
- }
-
/* Build a new index entry */
entry = row_build_index_entry(node->upd_row, node->upd_ext,
index, heap);
ut_a(entry);
- if (uncommitted) {
- mtr_commit(&mtr);
- }
-
/* Insert new index entry */
err = row_ins_sec_index_entry(index, entry, thr, !node->is_delete);
@@ -2553,7 +2350,6 @@ row_upd_clust_rec(
btr_pcur_t* pcur;
btr_cur_t* btr_cur;
dberr_t err;
- const dtuple_t* rebuilt_old_pk = NULL;
ut_ad(dict_index_is_clust(index));
ut_ad(!thr_get_trx(thr)->in_rollback);
@@ -2567,11 +2363,6 @@ row_upd_clust_rec(
dict_table_is_comp(index->table)));
ut_ad(rec_offs_validate(btr_cur_get_rec(btr_cur), index, offsets));
- if (dict_index_is_online_ddl(index)) {
- rebuilt_old_pk = row_log_table_get_pk(
- btr_cur_get_rec(btr_cur), index, offsets, NULL, &heap);
- }
-
/* Try optimistic updating of the record, keeping changes within
the page; we do not check locks because we assume the x-lock on the
record to update */
@@ -2589,7 +2380,7 @@ row_upd_clust_rec(
}
if (err == DB_SUCCESS) {
- goto success;
+ goto func_exit;
}
if (buf_pool.running_out()) {
@@ -2618,7 +2409,7 @@ row_upd_clust_rec(
the same transaction do not modify the record in the meantime.
Therefore we can assert that the restoration of the cursor succeeds. */
- ut_a(btr_pcur_restore_position(BTR_MODIFY_TREE, pcur, mtr) ==
+ ut_a(pcur->restore_position(BTR_MODIFY_TREE, mtr) ==
btr_pcur_t::SAME_ALL);
ut_ad(!rec_get_deleted_flag(btr_pcur_get_rec(pcur),
@@ -2642,15 +2433,6 @@ row_upd_clust_rec(
DEBUG_SYNC_C("after_row_upd_extern");
}
- if (err == DB_SUCCESS) {
-success:
- if (dict_index_is_online_ddl(index)) {
- row_log_table_update(
- btr_cur_get_rec(btr_cur),
- index, offsets, rebuilt_old_pk);
- }
- }
-
func_exit:
if (heap) {
mem_heap_free(heap);
@@ -2776,6 +2558,10 @@ row_upd_clust_step(
index = dict_table_get_first_index(node->table);
+ if (index->is_corrupted()) {
+ return DB_TABLE_CORRUPT;
+ }
+
const bool referenced = row_upd_index_is_referenced(index, trx);
#ifdef WITH_WSREP
const bool foreign = wsrep_row_upd_index_is_foreign(index, trx);
@@ -2810,57 +2596,30 @@ row_upd_clust_step(
ut_a(pcur->rel_pos == BTR_PCUR_ON);
- ulint mode;
+ btr_latch_mode mode;
DEBUG_SYNC_C_IF_THD(trx->mysql_thd, "innodb_row_upd_clust_step_enter");
if (dict_index_is_online_ddl(index)) {
ut_ad(node->table->id != DICT_INDEXES_ID);
- mode = BTR_MODIFY_LEAF | BTR_ALREADY_S_LATCHED;
+ mode = BTR_MODIFY_LEAF_ALREADY_LATCHED;
mtr_s_lock_index(index, &mtr);
} else {
mode = BTR_MODIFY_LEAF;
}
- if (btr_pcur_restore_position(mode, pcur, &mtr) !=
- btr_pcur_t::SAME_ALL) {
+ if (pcur->restore_position(mode, &mtr) != btr_pcur_t::SAME_ALL) {
err = DB_RECORD_NOT_FOUND;
goto exit_func;
}
- /* If this is a row in SYS_INDEXES table of the data dictionary,
- then we have to free the file segments of the index tree associated
- with the index */
-
- if (node->is_delete == PLAIN_DELETE
- && node->table->id == DICT_INDEXES_ID) {
-
- ut_ad(!dict_index_is_online_ddl(index));
-
- dict_drop_index_tree(pcur, trx, &mtr);
-
- mtr.commit();
-
- mtr.start();
- index->set_modified(mtr);
-
- if (btr_pcur_restore_position(BTR_MODIFY_LEAF, pcur, &mtr) !=
- btr_pcur_t::SAME_ALL) {
- err = DB_ERROR;
-
- mtr.commit();
-
- return(err);
- }
- }
-
rec = btr_pcur_get_rec(pcur);
offsets = rec_get_offsets(rec, index, offsets_, index->n_core_fields,
ULINT_UNDEFINED, &heap);
if (!flags && !node->has_clust_rec_x_lock) {
err = lock_clust_rec_modify_check_and_lock(
- 0, btr_pcur_get_block(pcur),
+ btr_pcur_get_block(pcur),
rec, index, offsets, thr);
if (err != DB_SUCCESS) {
goto exit_func;
@@ -2869,8 +2628,8 @@ row_upd_clust_step(
ut_ad(index->table->no_rollback() || index->table->is_temporary()
|| row_get_rec_trx_id(rec, index, offsets) == trx->id
- || lock_trx_has_expl_x_lock(trx, index->table,
- btr_pcur_get_block(pcur),
+ || lock_trx_has_expl_x_lock(*trx, *index->table,
+ btr_pcur_get_block(pcur)->page.id(),
page_rec_get_heap_no(rec)));
if (node->is_delete == PLAIN_DELETE) {
@@ -3017,14 +2776,12 @@ row_upd(
DBUG_EXECUTE_IF("row_upd_skip_sec", node->index = NULL;);
do {
- /* Skip corrupted index */
- dict_table_skip_corrupt_index(node->index);
-
if (!node->index) {
break;
}
- if (node->index->type != DICT_FTS) {
+ if (!(node->index->type & (DICT_FTS | DICT_CORRUPT))
+ && node->index->is_committed()) {
err = row_upd_sec_step(node, thr);
if (err != DB_SUCCESS) {
@@ -3091,7 +2848,7 @@ row_upd_step(
/* It may be that the current session has not yet
started its transaction, or it has been committed: */
- err = lock_table(0, node->table, LOCK_IX, thr);
+ err = lock_table(node->table, nullptr, LOCK_IX, thr);
if (err != DB_SUCCESS) {
diff --git a/storage/innobase/row/row0vers.cc b/storage/innobase/row/row0vers.cc
index 4774bef49ea..a4fc32cc5a8 100644
--- a/storage/innobase/row/row0vers.cc
+++ b/storage/innobase/row/row0vers.cc
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 1997, 2017, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2017, 2021, MariaDB Corporation.
+Copyright (c) 2017, 2022, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -77,7 +77,7 @@ index record.
@param[in] offsets rec_get_offsets(rec, index)
@param[in,out] mtr mini-transaction
@return the active transaction; state must be rechecked after
-trx_mutex_enter(), and trx->release_reference() must be invoked
+acquiring trx->mutex, and trx->release_reference() must be invoked
@retval NULL if the record was committed */
UNIV_INLINE
trx_t*
@@ -104,6 +104,9 @@ row_vers_impl_x_locked_low(
DBUG_ENTER("row_vers_impl_x_locked_low");
ut_ad(rec_offs_validate(rec, index, offsets));
+ ut_ad(mtr->memo_contains_page_flagged(clust_rec,
+ MTR_MEMO_PAGE_S_FIX
+ | MTR_MEMO_PAGE_X_FIX));
if (ulint trx_id_offset = clust_index->trx_id_offset) {
trx_id = mach_read_from_6(clust_rec + trx_id_offset);
@@ -190,14 +193,14 @@ row_vers_impl_x_locked_low(
heap = mem_heap_create(1024);
trx_undo_prev_version_build(
- clust_rec, mtr, version, clust_index, clust_offsets,
+ version, clust_index, clust_offsets,
heap, &prev_version, NULL,
dict_index_has_virtual(index) ? &vrow : NULL, 0);
- trx_mutex_enter(trx);
+ ut_d(trx->mutex_lock());
const bool committed = trx_state_eq(
trx, TRX_STATE_COMMITTED_IN_MEMORY);
- trx_mutex_exit(trx);
+ ut_d(trx->mutex_unlock());
/* The oldest visible clustered index version must not be
delete-marked, because we never start a transaction by
@@ -383,7 +386,7 @@ index record.
@param[in] index secondary index
@param[in] offsets rec_get_offsets(rec, index)
@return the active transaction; state must be rechecked after
-trx_mutex_enter(), and trx->release_reference() must be invoked
+acquiring trx->mutex, and trx->release_reference() must be invoked
@retval NULL if the record was committed */
trx_t*
row_vers_impl_x_locked(
@@ -397,7 +400,7 @@ row_vers_impl_x_locked(
const rec_t* clust_rec;
dict_index_t* clust_index;
- ut_ad(!lock_mutex_own());
+ lock_sys.assert_unlocked();
mtr_start(&mtr);
@@ -527,6 +530,10 @@ row_vers_build_cur_vrow_low(
= DATA_MISSING;
}
+ ut_ad(mtr->memo_contains_page_flagged(rec,
+ MTR_MEMO_PAGE_S_FIX
+ | MTR_MEMO_PAGE_X_FIX));
+
version = rec;
/* If this is called by purge thread, set TRX_UNDO_PREV_IN_PURGE
@@ -543,7 +550,7 @@ row_vers_build_cur_vrow_low(
version, clust_index, clust_offsets);
trx_undo_prev_version_build(
- rec, mtr, version, clust_index, clust_offsets,
+ version, clust_index, clust_offsets,
heap, &prev_version, NULL, vrow, status);
if (heap2) {
@@ -643,6 +650,10 @@ row_vers_vc_matches_cluster(
/* First compare non-virtual columns (primary keys) */
ut_ad(index->n_fields == n_fields);
ut_ad(n_fields == dtuple_get_n_fields(icentry));
+ ut_ad(mtr->memo_contains_page_flagged(rec,
+ MTR_MEMO_PAGE_S_FIX
+ | MTR_MEMO_PAGE_X_FIX));
+
{
const dfield_t* a = ientry->fields;
const dfield_t* b = icentry->fields;
@@ -684,7 +695,7 @@ row_vers_vc_matches_cluster(
ut_ad(roll_ptr != 0);
trx_undo_prev_version_build(
- rec, mtr, version, clust_index, clust_offsets,
+ version, clust_index, clust_offsets,
heap, &prev_version, NULL, vrow,
TRX_UNDO_PREV_IN_PURGE | TRX_UNDO_GET_OLD_V_VALUE);
@@ -858,7 +869,7 @@ static bool dtuple_vcol_data_missing(const dtuple_t &tuple,
}
/** Finds out if a version of the record, where the version >= the current
-purge view, should have ientry as its secondary index entry. We check
+purge_sys.view, should have ientry as its secondary index entry. We check
if there is any not delete marked version of the record where the trx
id >= purge view, and the secondary index entry == ientry; exactly in
this case we return TRUE.
@@ -1040,11 +1051,12 @@ unsafe_to_purge:
heap = mem_heap_create(1024);
vrow = NULL;
- trx_undo_prev_version_build(rec, mtr, version,
+ trx_undo_prev_version_build(version,
clust_index, clust_offsets,
- heap, &prev_version, NULL,
+ heap, &prev_version, nullptr,
dict_index_has_virtual(index)
- ? &vrow : NULL, 0);
+ ? &vrow : nullptr,
+ TRX_UNDO_CHECK_PURGEABILITY);
mem_heap_free(heap2); /* free version and clust_offsets */
if (!prev_version) {
@@ -1127,7 +1139,9 @@ nochange_index:
Constructs the version of a clustered index record which a consistent
read should see. We assume that the trx id stored in rec is such that
the consistent read should not see rec in its present version.
-@return DB_SUCCESS or DB_MISSING_HISTORY */
+@return error code
+@retval DB_SUCCESS if a previous version was fetched
+@retval DB_MISSING_HISTORY if the history is missing (a sign of corruption) */
dberr_t
row_vers_build_for_consistent_read(
/*===============================*/
@@ -1162,13 +1176,12 @@ row_vers_build_for_consistent_read(
ut_ad(index->is_primary());
ut_ad(mtr->memo_contains_page_flagged(rec, MTR_MEMO_PAGE_X_FIX
| MTR_MEMO_PAGE_S_FIX));
- ut_ad(!rw_lock_own(&(purge_sys.latch), RW_LOCK_S));
ut_ad(rec_offs_validate(rec, index, *offsets));
trx_id = row_get_rec_trx_id(rec, index, *offsets);
- ut_ad(!view->changes_visible(trx_id, index->table->name));
+ ut_ad(!view->changes_visible(trx_id));
ut_ad(!vrow || !(*vrow));
@@ -1186,12 +1199,10 @@ row_vers_build_for_consistent_read(
/* If purge can't see the record then we can't rely on
the UNDO log record. */
- bool purge_sees = trx_undo_prev_version_build(
- rec, mtr, version, index, *offsets, heap,
+ err = trx_undo_prev_version_build(
+ version, index, *offsets, heap,
&prev_version, NULL, vrow, 0);
- err = (purge_sees) ? DB_SUCCESS : DB_MISSING_HISTORY;
-
if (prev_heap != NULL) {
mem_heap_free(prev_heap);
}
@@ -1213,7 +1224,7 @@ row_vers_build_for_consistent_read(
trx_id = row_get_rec_trx_id(prev_version, index, *offsets);
- if (view->changes_visible(trx_id, index->table->name)) {
+ if (view->changes_visible(trx_id)) {
/* The view already sees this version: we can copy
it to in_heap and return */
@@ -1230,8 +1241,11 @@ row_vers_build_for_consistent_read(
dtuple_dup_v_fld(*vrow, in_heap);
}
break;
+ } else if (trx_id >= view->low_limit_id()
+ && trx_id >= trx_sys.get_max_trx_id()) {
+ err = DB_CORRUPTION;
+ break;
}
-
version = prev_version;
}
@@ -1240,6 +1254,10 @@ row_vers_build_for_consistent_read(
return(err);
}
+#if defined __aarch64__&&defined __GNUC__&&__GNUC__==4&&!defined __clang__
+/* Avoid GCC 4.8.5 internal compiler error "could not split insn". */
+# pragma GCC optimize ("O0")
+#endif
/*****************************************************************//**
Constructs the last committed version of a clustered index record,
which should be seen by a semi-consistent read. */
@@ -1275,7 +1293,6 @@ row_vers_build_for_semi_consistent_read(
ut_ad(index->is_primary());
ut_ad(mtr->memo_contains_page_flagged(rec, MTR_MEMO_PAGE_X_FIX
| MTR_MEMO_PAGE_S_FIX));
- ut_ad(!rw_lock_own(&(purge_sys.latch), RW_LOCK_S));
ut_ad(rec_offs_validate(rec, index, *offsets));
@@ -1345,10 +1362,9 @@ committed_version_trx:
heap2 = heap;
heap = mem_heap_create(1024);
- if (!trx_undo_prev_version_build(rec, mtr, version, index,
- *offsets, heap,
- &prev_version,
- in_heap, vrow, 0)) {
+ if (trx_undo_prev_version_build(version, index, *offsets, heap,
+ &prev_version, in_heap, vrow,
+ 0) != DB_SUCCESS) {
mem_heap_free(heap);
heap = heap2;
heap2 = NULL;
diff --git a/storage/innobase/srv/srv0mon.cc b/storage/innobase/srv/srv0mon.cc
index 174027cedda..b6496d03908 100644
--- a/storage/innobase/srv/srv0mon.cc
+++ b/storage/innobase/srv/srv0mon.cc
@@ -75,27 +75,19 @@ static monitor_info_t innodb_counter_info[] =
MONITOR_NONE,
MONITOR_DEFAULT_START, MONITOR_TABLE_OPEN},
- {"metadata_table_handles_closed", "metadata",
- "Number of table handles closed",
- MONITOR_NONE,
- MONITOR_DEFAULT_START, MONITOR_TABLE_CLOSE},
-
- {"metadata_table_reference_count", "metadata",
- "Table reference counter",
- MONITOR_NONE,
- MONITOR_DEFAULT_START, MONITOR_TABLE_REFERENCE},
-
/* ========== Counters for Lock Module ========== */
{"module_lock", "lock", "Lock Module",
MONITOR_MODULE,
MONITOR_DEFAULT_START, MONITOR_MODULE_LOCK},
{"lock_deadlocks", "lock", "Number of deadlocks",
- MONITOR_DEFAULT_ON,
+ static_cast<monitor_type_t>(
+ MONITOR_EXISTING | MONITOR_DEFAULT_ON | MONITOR_DISPLAY_CURRENT),
MONITOR_DEFAULT_START, MONITOR_DEADLOCK},
{"lock_timeouts", "lock", "Number of lock timeouts",
- MONITOR_DEFAULT_ON,
+ static_cast<monitor_type_t>(
+ MONITOR_EXISTING | MONITOR_DEFAULT_ON | MONITOR_DISPLAY_CURRENT),
MONITOR_DEFAULT_START, MONITOR_TIMEOUT},
{"lock_rec_lock_waits", "lock",
@@ -481,34 +473,16 @@ static monitor_info_t innodb_counter_info[] =
/* Cumulative counter for LRU batch pages flushed */
{"buffer_LRU_batch_flush_total_pages", "buffer",
"Total pages flushed as part of LRU batches",
- MONITOR_SET_OWNER, MONITOR_LRU_BATCH_FLUSH_COUNT,
- MONITOR_LRU_BATCH_FLUSH_TOTAL_PAGE},
-
- {"buffer_LRU_batches_flush", "buffer",
- "Number of LRU batches",
- MONITOR_SET_MEMBER, MONITOR_LRU_BATCH_FLUSH_TOTAL_PAGE,
- MONITOR_LRU_BATCH_FLUSH_COUNT},
-
- {"buffer_LRU_batch_flush_pages", "buffer",
- "Pages queued as an LRU batch",
- MONITOR_SET_MEMBER, MONITOR_LRU_BATCH_FLUSH_TOTAL_PAGE,
- MONITOR_LRU_BATCH_FLUSH_PAGES},
+ static_cast<monitor_type_t>(
+ MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+ MONITOR_DEFAULT_START, MONITOR_LRU_BATCH_FLUSH_TOTAL_PAGE},
/* Cumulative counter for LRU batch pages flushed */
{"buffer_LRU_batch_evict_total_pages", "buffer",
"Total pages evicted as part of LRU batches",
- MONITOR_SET_OWNER, MONITOR_LRU_BATCH_EVICT_COUNT,
- MONITOR_LRU_BATCH_EVICT_TOTAL_PAGE},
-
- {"buffer_LRU_batches_evict", "buffer",
- "Number of LRU batches",
- MONITOR_SET_MEMBER, MONITOR_LRU_BATCH_EVICT_TOTAL_PAGE,
- MONITOR_LRU_BATCH_EVICT_COUNT},
-
- {"buffer_LRU_batch_evict_pages", "buffer",
- "Pages queued as an LRU batch",
- MONITOR_SET_MEMBER, MONITOR_LRU_BATCH_EVICT_TOTAL_PAGE,
- MONITOR_LRU_BATCH_EVICT_PAGES},
+ static_cast<monitor_type_t>(
+ MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+ MONITOR_DEFAULT_START, MONITOR_LRU_BATCH_EVICT_TOTAL_PAGE},
{"buffer_LRU_single_flush_failure_count", "Buffer",
"Number of times attempt to flush a single page from LRU failed",
@@ -727,11 +701,6 @@ static monitor_info_t innodb_counter_info[] =
MONITOR_NONE,
MONITOR_DEFAULT_START, MONITOR_TRX_ROLLBACK_SAVEPOINT},
- {"trx_active_transactions", "transaction",
- "Number of active transactions",
- MONITOR_NONE,
- MONITOR_DEFAULT_START, MONITOR_TRX_ACTIVE},
-
{"trx_rseg_history_len", "transaction",
"Length of the TRX_RSEG_HISTORY list",
static_cast<monitor_type_t>(
@@ -940,7 +909,7 @@ static monitor_info_t innodb_counter_info[] =
MONITOR_DEFAULT_START, MONITOR_MODULE_INDEX},
{"index_page_splits", "index", "Number of index page splits",
- MONITOR_NONE,
+ MONITOR_EXISTING,
MONITOR_DEFAULT_START, MONITOR_INDEX_SPLIT},
{"index_page_merge_attempts", "index",
@@ -978,7 +947,6 @@ static monitor_info_t innodb_counter_info[] =
static_cast<monitor_type_t>(
MONITOR_EXISTING | MONITOR_DEFAULT_ON),
MONITOR_DEFAULT_START, MONITOR_OVLD_ADAPTIVE_HASH_SEARCH},
-#endif /* BTR_CUR_HASH_ADAPT */
{"adaptive_hash_searches_btree", "adaptive_hash_index",
"Number of searches using B-tree on an index search",
@@ -986,7 +954,6 @@ static monitor_info_t innodb_counter_info[] =
MONITOR_EXISTING | MONITOR_DEFAULT_ON),
MONITOR_DEFAULT_START, MONITOR_OVLD_ADAPTIVE_HASH_SEARCH_BTREE},
-#ifdef BTR_CUR_HASH_ADAPT
{"adaptive_hash_pages_added", "adaptive_hash_index",
"Number of index pages on which the Adaptive Hash Index is built",
MONITOR_NONE,
@@ -1109,11 +1076,6 @@ static monitor_info_t innodb_counter_info[] =
MONITOR_NONE,
MONITOR_DEFAULT_START, MONITOR_MASTER_IDLE_LOOPS},
- {"innodb_background_drop_table_usec", "server",
- "Time (in microseconds) spent to process drop table list",
- MONITOR_NONE,
- MONITOR_DEFAULT_START, MONITOR_SRV_BACKGROUND_DROP_TABLE_MICROSECOND},
-
{"innodb_log_flush_usec", "server",
"Time (in microseconds) spent to flush log records",
MONITOR_NONE,
@@ -1154,60 +1116,6 @@ static monitor_info_t innodb_counter_info[] =
MONITOR_EXISTING | MONITOR_DEFAULT_ON | MONITOR_DISPLAY_CURRENT),
MONITOR_DEFAULT_START, MONITOR_OVLD_SRV_PAGE_SIZE},
- {"innodb_rwlock_s_spin_waits", "server",
- "Number of rwlock spin waits due to shared latch request",
- static_cast<monitor_type_t>(
- MONITOR_EXISTING | MONITOR_DEFAULT_ON),
- MONITOR_DEFAULT_START, MONITOR_OVLD_RWLOCK_S_SPIN_WAITS},
-
- {"innodb_rwlock_x_spin_waits", "server",
- "Number of rwlock spin waits due to exclusive latch request",
- static_cast<monitor_type_t>(
- MONITOR_EXISTING | MONITOR_DEFAULT_ON),
- MONITOR_DEFAULT_START, MONITOR_OVLD_RWLOCK_X_SPIN_WAITS},
-
- {"innodb_rwlock_sx_spin_waits", "server",
- "Number of rwlock spin waits due to sx latch request",
- static_cast<monitor_type_t>(
- MONITOR_EXISTING | MONITOR_DEFAULT_ON),
- MONITOR_DEFAULT_START, MONITOR_OVLD_RWLOCK_SX_SPIN_WAITS},
-
- {"innodb_rwlock_s_spin_rounds", "server",
- "Number of rwlock spin loop rounds due to shared latch request",
- static_cast<monitor_type_t>(
- MONITOR_EXISTING | MONITOR_DEFAULT_ON),
- MONITOR_DEFAULT_START, MONITOR_OVLD_RWLOCK_S_SPIN_ROUNDS},
-
- {"innodb_rwlock_x_spin_rounds", "server",
- "Number of rwlock spin loop rounds due to exclusive latch request",
- static_cast<monitor_type_t>(
- MONITOR_EXISTING | MONITOR_DEFAULT_ON),
- MONITOR_DEFAULT_START, MONITOR_OVLD_RWLOCK_X_SPIN_ROUNDS},
-
- {"innodb_rwlock_sx_spin_rounds", "server",
- "Number of rwlock spin loop rounds due to sx latch request",
- static_cast<monitor_type_t>(
- MONITOR_EXISTING | MONITOR_DEFAULT_ON),
- MONITOR_DEFAULT_START, MONITOR_OVLD_RWLOCK_SX_SPIN_ROUNDS},
-
- {"innodb_rwlock_s_os_waits", "server",
- "Number of OS waits due to shared latch request",
- static_cast<monitor_type_t>(
- MONITOR_EXISTING | MONITOR_DEFAULT_ON),
- MONITOR_DEFAULT_START, MONITOR_OVLD_RWLOCK_S_OS_WAITS},
-
- {"innodb_rwlock_x_os_waits", "server",
- "Number of OS waits due to exclusive latch request",
- static_cast<monitor_type_t>(
- MONITOR_EXISTING | MONITOR_DEFAULT_ON),
- MONITOR_DEFAULT_START, MONITOR_OVLD_RWLOCK_X_OS_WAITS},
-
- {"innodb_rwlock_sx_os_waits", "server",
- "Number of OS waits due to sx latch request",
- static_cast<monitor_type_t>(
- MONITOR_EXISTING | MONITOR_DEFAULT_ON),
- MONITOR_DEFAULT_START, MONITOR_OVLD_RWLOCK_SX_OS_WAITS},
-
/* ========== Counters for DML operations ========== */
{"module_dml", "dml", "Statistics for DMLs",
MONITOR_MODULE,
@@ -1262,11 +1170,6 @@ static monitor_info_t innodb_counter_info[] =
MONITOR_NONE,
MONITOR_DEFAULT_START, MONITOR_BACKGROUND_DROP_INDEX},
- {"ddl_background_drop_tables", "ddl",
- "Number of tables in background drop table list",
- MONITOR_NONE,
- MONITOR_DEFAULT_START, MONITOR_BACKGROUND_DROP_TABLE},
-
{"ddl_online_create_index", "ddl",
"Number of indexes being created online",
MONITOR_NONE,
@@ -1309,16 +1212,6 @@ static monitor_info_t innodb_counter_info[] =
MONITOR_NONE,
MONITOR_DEFAULT_START, MONITOR_ICP_MATCH},
- /* ========== Mutex monitoring on/off ========== */
- {"latch_status", "Latch counters",
- "Collect latch counters to display via SHOW ENGING INNODB MUTEX",
- MONITOR_MODULE,
- MONITOR_DEFAULT_START, MONITOR_MODULE_LATCHES},
-
- {"latch", "sync", "Latch monitoring control",
- MONITOR_HIDDEN,
- MONITOR_DEFAULT_START, MONITOR_LATCHES},
-
/* ========== To turn on/off reset all counters ========== */
{"all", "All Counters", "Turn on/off and reset all counters",
MONITOR_MODULE,
@@ -1482,27 +1375,12 @@ srv_mon_set_module_control(
/****************************************************************//**
Get transaction system's rollback segment size in pages
@return size in pages */
-static
-ulint
-srv_mon_get_rseg_size(void)
-/*=======================*/
+TPOOL_SUPPRESS_TSAN static ulint srv_mon_get_rseg_size()
{
- ulint i;
- ulint value = 0;
-
- /* rseg_array is a static array, so we can go through it without
- mutex protection. In addition, we provide an estimate of the
- total rollback segment size and to avoid mutex contention we
- don't acquire the rseg->mutex" */
- for (i = 0; i < TRX_SYS_N_RSEGS; ++i) {
- const trx_rseg_t* rseg = trx_sys.rseg_array[i];
-
- if (rseg != NULL) {
- value += rseg->curr_size;
- }
- }
-
- return(value);
+ ulint size= 0;
+ for (const auto &rseg : trx_sys.rseg_array)
+ size+= rseg.curr_size;
+ return size;
}
/****************************************************************//**
@@ -1533,10 +1411,12 @@ srv_mon_process_existing_counter(
/* Get the value from corresponding global variable */
switch (monitor_id) {
- /* export_vars.innodb_buffer_pool_reads. Num Reads from
- disk (page not in buffer) */
+ case MONITOR_INDEX_SPLIT:
+ value = buf_pool.pages_split;
+ break;
+
case MONITOR_OVLD_BUF_POOL_READS:
- value = srv_stats.buf_pool_reads;
+ value = buf_pool.stat.n_pages_read;
break;
/* innodb_buffer_pool_read_requests, the number of logical
@@ -1597,7 +1477,7 @@ srv_mon_process_existing_counter(
/* innodb_buffer_pool_bytes_dirty */
case MONITOR_OVLD_BUF_POOL_BYTES_DIRTY:
- value = buf_pool.stat.flush_list_bytes;
+ value = buf_pool.flush_list_bytes;
break;
/* innodb_buffer_pool_pages_free */
@@ -1615,6 +1495,14 @@ srv_mon_process_existing_counter(
value = buf_pool.stat.n_pages_written;
break;
+ case MONITOR_LRU_BATCH_FLUSH_TOTAL_PAGE:
+ value = buf_lru_flush_page_count;
+ break;
+
+ case MONITOR_LRU_BATCH_EVICT_TOTAL_PAGE:
+ value = buf_lru_freed_page_count;
+ break;
+
/* innodb_pages_read */
case MONITOR_OVLD_PAGES_READ:
value = buf_pool.stat.n_pages_read;
@@ -1715,42 +1603,6 @@ srv_mon_process_existing_counter(
value = srv_page_size;
break;
- case MONITOR_OVLD_RWLOCK_S_SPIN_WAITS:
- value = rw_lock_stats.rw_s_spin_wait_count;
- break;
-
- case MONITOR_OVLD_RWLOCK_X_SPIN_WAITS:
- value = rw_lock_stats.rw_x_spin_wait_count;
- break;
-
- case MONITOR_OVLD_RWLOCK_SX_SPIN_WAITS:
- value = rw_lock_stats.rw_sx_spin_wait_count;
- break;
-
- case MONITOR_OVLD_RWLOCK_S_SPIN_ROUNDS:
- value = rw_lock_stats.rw_s_spin_round_count;
- break;
-
- case MONITOR_OVLD_RWLOCK_X_SPIN_ROUNDS:
- value = rw_lock_stats.rw_x_spin_round_count;
- break;
-
- case MONITOR_OVLD_RWLOCK_SX_SPIN_ROUNDS:
- value = rw_lock_stats.rw_sx_spin_round_count;
- break;
-
- case MONITOR_OVLD_RWLOCK_S_OS_WAITS:
- value = rw_lock_stats.rw_s_os_wait_count;
- break;
-
- case MONITOR_OVLD_RWLOCK_X_OS_WAITS:
- value = rw_lock_stats.rw_x_os_wait_count;
- break;
-
- case MONITOR_OVLD_RWLOCK_SX_OS_WAITS:
- value = rw_lock_stats.rw_sx_os_wait_count;
- break;
-
case MONITOR_OVLD_BUFFER_POOL_SIZE:
value = srv_buf_pool_size;
break;
@@ -1797,36 +1649,42 @@ srv_mon_process_existing_counter(
/* innodb_row_lock_current_waits */
case MONITOR_OVLD_ROW_LOCK_CURRENT_WAIT:
- value = srv_stats.n_lock_wait_current_count;
+ // dirty read without lock_sys.wait_mutex
+ value = lock_sys.get_wait_pending();
break;
/* innodb_row_lock_time */
case MONITOR_OVLD_LOCK_WAIT_TIME:
- value = srv_stats.n_lock_wait_time / 1000;
+ // dirty read without lock_sys.wait_mutex
+ value = lock_sys.get_wait_time_cumulative() / 1000;
break;
/* innodb_row_lock_time_max */
case MONITOR_OVLD_LOCK_MAX_WAIT_TIME:
- value = lock_sys.n_lock_max_wait_time / 1000;
+ // dirty read without lock_sys.wait_mutex
+ value = lock_sys.get_wait_time_max() / 1000;
break;
/* innodb_row_lock_time_avg */
case MONITOR_OVLD_LOCK_AVG_WAIT_TIME:
- if (srv_stats.n_lock_wait_count > 0) {
- value = srv_stats.n_lock_wait_time / 1000
- / srv_stats.n_lock_wait_count;
+ mysql_mutex_lock(&lock_sys.wait_mutex);
+ if (auto count = lock_sys.get_wait_cumulative()) {
+ value = lock_sys.get_wait_time_cumulative() / 1000
+ / count;
} else {
value = 0;
}
+ mysql_mutex_unlock(&lock_sys.wait_mutex);
break;
/* innodb_row_lock_waits */
case MONITOR_OVLD_ROW_LOCK_WAIT:
- value = srv_stats.n_lock_wait_count;
+ // dirty read without lock_sys.wait_mutex
+ value = lock_sys.get_wait_cumulative();
break;
case MONITOR_RSEG_HISTORY_LEN:
- value = trx_sys.rseg_history_len;
+ value = trx_sys.history_size_approx();
break;
case MONITOR_RSEG_CUR_SIZE:
@@ -1921,11 +1779,11 @@ srv_mon_process_existing_counter(
case MONITOR_OVLD_ADAPTIVE_HASH_SEARCH:
value = btr_cur_n_sea;
break;
-#endif /* BTR_CUR_HASH_ADAPT */
case MONITOR_OVLD_ADAPTIVE_HASH_SEARCH_BTREE:
value = btr_cur_n_non_sea;
break;
+#endif /* BTR_CUR_HASH_ADAPT */
case MONITOR_OVLD_PAGE_COMPRESS_SAVED:
value = srv_stats.page_compression_saved;
@@ -1948,6 +1806,12 @@ srv_mon_process_existing_counter(
case MONITOR_OVLD_PAGES_DECRYPTED:
value = srv_stats.pages_decrypted;
break;
+ case MONITOR_DEADLOCK:
+ value = lock_sys.deadlocks;
+ break;
+ case MONITOR_TIMEOUT:
+ value = lock_sys.timeouts;
+ break;
default:
ut_error;
diff --git a/storage/innobase/srv/srv0srv.cc b/storage/innobase/srv/srv0srv.cc
index 337460fc4d2..2e9f5a0eff8 100644
--- a/storage/innobase/srv/srv0srv.cc
+++ b/storage/innobase/srv/srv0srv.cc
@@ -40,9 +40,6 @@ Created 10/8/1995 Heikki Tuuri
*******************************************************/
#include "my_global.h"
-// JAN: TODO: MySQL 5.7 missing header
-//#include "my_thread.h"
-//
#include "mysql/psi/mysql_stage.h"
#include "mysql/psi/psi.h"
@@ -62,7 +59,6 @@ Created 10/8/1995 Heikki Tuuri
#include "srv0mon.h"
#include "srv0srv.h"
#include "srv0start.h"
-#include "sync0sync.h"
#include "trx0i_s.h"
#include "trx0purge.h"
#include "ut0crc32.h"
@@ -75,9 +71,11 @@ Created 10/8/1995 Heikki Tuuri
#include <list>
#include "log.h"
+#include "transactional_lock_guard.h"
+
#include <my_service_manager.h>
/* The following is the maximum allowed duration of a lock wait. */
-UNIV_INTERN ulong srv_fatal_semaphore_wait_threshold = DEFAULT_SRV_FATAL_SEMAPHORE_TIMEOUT;
+ulong srv_fatal_semaphore_wait_threshold = DEFAULT_SRV_FATAL_SEMAPHORE_TIMEOUT;
/* How much data manipulation language (DML) statements need to be delayed,
in microseconds, in order to reduce the lagging of the purge thread. */
@@ -185,7 +183,7 @@ mutex before switching to blocking wait on the mutex */
/** Check whether the number of failed nonblocking mutex
acquisition attempts exceeds maximum allowed value. If so,
srv_printf_innodb_monitor() will request mutex acquisition
-with mutex_enter(), which will wait until it gets the mutex. */
+with mysql_mutex_lock(), which will wait until it gets the mutex. */
#define MUTEX_NOWAIT(mutex_skipped) ((mutex_skipped) < MAX_MUTEX_NOWAIT)
/** copy of innodb_buffer_pool_size */
@@ -212,6 +210,9 @@ ulong srv_buf_pool_load_pages_abort = LONG_MAX;
/** Lock table size in bytes */
ulint srv_lock_table_size = ULINT_MAX;
+/** the value of innodb_checksum_algorithm */
+ulong srv_checksum_algorithm;
+
/** innodb_read_io_threads */
uint srv_n_read_io_threads;
/** innodb_write_io_threads */
@@ -346,22 +347,25 @@ ulint srv_truncated_status_writes;
ulong srv_available_undo_logs;
/* Defragmentation */
-UNIV_INTERN my_bool srv_defragment;
+my_bool srv_defragment;
/** innodb_defragment_n_pages */
-UNIV_INTERN uint srv_defragment_n_pages;
-UNIV_INTERN uint srv_defragment_stats_accuracy;
+uint srv_defragment_n_pages;
+uint srv_defragment_stats_accuracy;
/** innodb_defragment_fill_factor_n_recs */
-UNIV_INTERN uint srv_defragment_fill_factor_n_recs;
+uint srv_defragment_fill_factor_n_recs;
/** innodb_defragment_fill_factor */
-UNIV_INTERN double srv_defragment_fill_factor;
+double srv_defragment_fill_factor;
/** innodb_defragment_frequency */
-UNIV_INTERN uint srv_defragment_frequency;
+uint srv_defragment_frequency;
/** derived from innodb_defragment_frequency;
@see innodb_defragment_frequency_update() */
-UNIV_INTERN ulonglong srv_defragment_interval;
+ulonglong srv_defragment_interval;
/** Current mode of operation */
-UNIV_INTERN enum srv_operation_mode srv_operation;
+enum srv_operation_mode srv_operation;
+
+/** whether this is the server's first start after mariabackup --prepare */
+bool srv_start_after_restore;
/* Set the following to 0 if you want InnoDB to write messages on
stderr on startup/shutdown. Not enabled on the embedded server. */
@@ -379,20 +383,18 @@ my_bool srv_immediate_scrub_data_uncompressed;
static time_t srv_last_monitor_time;
-static ib_mutex_t srv_innodb_monitor_mutex;
+static mysql_mutex_t srv_innodb_monitor_mutex;
/** Mutex protecting page_zip_stat_per_index */
-ib_mutex_t page_zip_stat_per_index_mutex;
+mysql_mutex_t page_zip_stat_per_index_mutex;
-/* Mutex for locking srv_monitor_file. Not created if srv_read_only_mode */
-ib_mutex_t srv_monitor_file_mutex;
+/** Mutex for locking srv_monitor_file */
+mysql_mutex_t srv_monitor_file_mutex;
/** Temporary file for innodb monitor output */
FILE* srv_monitor_file;
-/** Mutex for locking srv_misc_tmpfile. Not created if srv_read_only_mode.
-This mutex has a very low rank; threads reserving it should not
-acquire any further latches or sleep before releasing this one. */
-ib_mutex_t srv_misc_tmpfile_mutex;
+/** Mutex for locking srv_misc_tmpfile */
+mysql_mutex_t srv_misc_tmpfile_mutex;
/** Temporary file for miscellanous diagnostic output */
FILE* srv_misc_tmpfile;
@@ -414,7 +416,7 @@ second. */
static time_t srv_last_log_flush_time;
/** Buffer pool dump status frequence in percentages */
-UNIV_INTERN ulong srv_buf_dump_status_frequency;
+ulong srv_buf_dump_status_frequency;
/*
IMPLEMENTATION OF THE SERVER MAIN PROGRAM
@@ -434,19 +436,13 @@ lock -- semaphore;
kernel -- kernel;
query thread execution:
-(a) without lock mutex
+(a) without lock_sys.latch
reserved -- process executing in user mode;
-(b) with lock mutex reserved
+(b) with lock_sys.latch reserved
-- process executing in kernel mode;
-The server has several backgroind threads all running at the same
-priority as user threads. It periodically checks if here is anything
-happening in the server which requires intervention of the master
-thread. Such situations may be, for example, when flushing of dirty
-blocks is needed in the buffer pool or old version of database rows
-have to be cleaned away (purged). The user can configure a separate
-dedicated purge thread(s) too, in which case the master thread does not
-do any purging.
+The server has several background threads all running at the same
+priority as user threads.
The threads which we call user threads serve the queries of the MySQL
server. They run at normal priority.
@@ -468,7 +464,7 @@ priority of the background thread so that it will be scheduled and it
can release the resource. This solution is called priority inheritance
in real-time programming. A drawback of this solution is that the overhead
of acquiring a mutex increases slightly, maybe 0.2 microseconds on a 100
-MHz Pentium, because the thread has to call os_thread_get_curr_id. This may
+MHz Pentium, because the thread has to call pthread_self. This may
be compared to 0.5 microsecond overhead for a mutex lock-unlock pair. Note
that the thread cannot store the information in the resource , say mutex,
itself, because competing threads could wipe out the information if it is
@@ -490,7 +486,7 @@ in a traditional Unix implementation. */
/** The server system struct */
struct srv_sys_t{
- ib_mutex_t tasks_mutex; /*!< variable protecting the
+ mysql_mutex_t tasks_mutex; /*!< variable protecting the
tasks queue */
UT_LIST_BASE_NODE_T(que_thr_t)
tasks; /*!< task queue */
@@ -510,8 +506,28 @@ static srv_sys_t srv_sys;
struct purge_coordinator_state
{
/** Snapshot of the last history length before the purge call.*/
- size_t m_history_length= 0;
- Atomic_counter<int> m_running{0};
+ size_t m_history_length;
+ Atomic_counter<int> m_running;
+private:
+ ulint count;
+ ulint n_use_threads;
+ ulint n_threads;
+
+ ulint lsn_lwm;
+ ulint lsn_hwm;
+ ulonglong start_time;
+ ulint lsn_age_factor;
+
+ static constexpr ulint adaptive_purge_threshold= 20;
+ static constexpr ulint safety_net= 20;
+ ulint series[innodb_purge_threads_MAX + 1];
+
+ inline void compute_series();
+ inline void lazy_init();
+ void refresh(bool full);
+
+public:
+ inline void do_purge();
};
static purge_coordinator_state purge_state;
@@ -593,13 +609,6 @@ static void thread_pool_thread_end()
}
-#ifndef DBUG_OFF
-static void dbug_after_task_callback()
-{
- ut_ad(!sync_check_iterate(sync_check()));
-}
-#endif
-
void srv_thread_pool_init()
{
DBUG_ASSERT(!srv_thread_pool);
@@ -611,9 +620,6 @@ void srv_thread_pool_init()
#endif
srv_thread_pool->set_thread_callbacks(thread_pool_thread_init,
thread_pool_thread_end);
-#ifndef DBUG_OFF
- tpool::set_after_task_callback(dbug_after_task_callback);
-#endif
}
@@ -629,25 +635,15 @@ static bool need_srv_free;
/** Initialize the server. */
static void srv_init()
{
- mutex_create(LATCH_ID_SRV_INNODB_MONITOR, &srv_innodb_monitor_mutex);
-
- if (!srv_read_only_mode) {
- mutex_create(LATCH_ID_SRV_SYS_TASKS, &srv_sys.tasks_mutex);
-
- UT_LIST_INIT(srv_sys.tasks, &que_thr_t::queue);
- }
+ mysql_mutex_init(srv_innodb_monitor_mutex_key,
+ &srv_innodb_monitor_mutex, nullptr);
+ mysql_mutex_init(srv_threads_mutex_key, &srv_sys.tasks_mutex, nullptr);
+ UT_LIST_INIT(srv_sys.tasks, &que_thr_t::queue);
need_srv_free = true;
- /* page_zip_stat_per_index_mutex is acquired from:
- 1. page_zip_compress() (after SYNC_FSP)
- 2. page_zip_decompress()
- 3. i_s_cmp_per_index_fill_low() (where SYNC_DICT is acquired)
- 4. innodb_cmp_per_index_update(), no other latches
- since we do not acquire any other latches while holding this mutex,
- it can have very low level. We pick SYNC_ANY_LATCH for it. */
- mutex_create(LATCH_ID_PAGE_ZIP_STAT_PER_INDEX,
- &page_zip_stat_per_index_mutex);
+ mysql_mutex_init(page_zip_stat_per_index_mutex_key,
+ &page_zip_stat_per_index_mutex, nullptr);
/* Initialize some INFORMATION SCHEMA internal structures */
trx_i_s_cache_init(trx_i_s_cache);
@@ -663,12 +659,9 @@ srv_free(void)
return;
}
- mutex_free(&srv_innodb_monitor_mutex);
- mutex_free(&page_zip_stat_per_index_mutex);
-
- if (!srv_read_only_mode) {
- mutex_free(&srv_sys.tasks_mutex);
- }
+ mysql_mutex_destroy(&srv_innodb_monitor_mutex);
+ mysql_mutex_destroy(&page_zip_stat_per_index_mutex);
+ mysql_mutex_destroy(&srv_sys.tasks_mutex);
trx_i_s_cache_free(trx_i_s_cache);
srv_thread_pool_end();
@@ -676,27 +669,28 @@ srv_free(void)
/*********************************************************************//**
Boots the InnoDB server. */
-void
-srv_boot(void)
-/*==========*/
+void srv_boot()
{
- srv_thread_pool_init();
- sync_check_init();
- trx_pool_init();
- row_mysql_init();
- srv_init();
+#ifndef NO_ELISION
+ if (transactional_lock_enabled())
+ sql_print_information("InnoDB: Using transactional memory");
+#endif
+ buf_dblwr.init();
+ srv_thread_pool_init();
+ trx_pool_init();
+ srv_init();
}
/******************************************************************//**
Refreshes the values used to calculate per-second averages. */
static void srv_refresh_innodb_monitor_stats(time_t current_time)
{
- mutex_enter(&srv_innodb_monitor_mutex);
+ mysql_mutex_lock(&srv_innodb_monitor_mutex);
if (difftime(current_time, srv_last_monitor_time) < 60) {
/* We referesh InnoDB Monitor values so that averages are
printed from at most 60 last seconds */
- mutex_exit(&srv_innodb_monitor_mutex);
+ mysql_mutex_unlock(&srv_innodb_monitor_mutex);
return;
}
@@ -706,8 +700,8 @@ static void srv_refresh_innodb_monitor_stats(time_t current_time)
#ifdef BTR_CUR_HASH_ADAPT
btr_cur_n_sea_old = btr_cur_n_sea;
-#endif /* BTR_CUR_HASH_ADAPT */
btr_cur_n_non_sea_old = btr_cur_n_non_sea;
+#endif /* BTR_CUR_HASH_ADAPT */
log_refresh_stats();
@@ -723,7 +717,7 @@ static void srv_refresh_innodb_monitor_stats(time_t current_time)
srv_n_system_rows_deleted_old = srv_stats.n_system_rows_deleted;
srv_n_system_rows_read_old = srv_stats.n_system_rows_read;
- mutex_exit(&srv_innodb_monitor_mutex);
+ mysql_mutex_unlock(&srv_innodb_monitor_mutex);
}
/******************************************************************//**
@@ -734,8 +728,7 @@ ibool
srv_printf_innodb_monitor(
/*======================*/
FILE* file, /*!< in: output stream */
- ibool nowait, /*!< in: whether to wait for the
- lock_sys_t:: mutex */
+ ibool nowait, /*!< in: whether to wait for lock_sys.latch */
ulint* trx_start_pos, /*!< out: file position of the start of
the list of active transactions */
ulint* trx_end) /*!< out: file position of the end of
@@ -745,7 +738,7 @@ srv_printf_innodb_monitor(
time_t current_time;
ibool ret;
- mutex_enter(&srv_innodb_monitor_mutex);
+ mysql_mutex_lock(&srv_innodb_monitor_mutex);
current_time = time(NULL);
@@ -772,18 +765,18 @@ srv_printf_innodb_monitor(
"-----------------\n", file);
srv_print_master_thread_info(file);
+ /* This section is intentionally left blank, for tools like "innotop" */
fputs("----------\n"
"SEMAPHORES\n"
"----------\n", file);
-
- sync_print(file);
+ /* End of intentionally blank section */
/* Conceptually, srv_innodb_monitor_mutex has a very high latching
- order level in sync0sync.h, while dict_foreign_err_mutex has a very
- low level 135. Therefore we can reserve the latter mutex here without
+ order level, while dict_foreign_err_mutex has a very low level.
+ Therefore we can reserve the latter mutex here without
a danger of a deadlock of threads. */
- mutex_enter(&dict_foreign_err_mutex);
+ mysql_mutex_lock(&dict_foreign_err_mutex);
if (!srv_read_only_mode && ftell(dict_foreign_err_file) != 0L) {
fputs("------------------------\n"
@@ -792,12 +785,12 @@ srv_printf_innodb_monitor(
ut_copy_file(file, dict_foreign_err_file);
}
- mutex_exit(&dict_foreign_err_mutex);
+ mysql_mutex_unlock(&dict_foreign_err_mutex);
/* Only if lock_print_info_summary proceeds correctly,
before we call the lock_print_info_all_transactions
to print all the lock information. IMPORTANT NOTE: This
- function acquires the lock mutex on success. */
+ function acquires exclusive lock_sys.latch on success. */
ret = lock_print_info_summary(file, nowait);
if (ret) {
@@ -810,9 +803,8 @@ srv_printf_innodb_monitor(
}
}
- /* NOTE: If we get here then we have the lock mutex. This
- function will release the lock mutex that we acquired when
- we called the lock_print_info_summary() function earlier. */
+ /* NOTE: The following function will release the lock_sys.latch
+ that lock_print_info_summary() acquired. */
lock_print_info_all_transactions(file);
@@ -839,29 +831,27 @@ srv_printf_innodb_monitor(
#ifdef BTR_CUR_HASH_ADAPT
for (ulint i = 0; i < btr_ahi_parts && btr_search_enabled; ++i) {
const auto part= &btr_search_sys.parts[i];
- rw_lock_s_lock(&part->latch);
+ part->latch.rd_lock(SRW_LOCK_CALL);
ut_ad(part->heap->type == MEM_HEAP_FOR_BTR_SEARCH);
fprintf(file, "Hash table size " ULINTPF
", node heap has " ULINTPF " buffer(s)\n",
part->table.n_cells,
part->heap->base.count - !part->heap->free_block);
- rw_lock_s_unlock(&part->latch);
+ part->latch.rd_unlock();
}
+ /* btr_cur_n_sea_old and btr_cur_n_non_sea_old are protected by
+ srv_innodb_monitor_mutex (srv_refresh_innodb_monitor_stats) */
+ const ulint with_ahi = btr_cur_n_sea, without_ahi = btr_cur_n_non_sea;
fprintf(file,
"%.2f hash searches/s, %.2f non-hash searches/s\n",
- static_cast<double>(btr_cur_n_sea - btr_cur_n_sea_old)
+ static_cast<double>(with_ahi - btr_cur_n_sea_old)
/ time_elapsed,
- static_cast<double>(btr_cur_n_non_sea - btr_cur_n_non_sea_old)
- / time_elapsed);
- btr_cur_n_sea_old = btr_cur_n_sea;
-#else /* BTR_CUR_HASH_ADAPT */
- fprintf(file,
- "%.2f non-hash searches/s\n",
- static_cast<double>(btr_cur_n_non_sea - btr_cur_n_non_sea_old)
+ static_cast<double>(without_ahi - btr_cur_n_non_sea_old)
/ time_elapsed);
+ btr_cur_n_sea_old = with_ahi;
+ btr_cur_n_non_sea_old = without_ahi;
#endif /* BTR_CUR_HASH_ADAPT */
- btr_cur_n_non_sea_old = btr_cur_n_non_sea;
fputs("---\n"
"LOG\n"
@@ -953,7 +943,7 @@ srv_printf_innodb_monitor(
fputs("----------------------------\n"
"END OF INNODB MONITOR OUTPUT\n"
"============================\n", file);
- mutex_exit(&srv_innodb_monitor_mutex);
+ mysql_mutex_unlock(&srv_innodb_monitor_mutex);
fflush(file);
return(ret);
@@ -972,24 +962,27 @@ srv_export_innodb_status(void)
}
#ifdef BTR_CUR_HASH_ADAPT
+ export_vars.innodb_ahi_hit = btr_cur_n_sea;
+ export_vars.innodb_ahi_miss = btr_cur_n_non_sea;
+
ulint mem_adaptive_hash = 0;
for (ulong i = 0; i < btr_ahi_parts; i++) {
const auto part= &btr_search_sys.parts[i];
- rw_lock_s_lock(&part->latch);
+ part->latch.rd_lock(SRW_LOCK_CALL);
if (part->heap) {
ut_ad(part->heap->type == MEM_HEAP_FOR_BTR_SEARCH);
mem_adaptive_hash += mem_heap_get_size(part->heap)
+ part->table.n_cells * sizeof(hash_cell_t);
}
- rw_lock_s_unlock(&part->latch);
+ part->latch.rd_unlock();
}
export_vars.innodb_mem_adaptive_hash = mem_adaptive_hash;
#endif
export_vars.innodb_mem_dictionary = dict_sys.rough_size();
- mutex_enter(&srv_innodb_monitor_mutex);
+ mysql_mutex_lock(&srv_innodb_monitor_mutex);
export_vars.innodb_data_pending_reads =
ulint(MONITOR_VALUE(MONITOR_OS_PENDING_READS));
@@ -1009,59 +1002,22 @@ srv_export_innodb_status(void)
export_vars.innodb_data_writes = os_n_file_writes;
- ulint dblwr = 0;
-
- if (buf_dblwr.is_initialised()) {
- buf_dblwr.lock();
- dblwr = buf_dblwr.submitted();
- export_vars.innodb_dblwr_pages_written = buf_dblwr.written();
- export_vars.innodb_dblwr_writes = buf_dblwr.batches();
- buf_dblwr.unlock();
- }
+ buf_dblwr.lock();
+ ulint dblwr = buf_dblwr.submitted();
+ export_vars.innodb_dblwr_pages_written = buf_dblwr.written();
+ export_vars.innodb_dblwr_writes = buf_dblwr.batches();
+ buf_dblwr.unlock();
export_vars.innodb_data_written = srv_stats.data_written + dblwr;
- export_vars.innodb_buffer_pool_read_requests
- = buf_pool.stat.n_page_gets;
-
export_vars.innodb_buffer_pool_write_requests =
srv_stats.buf_pool_write_requests;
- export_vars.innodb_buffer_pool_reads = srv_stats.buf_pool_reads;
-
- export_vars.innodb_buffer_pool_read_ahead_rnd =
- buf_pool.stat.n_ra_pages_read_rnd;
-
- export_vars.innodb_buffer_pool_read_ahead =
- buf_pool.stat.n_ra_pages_read;
-
- export_vars.innodb_buffer_pool_read_ahead_evicted =
- buf_pool.stat.n_ra_pages_evicted;
-
- export_vars.innodb_buffer_pool_pages_data =
- UT_LIST_GET_LEN(buf_pool.LRU);
-
export_vars.innodb_buffer_pool_bytes_data =
buf_pool.stat.LRU_bytes
+ (UT_LIST_GET_LEN(buf_pool.unzip_LRU)
<< srv_page_size_shift);
- export_vars.innodb_buffer_pool_pages_dirty =
- UT_LIST_GET_LEN(buf_pool.flush_list);
-
- export_vars.innodb_buffer_pool_pages_made_young
- = buf_pool.stat.n_pages_made_young;
- export_vars.innodb_buffer_pool_pages_made_not_young
- = buf_pool.stat.n_pages_not_made_young;
-
- export_vars.innodb_buffer_pool_pages_old = buf_pool.LRU_old_len;
-
- export_vars.innodb_buffer_pool_bytes_dirty =
- buf_pool.stat.flush_list_bytes;
-
- export_vars.innodb_buffer_pool_pages_free =
- UT_LIST_GET_LEN(buf_pool.free);
-
#ifdef UNIV_DEBUG
export_vars.innodb_buffer_pool_pages_latched =
buf_get_latched_pages_number();
@@ -1074,7 +1030,7 @@ srv_export_innodb_status(void)
- UT_LIST_GET_LEN(buf_pool.free);
export_vars.innodb_max_trx_id = trx_sys.get_max_trx_id();
- export_vars.innodb_history_list_length = trx_sys.rseg_history_len;
+ export_vars.innodb_history_list_length = trx_sys.history_size_approx();
export_vars.innodb_log_waits = srv_stats.log_waits;
@@ -1092,25 +1048,21 @@ srv_export_innodb_status(void)
export_vars.innodb_log_writes = srv_stats.log_writes;
- export_vars.innodb_row_lock_waits = srv_stats.n_lock_wait_count;
-
- export_vars.innodb_row_lock_current_waits =
- srv_stats.n_lock_wait_current_count;
+ mysql_mutex_lock(&lock_sys.wait_mutex);
+ export_vars.innodb_row_lock_waits = lock_sys.get_wait_cumulative();
- export_vars.innodb_row_lock_time = srv_stats.n_lock_wait_time / 1000;
+ export_vars.innodb_row_lock_current_waits= lock_sys.get_wait_pending();
- if (srv_stats.n_lock_wait_count > 0) {
-
- export_vars.innodb_row_lock_time_avg = (ulint)
- (srv_stats.n_lock_wait_time
- / 1000 / srv_stats.n_lock_wait_count);
-
- } else {
- export_vars.innodb_row_lock_time_avg = 0;
- }
+ export_vars.innodb_row_lock_time = lock_sys.get_wait_time_cumulative()
+ / 1000;
+ export_vars.innodb_row_lock_time_max = lock_sys.get_wait_time_max()
+ / 1000;
+ mysql_mutex_unlock(&lock_sys.wait_mutex);
- export_vars.innodb_row_lock_time_max =
- lock_sys.n_lock_max_wait_time / 1000;
+ export_vars.innodb_row_lock_time_avg= export_vars.innodb_row_lock_waits
+ ? static_cast<ulint>(export_vars.innodb_row_lock_time
+ / export_vars.innodb_row_lock_waits)
+ : 0;
export_vars.innodb_rows_read = srv_stats.n_rows_read;
@@ -1181,7 +1133,7 @@ srv_export_innodb_status(void)
srv_stats.n_key_requests;
}
- mutex_exit(&srv_innodb_monitor_mutex);
+ mysql_mutex_unlock(&srv_innodb_monitor_mutex);
mysql_mutex_lock(&log_sys.mutex);
export_vars.innodb_lsn_current = log_sys.get_lsn();
@@ -1221,7 +1173,7 @@ static void srv_monitor()
if (srv_print_innodb_monitor) {
/* Reset mutex_skipped counter everytime
srv_print_innodb_monitor changes. This is to
- ensure we will not be blocked by lock_sys.mutex
+ ensure we will not be blocked by lock_sys.latch
for short duration information printing */
if (!monitor_state.last_srv_print_monitor) {
monitor_state.mutex_skipped = 0;
@@ -1245,7 +1197,7 @@ static void srv_monitor()
mutexes in read-only-mode */
if (!srv_read_only_mode && srv_innodb_status) {
- mutex_enter(&srv_monitor_file_mutex);
+ mysql_mutex_lock(&srv_monitor_file_mutex);
rewind(srv_monitor_file);
if (!srv_printf_innodb_monitor(srv_monitor_file,
MUTEX_NOWAIT(monitor_state.mutex_skipped),
@@ -1256,28 +1208,18 @@ static void srv_monitor()
}
os_file_set_eof(srv_monitor_file);
- mutex_exit(&srv_monitor_file_mutex);
+ mysql_mutex_unlock(&srv_monitor_file_mutex);
}
}
srv_refresh_innodb_monitor_stats(current_time);
}
-/*********************************************************************//**
-A task which prints warnings about semaphore waits which have lasted
-too long. These can be used to track bugs which cause hangs.
-*/
+/** Periodic task which prints the info output by various InnoDB monitors.*/
void srv_monitor_task(void*)
{
/* number of successive fatal timeouts observed */
- static ulint fatal_cnt;
static lsn_t old_lsn = recv_sys.recovered_lsn;
- /* longest waiting thread for a semaphore */
- os_thread_id_t waiter;
- static os_thread_id_t old_waiter = os_thread_get_curr_id();
- /* the semaphore that is being waited for */
- const void* sema = NULL;
- static const void* old_sema = NULL;
ut_ad(!srv_read_only_mode);
@@ -1292,18 +1234,24 @@ void srv_monitor_task(void*)
eviction policy. */
buf_LRU_stat_update();
- if (sync_array_print_long_waits(&waiter, &sema)
- && sema == old_sema && os_thread_eq(waiter, old_waiter)) {
- if (fatal_cnt++) {
- ib::fatal() << "Semaphore wait has lasted > "
- << srv_fatal_semaphore_wait_threshold
- << " seconds. We intentionally crash the"
- " server because it appears to be hung.";
+ ulonglong now = my_hrtime_coarse().val;
+ const ulong threshold = srv_fatal_semaphore_wait_threshold;
+
+ if (ulonglong start = dict_sys.oldest_wait()) {
+ if (now >= start) {
+ now -= start;
+ ulong waited = static_cast<ulong>(now / 1000000);
+ if (waited >= threshold) {
+ ib::fatal() << dict_sys.fatal_msg;
+ }
+
+ if (waited == threshold / 4
+ || waited == threshold / 2
+ || waited == threshold / 4 * 3) {
+ ib::warn() << "Long wait (" << waited
+ << " seconds) for dict_sys.latch";
+ }
}
- } else {
- fatal_cnt = 0;
- old_waiter = waiter;
- old_sema = sema;
}
srv_monitor();
@@ -1333,7 +1281,6 @@ bool srv_any_background_activity()
static void purge_worker_callback(void*);
static void purge_coordinator_callback(void*);
-static void purge_coordinator_timer_callback(void*);
static tpool::task_group purge_task_group;
tpool::waitable_task purge_worker_task(purge_worker_callback, nullptr,
@@ -1351,7 +1298,7 @@ srv_wake_purge_thread_if_not_active()
ut_ad(!srv_read_only_mode);
if (purge_sys.enabled() && !purge_sys.paused()
- && trx_sys.rseg_history_len) {
+ && trx_sys.history_exists()) {
if(++purge_state.m_running == 1) {
srv_thread_pool->submit_task(&purge_coordinator_task);
}
@@ -1364,24 +1311,41 @@ bool purge_sys_t::running() const
return purge_coordinator_task.is_running();
}
+/** Suspend purge in data dictionary tables */
+void purge_sys_t::stop_SYS()
+{
+ latch.rd_lock(SRW_LOCK_CALL);
+ ++m_SYS_paused;
+ latch.rd_unlock();
+}
+
/** Stop purge during FLUSH TABLES FOR EXPORT */
void purge_sys_t::stop()
{
- rw_lock_x_lock(&latch);
-
- if (!enabled())
+ for (;;)
{
- /* Shutdown must have been initiated during FLUSH TABLES FOR EXPORT. */
- ut_ad(!srv_undo_sources);
- rw_lock_x_unlock(&latch);
- return;
- }
+ latch.wr_lock(SRW_LOCK_CALL);
- ut_ad(srv_n_purge_threads > 0);
+ if (!enabled())
+ {
+ /* Shutdown must have been initiated during FLUSH TABLES FOR EXPORT. */
+ ut_ad(!srv_undo_sources);
+ latch.wr_unlock();
+ return;
+ }
+
+ ut_ad(srv_n_purge_threads > 0);
+
+ if (!must_wait_SYS())
+ break;
+
+ latch.wr_unlock();
+ std::this_thread::sleep_for(std::chrono::seconds(1));
+ }
const auto paused= m_paused++;
- rw_lock_x_unlock(&latch);
+ latch.wr_unlock();
if (!paused)
{
@@ -1391,6 +1355,14 @@ void purge_sys_t::stop()
}
}
+/** Resume purge in data dictionary tables */
+void purge_sys_t::resume_SYS(void *)
+{
+ ut_d(const auto s=)
+ purge_sys.m_SYS_paused--;
+ ut_ad(s);
+}
+
/** Resume purge at UNLOCK TABLES after FLUSH TABLES FOR EXPORT */
void purge_sys_t::resume()
{
@@ -1402,9 +1374,8 @@ void purge_sys_t::resume()
}
ut_ad(!srv_read_only_mode);
ut_ad(srv_force_recovery < SRV_FORCE_NO_BACKGROUND);
- ut_ad(!sync_check_iterate(sync_check()));
purge_coordinator_task.enable();
- rw_lock_x_lock(&latch);
+ latch.wr_lock(SRW_LOCK_CALL);
int32_t paused= m_paused--;
ut_a(paused);
@@ -1415,7 +1386,7 @@ void purge_sys_t::resume()
srv_wake_purge_thread_if_not_active();
MONITOR_ATOMIC_INC(MONITOR_PURGE_RESUME_COUNT);
}
- rw_lock_x_unlock(&latch);
+ latch.wr_unlock();
}
/*******************************************************************//**
@@ -1448,10 +1419,7 @@ The master thread is tasked to ensure that flush of log file happens
once every second in the background. This is to ensure that not more
than one second of trxs are lost in case of crash when
innodb_flush_logs_at_trx_commit != 1 */
-static
-void
-srv_sync_log_buffer_in_background(void)
-/*===================================*/
+static void srv_sync_log_buffer_in_background()
{
time_t current_time = time(NULL);
@@ -1464,50 +1432,16 @@ srv_sync_log_buffer_in_background(void)
}
}
-/********************************************************************//**
-Make room in the table cache by evicting an unused table.
-@return number of tables evicted. */
-static
-ulint
-srv_master_evict_from_table_cache(
-/*==============================*/
- ulint pct_check) /*!< in: max percent to check */
-{
- ulint n_tables_evicted = 0;
-
- dict_sys_lock();
-
- n_tables_evicted = dict_make_room_in_cache(
- innobase_get_table_cache_size(), pct_check);
-
- dict_sys_unlock();
-
- return(n_tables_evicted);
-}
-
/** Report progress during shutdown.
@param last time of last output
-@param n_drop number of tables to be dropped
@param n_read number of page reads initiated for change buffer merge */
-static void srv_shutdown_print(time_t &last, ulint n_drop, ulint n_read)
+static void srv_shutdown_print(time_t &last, ulint n_read)
{
time_t now= time(nullptr);
if (now - last >= 15)
{
last= now;
- if (n_drop)
- {
- sql_print_information("InnoDB: Waiting for %zu table(s) to be dropped",
- n_drop);
-#if defined HAVE_SYSTEMD && !defined EMBEDDED_LIBRARY
- service_manager_extend_timeout(INNODB_EXTEND_TIMEOUT_INTERVAL,
- "InnoDB: Waiting for %zu table(s)"
- " to be dropped", n_drop);
-#endif
- return;
- }
-
const ulint ibuf_size= ibuf.size;
sql_print_information("Completing change buffer merge;"
" %zu page reads initiated;"
@@ -1523,124 +1457,41 @@ static void srv_shutdown_print(time_t &last, ulint n_drop, ulint n_read)
}
}
-/*********************************************************************//**
-Perform the tasks that the master thread is supposed to do when the
-server is active. There are two types of tasks. The first category is
-of such tasks which are performed at each inovcation of this function.
-We assume that this function is called roughly every second when the
-server is active. The second category is of such tasks which are
-performed at some interval e.g.: purge, dict_LRU cleanup etc. */
-static
-void
-srv_master_do_active_tasks(void)
-/*============================*/
+/** Perform periodic tasks whenever the server is active.
+@param counter_time microsecond_interval_timer() */
+static void srv_master_do_active_tasks(ulonglong counter_time)
{
- time_t cur_time = time(NULL);
- ulonglong counter_time = microsecond_interval_timer();
-
- /* First do the tasks that we are suppose to do at each
- invocation of this function. */
-
++srv_main_active_loops;
MONITOR_INC(MONITOR_MASTER_ACTIVE_LOOPS);
- /* ALTER TABLE in MySQL requires on Unix that the table handler
- can drop tables lazily after there no longer are SELECT
- queries to them. */
- srv_main_thread_op_info = "doing background drop tables";
- row_drop_tables_for_mysql_in_background();
- MONITOR_INC_TIME_IN_MICRO_SECS(
- MONITOR_SRV_BACKGROUND_DROP_TABLE_MICROSECOND, counter_time);
-
- if (srv_shutdown_state > SRV_SHUTDOWN_INITIATED) {
- return;
- }
-
- /* make sure that there is enough reusable space in the redo
- log files */
- srv_main_thread_op_info = "checking free log space";
- log_free_check();
-
- /* Flush logs if needed */
- srv_main_thread_op_info = "flushing log";
- srv_sync_log_buffer_in_background();
- MONITOR_INC_TIME_IN_MICRO_SECS(
- MONITOR_SRV_LOG_FLUSH_MICROSECOND, counter_time);
-
- /* Now see if various tasks that are performed at defined
- intervals need to be performed. */
-
- if (srv_shutdown_state > SRV_SHUTDOWN_INITIATED) {
- return;
- }
-
- if (!(cur_time % 47)) {
+ if (!(counter_time % (47 * 1000000ULL))) {
srv_main_thread_op_info = "enforcing dict cache limit";
- ulint n_evicted = srv_master_evict_from_table_cache(50);
- if (n_evicted != 0) {
+ if (ulint n_evicted = dict_sys.evict_table_LRU(true)) {
MONITOR_INC_VALUE(
- MONITOR_SRV_DICT_LRU_EVICT_COUNT_ACTIVE, n_evicted);
+ MONITOR_SRV_DICT_LRU_EVICT_COUNT_ACTIVE,
+ n_evicted);
}
MONITOR_INC_TIME_IN_MICRO_SECS(
MONITOR_SRV_DICT_LRU_MICROSECOND, counter_time);
}
}
-/*********************************************************************//**
-Perform the tasks that the master thread is supposed to do whenever the
-server is idle. We do check for the server state during this function
-and if the server has entered the shutdown phase we may return from
-the function without completing the required tasks.
-Note that the server can move to active state when we are executing this
-function but we don't check for that as we are suppose to perform more
-or less same tasks when server is active. */
-static
-void
-srv_master_do_idle_tasks(void)
-/*==========================*/
+/** Perform periodic tasks whenever the server is idle.
+@param counter_time microsecond_interval_timer() */
+static void srv_master_do_idle_tasks(ulonglong counter_time)
{
++srv_main_idle_loops;
MONITOR_INC(MONITOR_MASTER_IDLE_LOOPS);
-
- /* ALTER TABLE in MySQL requires on Unix that the table handler
- can drop tables lazily after there no longer are SELECT
- queries to them. */
- ulonglong counter_time = microsecond_interval_timer();
- srv_main_thread_op_info = "doing background drop tables";
- row_drop_tables_for_mysql_in_background();
- MONITOR_INC_TIME_IN_MICRO_SECS(
- MONITOR_SRV_BACKGROUND_DROP_TABLE_MICROSECOND,
- counter_time);
-
- if (srv_shutdown_state > SRV_SHUTDOWN_INITIATED) {
- return;
- }
-
- /* make sure that there is enough reusable space in the redo
- log files */
- srv_main_thread_op_info = "checking free log space";
- log_free_check();
-
- if (srv_shutdown_state > SRV_SHUTDOWN_INITIATED) {
- return;
- }
-
srv_main_thread_op_info = "enforcing dict cache limit";
- ulint n_evicted = srv_master_evict_from_table_cache(100);
- if (n_evicted != 0) {
+ if (ulint n_evicted = dict_sys.evict_table_LRU(false)) {
MONITOR_INC_VALUE(
MONITOR_SRV_DICT_LRU_EVICT_COUNT_IDLE, n_evicted);
}
MONITOR_INC_TIME_IN_MICRO_SECS(
MONITOR_SRV_DICT_LRU_MICROSECOND, counter_time);
-
- /* Flush logs if needed */
- srv_sync_log_buffer_in_background();
- MONITOR_INC_TIME_IN_MICRO_SECS(
- MONITOR_SRV_LOG_FLUSH_MICROSECOND, counter_time);
}
/**
@@ -1649,7 +1500,6 @@ and optionally change buffer merge (on innodb_fast_shutdown=0). */
void srv_shutdown(bool ibuf_merge)
{
ulint n_read = 0;
- ulint n_tables_to_drop;
time_t now = time(NULL);
do {
@@ -1657,11 +1507,6 @@ void srv_shutdown(bool ibuf_merge)
ut_ad(srv_shutdown_state == SRV_SHUTDOWN_CLEANUP);
++srv_main_shutdown_loops;
- /* FIXME: Remove the background DROP TABLE queue; it is not
- crash-safe and breaks ACID. */
- srv_main_thread_op_info = "doing background drop tables";
- n_tables_to_drop = row_drop_tables_for_mysql_in_background();
-
if (ibuf_merge) {
srv_main_thread_op_info = "doing insert buffer merge";
/* Disallow the use of change buffer to
@@ -1670,29 +1515,32 @@ void srv_shutdown(bool ibuf_merge)
ibuf_max_size_update(0);
log_free_check();
n_read = ibuf_contract();
+ srv_shutdown_print(now, n_read);
}
-
- if (n_tables_to_drop || ibuf_merge) {
- srv_shutdown_print(now, n_tables_to_drop, n_read);
- }
- } while (n_read || n_tables_to_drop);
+ } while (n_read);
}
/** The periodic master task controlling the server. */
void srv_master_callback(void*)
{
- static ulint old_activity_count;
+ static ulint old_activity_count;
- ut_a(srv_shutdown_state <= SRV_SHUTDOWN_INITIATED);
+ ut_a(srv_shutdown_state <= SRV_SHUTDOWN_INITIATED);
- srv_main_thread_op_info = "";
- MONITOR_INC(MONITOR_MASTER_THREAD_SLEEP);
- if (srv_check_activity(&old_activity_count)) {
- srv_master_do_active_tasks();
- } else {
- srv_master_do_idle_tasks();
- }
- srv_main_thread_op_info = "sleeping";
+ MONITOR_INC(MONITOR_MASTER_THREAD_SLEEP);
+ if (!purge_state.m_running)
+ srv_wake_purge_thread_if_not_active();
+ ulonglong counter_time= microsecond_interval_timer();
+ srv_sync_log_buffer_in_background();
+ MONITOR_INC_TIME_IN_MICRO_SECS(MONITOR_SRV_LOG_FLUSH_MICROSECOND,
+ counter_time);
+
+ if (srv_check_activity(&old_activity_count))
+ srv_master_do_active_tasks(counter_time);
+ else
+ srv_master_do_idle_tasks(counter_time);
+
+ srv_main_thread_op_info= "sleeping";
}
/** @return whether purge should exit due to shutdown */
@@ -1707,7 +1555,8 @@ static bool srv_purge_should_exit()
return true;
/* Slow shutdown was requested. */
- if (const size_t history_size= trx_sys.rseg_history_len)
+ const size_t history_size= trx_sys.history_size();
+ if (history_size)
{
static time_t progress_time;
time_t now= time(NULL);
@@ -1718,7 +1567,7 @@ static bool srv_purge_should_exit()
service_manager_extend_timeout(INNODB_EXTEND_TIMEOUT_INTERVAL,
"InnoDB: to purge %zu transactions",
history_size);
- ib::info() << "to purge " << history_size << " transactions";
+ sql_print_information("InnoDB: to purge %zu transactions", history_size);
#endif
}
return false;
@@ -1736,18 +1585,18 @@ static bool srv_task_execute()
ut_ad(!srv_read_only_mode);
ut_ad(srv_force_recovery < SRV_FORCE_NO_BACKGROUND);
- mutex_enter(&srv_sys.tasks_mutex);
+ mysql_mutex_lock(&srv_sys.tasks_mutex);
if (que_thr_t* thr = UT_LIST_GET_FIRST(srv_sys.tasks)) {
ut_a(que_node_get_type(thr->child) == QUE_NODE_PURGE);
UT_LIST_REMOVE(srv_sys.tasks, thr);
- mutex_exit(&srv_sys.tasks_mutex);
+ mysql_mutex_unlock(&srv_sys.tasks_mutex);
que_run_threads(thr);
return true;
}
ut_ad(UT_LIST_GET_LEN(srv_sys.tasks) == 0);
- mutex_exit(&srv_sys.tasks_mutex);
+ mysql_mutex_unlock(&srv_sys.tasks_mutex);
return false;
}
@@ -1764,89 +1613,175 @@ void srv_update_purge_thread_count(uint n)
Atomic_counter<int> srv_purge_thread_count_changed;
-/** Do the actual purge operation.
-@param[in,out] n_total_purged total number of purged pages
-@return length of history list before the last purge batch. */
-static size_t srv_do_purge(ulint* n_total_purged)
+inline void purge_coordinator_state::do_purge()
{
- ulint n_pages_purged;
+ ut_ad(!srv_read_only_mode);
+ lazy_init();
+ ut_ad(n_threads);
+ bool wakeup= false;
- static ulint count = 0;
- static ulint n_use_threads = 0;
- static size_t rseg_history_len = 0;
- ulint old_activity_count = srv_get_activity_count();
- static ulint n_threads = srv_n_purge_threads;
+ purge_coordinator_timer->disarm();
- ut_a(n_threads > 0);
- ut_ad(!srv_read_only_mode);
+ while (purge_sys.enabled() && !purge_sys.paused())
+ {
+loop:
+ wakeup= false;
+ const auto now= my_interval_timer();
+ const auto sigcount= m_running;
+
+ if (now - start_time >= 1000000)
+ {
+ refresh(false);
+ start_time= now;
+ }
- /* Purge until there are no more records to purge and there is
- no change in configuration or server state. If the user has
- configured more than one purge thread then we treat that as a
- pool of threads and only use the extra threads if purge can't
- keep up with updates. */
+ const auto old_activity_count= srv_sys.activity_count;
+ const auto history_size= trx_sys.history_size();
- if (n_use_threads == 0) {
- n_use_threads = n_threads;
- }
+ if (UNIV_UNLIKELY(srv_purge_thread_count_changed))
+ {
+ /* Read the fresh value of srv_n_purge_threads, reset
+ the changed flag. Both are protected by purge_thread_count_mtx.
- do {
- if (UNIV_UNLIKELY(srv_purge_thread_count_changed)) {
- /* Read the fresh value of srv_n_purge_threads, reset
- the changed flag. Both variables are protected by
- purge_thread_count_mtx.
-
- This code does not run concurrently, it is executed
- by a single purge_coordinator thread, and no races
- involving srv_purge_thread_count_changed are possible.
- */
-
- std::lock_guard<std::mutex> lk(purge_thread_count_mtx);
- n_threads = n_use_threads = srv_n_purge_threads;
- srv_purge_thread_count_changed = 0;
- } else if (trx_sys.rseg_history_len > rseg_history_len
- || (srv_max_purge_lag > 0
- && rseg_history_len > srv_max_purge_lag)) {
-
- /* History length is now longer than what it was
- when we took the last snapshot. Use more threads. */
-
- if (n_use_threads < n_threads) {
- ++n_use_threads;
- }
+ This code does not run concurrently, it is executed
+ by a single purge_coordinator thread, and no races
+ involving srv_purge_thread_count_changed are possible. */
+ {
+ std::lock_guard<std::mutex> lk(purge_thread_count_mtx);
+ n_threads= n_use_threads= srv_n_purge_threads;
+ srv_purge_thread_count_changed= 0;
+ }
+ refresh(true);
+ start_time= now;
+ }
+ else if (history_size > m_history_length)
+ {
+ /* dynamically adjust purge thread count based on redo log fill factor */
+ if (n_use_threads < n_threads && lsn_age_factor < lsn_lwm)
+ {
+more_threads:
+ ++n_use_threads;
+ lsn_hwm= lsn_lwm;
+ lsn_lwm-= series[n_use_threads];
+ }
+ else if (n_use_threads > 1 && lsn_age_factor >= lsn_hwm)
+ {
+fewer_threads:
+ --n_use_threads;
+ lsn_lwm= lsn_hwm;
+ lsn_hwm+= series[n_use_threads];
+ }
+ else if (n_use_threads == 1 && lsn_age_factor >= 100 - safety_net)
+ {
+ wakeup= true;
+ break;
+ }
+ }
+ else if (n_threads > n_use_threads &&
+ srv_max_purge_lag && m_history_length > srv_max_purge_lag)
+ goto more_threads;
+ else if (n_use_threads > 1 && old_activity_count == srv_sys.activity_count)
+ goto fewer_threads;
+
+ ut_ad(n_use_threads);
+ ut_ad(n_use_threads <= n_threads);
+
+ m_history_length= history_size;
+
+ if (history_size &&
+ trx_purge(n_use_threads,
+ !(++count % srv_purge_rseg_truncate_frequency) ||
+ purge_sys.truncate.current ||
+ (srv_shutdown_state != SRV_SHUTDOWN_NONE &&
+ srv_fast_shutdown == 0)))
+ continue;
+
+ if (m_running == sigcount)
+ {
+ /* Purge was not woken up by srv_wake_purge_thread_if_not_active() */
+
+ /* The magic number 5000 is an approximation for the case where we have
+ cached undo log records which prevent truncate of rollback segments. */
+ wakeup= history_size &&
+ (history_size >= 5000 ||
+ history_size != trx_sys.history_size_approx());
+ break;
+ }
+ else if (!trx_sys.history_exists())
+ break;
- } else if (srv_check_activity(&old_activity_count)
- && n_use_threads > 1) {
+ if (!srv_purge_should_exit())
+ goto loop;
+ }
- /* History length same or smaller since last snapshot,
- use fewer threads. */
+ if (wakeup)
+ purge_coordinator_timer->set_time(10, 0);
- --n_use_threads;
- }
+ m_running= 0;
+}
- /* Ensure that the purge threads are less than what
- was configured. */
+inline void purge_coordinator_state::compute_series()
+{
+ ulint points= n_threads;
+ memset(series, 0, sizeof series);
+ constexpr ulint spread= 100 - adaptive_purge_threshold - safety_net;
- ut_a(n_use_threads > 0);
- ut_a(n_use_threads <= n_threads);
+ /* We distribute spread across n_threads,
+ e.g.: spread of 60 is distributed across n_threads=4 as: 6+12+18+24 */
- /* Take a snapshot of the history list before purge. */
- if (!(rseg_history_len = trx_sys.rseg_history_len)) {
- break;
- }
+ const ulint additional_points= (points * (points + 1)) / 2;
+ if (spread % additional_points == 0)
+ {
+ /* Arithmetic progression is possible. */
+ const ulint delta= spread / additional_points;
+ ulint growth= delta;
+ do
+ {
+ series[points--]= growth;
+ growth += delta;
+ }
+ while (points);
+ return;
+ }
- n_pages_purged = trx_purge(
- n_use_threads,
- !(++count % srv_purge_rseg_truncate_frequency)
- || purge_sys.truncate.current
- || (srv_shutdown_state != SRV_SHUTDOWN_NONE
- && srv_fast_shutdown == 0));
+ /* Use average distribution to spread across the points */
+ const ulint delta= spread / points;
+ ulint total= 0;
+ do
+ {
+ series[points--]= delta;
+ total+= delta;
+ }
+ while (points);
- *n_total_purged += n_pages_purged;
- } while (n_pages_purged > 0 && !purge_sys.paused()
- && !srv_purge_should_exit());
+ for (points= 1; points <= n_threads && total++ < spread; )
+ series[points++]++;
+}
+
+inline void purge_coordinator_state::lazy_init()
+{
+ if (n_threads)
+ return;
+ n_threads= n_use_threads= srv_n_purge_threads;
+ refresh(true);
+ start_time= my_interval_timer();
+}
+
+void purge_coordinator_state::refresh(bool full)
+{
+ if (full)
+ {
+ compute_series();
+ lsn_lwm= adaptive_purge_threshold;
+ lsn_hwm= adaptive_purge_threshold + series[n_threads];
+ }
- return(rseg_history_len);
+ mysql_mutex_lock(&log_sys.mutex);
+ const lsn_t last= log_sys.last_checkpoint_lsn,
+ max_age= log_sys.max_checkpoint_age;
+ mysql_mutex_unlock(&log_sys.mutex);
+
+ lsn_age_factor= ulint(((log_sys.get_lsn() - last) * 100) / max_age);
}
@@ -1892,27 +1827,6 @@ static void release_thd(THD *thd, void *ctx)
set_current_thd(0);
}
-
-/*
- Called by timer when purge coordinator decides
- to delay processing of purge records.
-*/
-static void purge_coordinator_timer_callback(void *)
-{
- if (!purge_sys.enabled() || purge_sys.paused() ||
- purge_state.m_running || !trx_sys.rseg_history_len)
- return;
-
- if (purge_state.m_history_length < 5000 &&
- purge_state.m_history_length == trx_sys.rseg_history_len)
- /* No new records were added since wait started.
- Simply wait for new records. The magic number 5000 is an
- approximation for the case where we have cached UNDO
- log records which prevent truncate of the UNDO segments.*/
- return;
- srv_wake_purge_thread_if_not_active();
-}
-
static void purge_worker_callback(void*)
{
ut_ad(!current_thd);
@@ -1925,62 +1839,24 @@ static void purge_worker_callback(void*)
release_thd(thd,ctx);
}
-static void purge_coordinator_callback_low()
-{
- ulint n_total_purged= ULINT_UNDEFINED;
- purge_state.m_history_length= 0;
-
- if (!purge_sys.enabled() || purge_sys.paused())
- return;
- do
- {
- n_total_purged = 0;
- int sigcount= purge_state.m_running;
-
- purge_state.m_history_length= srv_do_purge(&n_total_purged);
-
- /* Check if purge was woken by srv_wake_purge_thread_if_not_active() */
-
- bool woken_during_purge= purge_state.m_running > sigcount;
-
- /* If last purge batch processed less than 1 page and there is
- still work to do, delay the next batch by 10ms. Unless
- someone added work and woke us up. */
- if (n_total_purged == 0)
- {
- if (trx_sys.rseg_history_len == 0)
- return;
- if (!woken_during_purge)
- {
- /* Delay next purge round*/
- purge_coordinator_timer->set_time(10, 0);
- return;
- }
- }
- }
- while ((purge_sys.enabled() && !purge_sys.paused()) ||
- !srv_purge_should_exit());
-}
-
static void purge_coordinator_callback(void*)
{
void *ctx;
THD *thd= acquire_thd(&ctx);
- purge_coordinator_callback_low();
- release_thd(thd,ctx);
- purge_state.m_running= 0;
+ purge_state.do_purge();
+ release_thd(thd, ctx);
}
void srv_init_purge_tasks()
{
purge_create_background_thds(srv_n_purge_threads);
purge_coordinator_timer= srv_thread_pool->create_timer
- (purge_coordinator_timer_callback, nullptr);
+ (purge_coordinator_callback, nullptr);
}
static void srv_shutdown_purge_tasks()
{
- purge_coordinator_task.wait();
+ purge_coordinator_task.disable();
delete purge_coordinator_timer;
purge_coordinator_timer= nullptr;
purge_worker_task.wait();
@@ -2002,11 +1878,11 @@ srv_que_task_enqueue_low(
que_thr_t* thr) /*!< in: query thread */
{
ut_ad(!srv_read_only_mode);
- mutex_enter(&srv_sys.tasks_mutex);
+ mysql_mutex_lock(&srv_sys.tasks_mutex);
UT_LIST_ADD_LAST(srv_sys.tasks, thr);
- mutex_exit(&srv_sys.tasks_mutex);
+ mysql_mutex_unlock(&srv_sys.tasks_mutex);
}
#ifdef UNIV_DEBUG
@@ -2017,11 +1893,11 @@ ulint srv_get_task_queue_length()
ut_ad(!srv_read_only_mode);
- mutex_enter(&srv_sys.tasks_mutex);
+ mysql_mutex_lock(&srv_sys.tasks_mutex);
n_tasks = UT_LIST_GET_LEN(srv_sys.tasks);
- mutex_exit(&srv_sys.tasks_mutex);
+ mysql_mutex_unlock(&srv_sys.tasks_mutex);
return(n_tasks);
}
diff --git a/storage/innobase/srv/srv0start.cc b/storage/innobase/srv/srv0start.cc
index a507b29ffa2..a1368c5146c 100644
--- a/storage/innobase/srv/srv0start.cc
+++ b/storage/innobase/srv/srv0start.cc
@@ -55,7 +55,6 @@ Created 2/16/1996 Heikki Tuuri
#include "buf0dblwr.h"
#include "buf0dump.h"
#include "os0file.h"
-#include "os0thread.h"
#include "fil0fil.h"
#include "fil0crypt.h"
#include "fsp0fsp.h"
@@ -96,12 +95,11 @@ Created 2/16/1996 Heikki Tuuri
#include "row0row.h"
#include "row0mysql.h"
#include "btr0pcur.h"
-#include "os0event.h"
#include "zlib.h"
#include "ut0crc32.h"
/** We are prepared for a situation that we have this many threads waiting for
-a semaphore inside InnoDB. srv_start() sets the value. */
+a transactional lock inside InnoDB. srv_start() sets the value. */
ulint srv_max_n_threads;
/** Log sequence number at shutdown */
@@ -118,8 +116,6 @@ incomplete transactions */
bool srv_startup_is_before_trx_rollback_phase;
/** TRUE if the server is being started */
bool srv_is_being_started;
-/** TRUE if SYS_TABLESPACES is available for lookups */
-bool srv_sys_tablespaces_open;
/** TRUE if the server was successfully started */
bool srv_was_started;
/** The original value of srv_log_file_size (innodb_log_file_size) */
@@ -128,13 +124,13 @@ static ulonglong srv_log_file_size_requested;
static bool srv_start_has_been_called;
/** Whether any undo log records can be generated */
-UNIV_INTERN bool srv_undo_sources;
+bool srv_undo_sources;
#ifdef UNIV_DEBUG
/** InnoDB system tablespace to set during recovery */
-UNIV_INTERN uint srv_sys_space_size_debug;
+uint srv_sys_space_size_debug;
/** whether redo log file have been created at startup */
-UNIV_INTERN bool srv_log_file_created;
+bool srv_log_file_created;
#endif /* UNIV_DEBUG */
/** whether some background threads that create redo log have been started */
@@ -253,7 +249,11 @@ static dberr_t create_log_file(bool create_new_db, lsn_t lsn,
}
DBUG_PRINT("ib_log", ("After innodb_log_abort_6"));
- DBUG_ASSERT(!buf_pool.any_io_pending());
+ ut_ad(!os_aio_pending_reads());
+ ut_ad(!os_aio_pending_writes());
+ ut_d(mysql_mutex_lock(&buf_pool.flush_list_mutex));
+ ut_ad(!buf_pool.get_oldest_modification(0));
+ ut_d(mysql_mutex_unlock(&buf_pool.flush_list_mutex));
DBUG_EXECUTE_IF("innodb_log_abort_7", return DB_ERROR;);
DBUG_PRINT("ib_log", ("After innodb_log_abort_7"));
@@ -487,15 +487,12 @@ static ulint trx_rseg_get_n_undo_tablespaces()
static ulint srv_undo_tablespace_open(bool create, const char* name, ulint i)
{
bool success;
- char undo_name[sizeof "innodb_undo000"];
ulint space_id= 0;
ulint fsp_flags= 0;
if (create)
{
space_id= srv_undo_space_id_start + i;
- snprintf(undo_name, sizeof(undo_name),
- "innodb_undo%03u", static_cast<unsigned>(space_id));
switch (srv_checksum_algorithm) {
case SRV_CHECKSUM_ALGORITHM_FULL_CRC32:
case SRV_CHECKSUM_ALGORITHM_STRICT_FULL_CRC32:
@@ -522,7 +519,8 @@ static ulint srv_undo_tablespace_open(bool create, const char* name, ulint i)
{
page_t *page= static_cast<byte*>(aligned_malloc(srv_page_size,
srv_page_size));
- dberr_t err= os_file_read(IORequestRead, fh, page, 0, srv_page_size);
+ dberr_t err= os_file_read(IORequestRead, fh, page, 0, srv_page_size,
+ nullptr);
if (err != DB_SUCCESS)
{
err_exit:
@@ -550,7 +548,6 @@ err_exit:
}
space_id= id;
- snprintf(undo_name, sizeof undo_name, "innodb_undo%03u", id);
aligned_free(page);
}
@@ -562,14 +559,14 @@ err_exit:
fil_set_max_space_id_if_bigger(space_id);
- fil_space_t *space= fil_space_t::create(undo_name, space_id, fsp_flags,
- FIL_TYPE_TABLESPACE, NULL,
+ fil_space_t *space= fil_space_t::create(space_id, fsp_flags,
+ FIL_TYPE_TABLESPACE, nullptr,
FIL_ENCRYPTION_DEFAULT, true);
ut_a(fil_validate());
ut_a(space);
fil_node_t *file= space->add(name, fh, 0, false, true);
- mutex_enter(&fil_system.mutex);
+ mysql_mutex_lock(&fil_system.mutex);
if (create)
{
@@ -584,7 +581,7 @@ err_exit:
fil_system.n_open--;
}
- mutex_exit(&fil_system.mutex);
+ mysql_mutex_unlock(&fil_system.mutex);
return space_id;
}
@@ -603,11 +600,7 @@ srv_check_undo_redo_logs_exists()
/* Check if any undo tablespaces exist */
for (ulint i = 1; i <= srv_undo_tablespaces; ++i) {
- snprintf(
- name, sizeof(name),
- "%s%cundo%03zu",
- srv_undo_dir, OS_PATH_SEPARATOR,
- i);
+ snprintf(name, sizeof name, "%s/undo%03zu", srv_undo_dir, i);
fh = os_file_create(
innodb_data_file_key, name,
@@ -665,8 +658,7 @@ static dberr_t srv_all_undo_tablespaces_open(bool create_new_db, ulint n_undo)
for (ulint i= 0; i < n_undo; ++i)
{
char name[OS_FILE_MAX_PATH];
- snprintf(name, sizeof name, "%s%cundo%03zu", srv_undo_dir,
- OS_PATH_SEPARATOR, i + 1);
+ snprintf(name, sizeof name, "%s/undo%03zu", srv_undo_dir, i + 1);
ulint space_id= srv_undo_tablespace_open(create_new_db, name, i);
if (!space_id)
{
@@ -696,8 +688,7 @@ static dberr_t srv_all_undo_tablespaces_open(bool create_new_db, ulint n_undo)
++i)
{
char name[OS_FILE_MAX_PATH];
- snprintf(name, sizeof(name),
- "%s%cundo%03zu", srv_undo_dir, OS_PATH_SEPARATOR, i);
+ snprintf(name, sizeof name, "%s/undo%03zu", srv_undo_dir, i);
if (!srv_undo_tablespace_open(create_new_db, name, i))
break;
++srv_undo_tablespaces_open;
@@ -731,8 +722,7 @@ srv_undo_tablespaces_init(bool create_new_db)
for (ulint i= 0; i < srv_undo_tablespaces; ++i)
{
char name[OS_FILE_MAX_PATH];
- snprintf(name, sizeof name, "%s%cundo%03zu",
- srv_undo_dir, OS_PATH_SEPARATOR, i + 1);
+ snprintf(name, sizeof name, "%s/undo%03zu", srv_undo_dir, i + 1);
if (dberr_t err= srv_undo_tablespace_create(name))
{
ib::error() << "Could not create undo tablespace '" << name << "'.";
@@ -764,10 +754,12 @@ srv_undo_tablespaces_init(bool create_new_db)
mtr_t mtr;
for (ulint i= 0; i < srv_undo_tablespaces; ++i)
{
- mtr.start();
- fsp_header_init(fil_space_get(srv_undo_space_id_start + i),
- SRV_UNDO_TABLESPACE_SIZE_IN_PAGES, &mtr);
- mtr.commit();
+ mtr.start();
+ dberr_t err= fsp_header_init(fil_space_get(srv_undo_space_id_start + i),
+ SRV_UNDO_TABLESPACE_SIZE_IN_PAGES, &mtr);
+ mtr.commit();
+ if (err)
+ return err;
}
}
@@ -813,10 +805,13 @@ srv_open_tmp_tablespace(bool create_new_db)
mtr_t mtr;
mtr.start();
mtr.set_log_mode(MTR_LOG_NO_REDO);
- fsp_header_init(fil_system.temp_space,
- srv_tmp_space.get_sum_of_sizes(),
- &mtr);
+ err = fsp_header_init(fil_system.temp_space,
+ srv_tmp_space.get_sum_of_sizes(),
+ &mtr);
mtr.commit();
+ if (err == DB_SUCCESS) {
+ err = trx_temp_rseg_create(&mtr);
+ }
} else {
/* This file was just opened in the code above! */
ib::error() << "The innodb_temporary"
@@ -832,10 +827,8 @@ srv_open_tmp_tablespace(bool create_new_db)
static void srv_shutdown_threads()
{
ut_ad(!srv_undo_sources);
- srv_shutdown_state = SRV_SHUTDOWN_EXIT_THREADS;
-
- lock_sys.timeout_timer.reset();
srv_master_timer.reset();
+ srv_shutdown_state = SRV_SHUTDOWN_EXIT_THREADS;
if (purge_sys.enabled()) {
srv_purge_shutdown();
@@ -857,11 +850,6 @@ static void srv_shutdown_bg_undo_sources()
ut_ad(!srv_read_only_mode);
fts_optimize_shutdown();
dict_stats_shutdown();
- while (row_get_background_drop_list_len_low())
- {
- srv_inc_activity_count();
- os_thread_yield();
- }
srv_undo_sources= false;
}
}
@@ -983,7 +971,11 @@ same_size:
}
ut_ad(flushed_lsn == log_sys.get_lsn());
- ut_ad(!buf_pool.any_io_pending());
+ ut_ad(!os_aio_pending_reads());
+ ut_ad(!os_aio_pending_writes());
+ ut_d(mysql_mutex_lock(&buf_pool.flush_list_mutex));
+ ut_ad(!buf_pool.get_oldest_modification(0));
+ ut_d(mysql_mutex_unlock(&buf_pool.flush_list_mutex));
DBUG_RETURN(flushed_lsn);
}
@@ -1010,6 +1002,9 @@ static dberr_t find_and_check_log_file(bool &log_file_found)
if (is_operation_restore())
return DB_NOT_FOUND;
+ /* This might be the first start after a mariabackup
+ copy-back or move-back. */
+ srv_start_after_restore= true;
return DB_SUCCESS;
}
@@ -1040,7 +1035,9 @@ static dberr_t find_and_check_log_file(bool &log_file_found)
header, checkpoint page 1, empty, checkpoint page 2, redo log page(s).
Mariabackup --prepare would create an empty LOG_FILE_NAME. Tolerate it. */
- if (size != 0 && size <= OS_FILE_LOG_BLOCK_SIZE * 4)
+ if (size == 0)
+ srv_start_after_restore= true;
+ else if (size <= OS_FILE_LOG_BLOCK_SIZE * 4)
{
ib::error() << "Log file " << logfile0 << " size " << size
<< " is too small";
@@ -1052,6 +1049,11 @@ static dberr_t find_and_check_log_file(bool &log_file_found)
return DB_SUCCESS;
}
+static tpool::task_group rollback_all_recovered_group(1);
+static tpool::task rollback_all_recovered_task(trx_rollback_all_recovered,
+ nullptr,
+ &rollback_all_recovered_group);
+
/** Start InnoDB.
@param[in] create_new_db whether to create a new database
@return DB_SUCCESS or error code */
@@ -1076,7 +1078,7 @@ dberr_t srv_start(bool create_new_db)
}
high_level_read_only = srv_read_only_mode
- || srv_force_recovery > SRV_FORCE_NO_IBUF_MERGE
+ || srv_force_recovery >= SRV_FORCE_NO_UNDO_LOG_SCAN
|| srv_sys_space.created_new_raw();
srv_started_redo = false;
@@ -1091,8 +1093,6 @@ dberr_t srv_start(bool create_new_db)
ib::info() << "!!!!!!!! UNIV_IBUF_DEBUG switched on !!!!!!!!!";
#endif
- ib::info() << MUTEX_TYPE;
-
ib::info() << "Compressed tables use zlib " ZLIB_VERSION
#ifdef UNIV_ZIP_DEBUG
" with validation"
@@ -1110,7 +1110,7 @@ dberr_t srv_start(bool create_new_db)
if (srv_start_has_been_called) {
ib::error() << "Startup called second time"
" during the process lifetime."
- " In the MySQL Embedded Server Library"
+ " In the MariaDB Embedded Server Library"
" you cannot call server_init() more than"
" once during the process lifetime.";
}
@@ -1124,37 +1124,24 @@ dberr_t srv_start(bool create_new_db)
mysql_stage_register("innodb", srv_stages,
static_cast<int>(UT_ARR_SIZE(srv_stages)));
- /* Set the maximum number of threads which can wait for a semaphore
- inside InnoDB: this is the 'sync wait array' size */
-
- srv_max_n_threads = 1 /* io_ibuf_thread */
- + 1 /* io_log_thread */
- + 1 /* srv_print_monitor_task */
- + 1 /* srv_purge_coordinator_thread */
- + 1 /* buf_dump_thread */
- + 1 /* dict_stats_thread */
- + 1 /* fts_optimize_thread */
- + 1 /* trx_rollback_all_recovered */
- + 128 /* added as margin, for use of
- InnoDB Memcached etc. */
- + 1/* buf_flush_page_cleaner */
- + max_connections
- + srv_n_read_io_threads
- + srv_n_write_io_threads
- + srv_n_purge_threads
- /* FTS Parallel Sort */
- + fts_sort_pll_degree * FTS_NUM_AUX_INDEX
- * max_connections;
+ srv_max_n_threads =
+ 1 /* dict_stats_thread */
+ + 1 /* fts_optimize_thread */
+ + 128 /* safety margin */
+ + max_connections;
srv_boot();
ib::info() << my_crc32c_implementation();
if (!srv_read_only_mode) {
+ mysql_mutex_init(srv_monitor_file_mutex_key,
+ &srv_monitor_file_mutex, nullptr);
+ mysql_mutex_init(srv_misc_tmpfile_mutex_key,
+ &srv_misc_tmpfile_mutex, nullptr);
+ }
- mutex_create(LATCH_ID_SRV_MONITOR_FILE,
- &srv_monitor_file_mutex);
-
+ if (!srv_read_only_mode) {
if (srv_innodb_status) {
srv_monitor_file_name = static_cast<char*>(
@@ -1190,9 +1177,6 @@ dberr_t srv_start(bool create_new_db)
}
}
- mutex_create(LATCH_ID_SRV_MISC_TMPFILE,
- &srv_misc_tmpfile_mutex);
-
srv_misc_tmpfile = os_file_create_tmpfile();
if (!srv_misc_tmpfile && err == DB_SUCCESS) {
@@ -1220,6 +1204,11 @@ dberr_t srv_start(bool create_new_db)
ib::info() << "Using Linux native AIO";
}
#endif
+#ifdef HAVE_URING
+ if (srv_use_native_aio) {
+ ib::info() << "Using liburing";
+ }
+#endif
fil_system.create(srv_file_per_table ? 50000 : 5000);
@@ -1406,17 +1395,18 @@ file_checked:
ut_ad(fil_system.sys_space->id == 0);
compile_time_assert(TRX_SYS_SPACE == 0);
compile_time_assert(IBUF_SPACE_ID == 0);
- fsp_header_init(fil_system.sys_space,
- uint32_t(sum_of_new_sizes), &mtr);
+ ut_a(fsp_header_init(fil_system.sys_space,
+ uint32_t(sum_of_new_sizes), &mtr)
+ == DB_SUCCESS);
ulint ibuf_root = btr_create(
DICT_CLUSTERED | DICT_IBUF, fil_system.sys_space,
- DICT_IBUF_ID_MIN, nullptr, &mtr);
+ DICT_IBUF_ID_MIN, nullptr, &mtr, &err);
mtr_commit(&mtr);
if (ibuf_root == FIL_NULL) {
- return(srv_init_abort(DB_ERROR));
+ return srv_init_abort(err);
}
ut_ad(ibuf_root == IBUF_TREE_ROOT_PAGE_NO);
@@ -1425,8 +1415,7 @@ file_checked:
the first rollback segment before the double write buffer.
All the remaining rollback segments will be created later,
after the double write buffer has been created. */
- trx_sys_create_sys_pages();
- err = trx_lists_init_at_db_start();
+ err = trx_sys_create_sys_pages(&mtr);
if (err != DB_SUCCESS) {
return(srv_init_abort(err));
@@ -1489,6 +1478,9 @@ file_checked:
if (err != DB_SUCCESS) {
return srv_init_abort(err);
}
+ if (srv_operation != SRV_OPERATION_RESTORE) {
+ dict_sys.load_sys_tables();
+ }
err = trx_lists_init_at_db_start();
if (err != DB_SUCCESS) {
return srv_init_abort(err);
@@ -1496,6 +1488,7 @@ file_checked:
break;
case SRV_OPERATION_RESTORE_DELTA:
case SRV_OPERATION_BACKUP:
+ case SRV_OPERATION_BACKUP_NO_DEFER:
ut_ad("wrong mariabackup mode" == 0);
}
@@ -1506,8 +1499,8 @@ file_checked:
recv_sys.apply(true);
- if (recv_sys.found_corrupt_log
- || recv_sys.found_corrupt_fs) {
+ if (recv_sys.is_corrupt_log()
+ || recv_sys.is_corrupt_fs()) {
return(srv_init_abort(DB_CORRUPTION));
}
@@ -1531,28 +1524,27 @@ file_checked:
if (sum_of_new_sizes > 0) {
/* New data file(s) were added */
mtr.start();
- mtr.x_lock_space(fil_system.sys_space,
- __FILE__, __LINE__);
+ mtr.x_lock_space(fil_system.sys_space);
buf_block_t* block = buf_page_get(
page_id_t(0, 0), 0,
RW_SX_LATCH, &mtr);
+ /* The first page of the system tablespace
+ should already have been successfully
+ accessed earlier during startup. */
+ ut_a(block);
ulint size = mach_read_from_4(
FSP_HEADER_OFFSET + FSP_SIZE
- + block->frame);
+ + block->page.frame);
ut_ad(size == fil_system.sys_space
->size_in_header);
size += sum_of_new_sizes;
mtr.write<4>(*block,
FSP_HEADER_OFFSET + FSP_SIZE
- + block->frame, size);
+ + block->page.frame, size);
fil_system.sys_space->size_in_header
= uint32_t(size);
mtr.commit();
- /* Immediately write the log record about
- increased tablespace size to disk, so that it
- is durable even if mysqld would crash
- quickly */
- log_buffer_flush_to_disk();
+ log_write_up_to(mtr.commit_lsn(), true);
}
}
@@ -1562,7 +1554,7 @@ file_checked:
buf_block_t* block = buf_page_get(page_id_t(0, 0), 0,
RW_S_LATCH, &mtr);
ut_ad(mach_read_from_4(FSP_SIZE + FSP_HEADER_OFFSET
- + block->frame)
+ + block->page.frame)
== fil_system.sys_space->size_in_header);
mtr.commit();
}
@@ -1617,7 +1609,11 @@ file_checked:
ut_ad(srv_force_recovery <= SRV_FORCE_IGNORE_CORRUPT);
ut_ad(recv_no_log_write);
err = fil_write_flushed_lsn(log_sys.get_lsn());
- DBUG_ASSERT(!buf_pool.any_io_pending());
+ ut_ad(!os_aio_pending_reads());
+ ut_ad(!os_aio_pending_writes());
+ ut_d(mysql_mutex_lock(&buf_pool.flush_list_mutex));
+ ut_ad(!buf_pool.get_oldest_modification(0));
+ ut_d(mysql_mutex_unlock(&buf_pool.flush_list_mutex));
log_sys.log.close_file();
if (err == DB_SUCCESS) {
bool trunc = srv_operation
@@ -1661,7 +1657,11 @@ file_checked:
threads until creating a log checkpoint at the
end of create_log_file(). */
ut_d(recv_no_log_write = true);
- DBUG_ASSERT(!buf_pool.any_io_pending());
+ ut_ad(!os_aio_pending_reads());
+ ut_ad(!os_aio_pending_writes());
+ ut_d(mysql_mutex_lock(&buf_pool.flush_list_mutex));
+ ut_ad(!buf_pool.get_oldest_modification(0));
+ ut_d(mysql_mutex_unlock(&buf_pool.flush_list_mutex));
DBUG_EXECUTE_IF("innodb_log_abort_3",
return(srv_init_abort(DB_ERROR)););
@@ -1721,8 +1721,7 @@ file_checked:
/* Note: When creating the extra rollback segments during an upgrade
we violate the latching order, even if the change buffer is empty.
- We make an exception in sync0sync.cc and check srv_is_being_started
- for that violation. It cannot create a deadlock because we are still
+ It cannot create a deadlock because we are still
running in single threaded mode essentially. Only the IO threads
should be running at this stage. */
@@ -1732,7 +1731,7 @@ file_checked:
if (!create_new_db) {
ut_ad(high_level_read_only
- || srv_force_recovery <= SRV_FORCE_NO_IBUF_MERGE);
+ || srv_force_recovery < SRV_FORCE_NO_UNDO_LOG_SCAN);
/* Validate a few system page types that were left
uninitialized before MySQL or MariaDB 5.5. */
@@ -1746,6 +1745,11 @@ file_checked:
page_id_t(IBUF_SPACE_ID,
FSP_IBUF_HEADER_PAGE_NO),
0, RW_X_LATCH, &mtr);
+ if (UNIV_UNLIKELY(!block)) {
+ corrupted_old_page:
+ mtr.commit();
+ return srv_init_abort(DB_CORRUPTION);
+ }
fil_block_check_type(*block, FIL_PAGE_TYPE_SYS, &mtr);
/* Already MySQL 3.23.53 initialized
FSP_IBUF_TREE_ROOT_PAGE_NO to
@@ -1753,16 +1757,25 @@ file_checked:
block = buf_page_get(
page_id_t(TRX_SYS_SPACE, TRX_SYS_PAGE_NO),
0, RW_X_LATCH, &mtr);
+ if (UNIV_UNLIKELY(!block)) {
+ goto corrupted_old_page;
+ }
fil_block_check_type(*block, FIL_PAGE_TYPE_TRX_SYS,
&mtr);
block = buf_page_get(
page_id_t(TRX_SYS_SPACE,
FSP_FIRST_RSEG_PAGE_NO),
0, RW_X_LATCH, &mtr);
+ if (UNIV_UNLIKELY(!block)) {
+ goto corrupted_old_page;
+ }
fil_block_check_type(*block, FIL_PAGE_TYPE_SYS, &mtr);
block = buf_page_get(
page_id_t(TRX_SYS_SPACE, FSP_DICT_HDR_PAGE_NO),
0, RW_X_LATCH, &mtr);
+ if (UNIV_UNLIKELY(!block)) {
+ goto corrupted_old_page;
+ }
fil_block_check_type(*block, FIL_PAGE_TYPE_SYS, &mtr);
mtr.commit();
}
@@ -1773,7 +1786,7 @@ file_checked:
should guarantee that there is at most one data
dictionary transaction active at a time. */
if (!high_level_read_only
- && srv_force_recovery < SRV_FORCE_NO_TRX_UNDO) {
+ && srv_force_recovery <= SRV_FORCE_NO_TRX_UNDO) {
/* If the following call is ever removed, the
first-time ha_innobase::open() must hold (or
acquire and release) a table lock that
@@ -1787,17 +1800,7 @@ file_checked:
trx_rollback_recovered(false);
}
- /* FIXME: Skip the following if srv_read_only_mode,
- while avoiding "Allocated tablespace ID" warnings. */
- if (srv_force_recovery <= SRV_FORCE_NO_IBUF_MERGE) {
- /* Open or Create SYS_TABLESPACES and SYS_DATAFILES
- so that tablespace names and other metadata can be
- found. */
- err = dict_create_or_check_sys_tablespace();
- if (err != DB_SUCCESS) {
- return(srv_init_abort(err));
- }
-
+ if (srv_force_recovery < SRV_FORCE_NO_UNDO_LOG_SCAN) {
/* The following call is necessary for the insert
buffer to work with multiple tablespaces. We must
know the mapping between space id's and .ibd file
@@ -1819,30 +1822,15 @@ file_checked:
&& !srv_read_only_mode) {
/* Drop partially created indexes. */
row_merge_drop_temp_indexes();
- /* Drop garbage tables. */
- row_mysql_drop_garbage_tables();
-
- /* Drop any auxiliary tables that were not
- dropped when the parent table was
- dropped. This can happen if the parent table
- was dropped but the server crashed before the
- auxiliary tables were dropped. */
- fts_drop_orphaned_tables();
-
/* Rollback incomplete non-DDL transactions */
trx_rollback_is_active = true;
- os_thread_create(trx_rollback_all_recovered);
+ srv_thread_pool->submit_task(&rollback_all_recovered_task);
}
}
srv_startup_is_before_trx_rollback_phase = false;
if (!srv_read_only_mode) {
- /* timer task which watches the timeouts
- for lock waits */
- lock_sys.timeout_timer.reset(srv_thread_pool->create_timer(
- lock_wait_timeout_task));
-
DBUG_EXECUTE_IF("innodb_skip_monitors", goto skip_monitors;);
/* Create the task which warns of long semaphore waits */
srv_start_periodic_timer(srv_monitor_timer, srv_monitor_task,
@@ -1864,14 +1852,7 @@ skip_monitors:
}
}
- /* Create the SYS_FOREIGN and SYS_FOREIGN_COLS system tables */
- err = dict_create_or_check_foreign_constraint_tables();
- if (err == DB_SUCCESS) {
- err = dict_create_or_check_sys_tablespace();
- if (err == DB_SUCCESS) {
- err = dict_create_or_check_sys_virtual();
- }
- }
+ err = dict_sys.create_or_check_sys_tables();
switch (err) {
case DB_SUCCESS:
break;
@@ -1895,21 +1876,11 @@ skip_monitors:
return(srv_init_abort(err));
}
- trx_temp_rseg_create();
-
if (srv_force_recovery < SRV_FORCE_NO_BACKGROUND) {
srv_start_periodic_timer(srv_master_timer, srv_master_callback, 1000);
}
}
- if (!srv_read_only_mode
- && srv_operation <= SRV_OPERATION_EXPORT_RESTORED
- && srv_force_recovery < SRV_FORCE_NO_BACKGROUND) {
- srv_init_purge_tasks();
- purge_sys.coordinator_startup();
- srv_wake_purge_thread_if_not_active();
- }
-
srv_is_being_started = false;
if (srv_print_verbose_log) {
@@ -1953,10 +1924,8 @@ skip_monitors:
/* Create thread(s) that handles key rotation. This is
needed already here as log_preflush_pool_modified_pages
will flush dirty pages and that might need e.g.
- fil_crypt_threads_event. */
- fil_system_enter();
+ fil_crypt_threads_cond. */
fil_crypt_threads_init();
- fil_system_exit();
/* Initialize online defragmentation. */
btr_defragment_init();
@@ -1988,7 +1957,7 @@ void innodb_preshutdown()
if (trx_sys.is_initialised())
while (trx_sys.any_active_transactions())
- os_thread_sleep(1000);
+ std::this_thread::sleep_for(std::chrono::milliseconds(1));
}
srv_shutdown_bg_undo_sources();
srv_purge_shutdown();
@@ -2006,6 +1975,7 @@ void innodb_shutdown()
switch (srv_operation) {
case SRV_OPERATION_BACKUP:
case SRV_OPERATION_RESTORE_DELTA:
+ case SRV_OPERATION_BACKUP_NO_DEFER:
break;
case SRV_OPERATION_RESTORE:
case SRV_OPERATION_RESTORE_EXPORT:
@@ -2046,13 +2016,13 @@ void innodb_shutdown()
ut_ad(dict_sys.is_initialised() || !srv_was_started);
ut_ad(trx_sys.is_initialised() || !srv_was_started);
- ut_ad(buf_dblwr.is_initialised() || !srv_was_started
+ ut_ad(buf_dblwr.is_created() || !srv_was_started
|| srv_read_only_mode
|| srv_force_recovery >= SRV_FORCE_NO_TRX_UNDO);
ut_ad(lock_sys.is_initialised() || !srv_was_started);
ut_ad(log_sys.is_initialised() || !srv_was_started);
ut_ad(ibuf.index || !innodb_change_buffering || !srv_was_started
- || srv_force_recovery >= SRV_FORCE_NO_IBUF_MERGE);
+ || srv_force_recovery >= SRV_FORCE_NO_DDL_UNDO);
dict_stats_deinit();
@@ -2082,13 +2052,12 @@ void innodb_shutdown()
trx_pool_close();
if (!srv_read_only_mode) {
- mutex_free(&srv_monitor_file_mutex);
- mutex_free(&srv_misc_tmpfile_mutex);
+ mysql_mutex_destroy(&srv_monitor_file_mutex);
+ mysql_mutex_destroy(&srv_misc_tmpfile_mutex);
}
dict_sys.close();
btr_search_sys_free();
- row_mysql_close();
srv_free();
fil_system.close();
pars_lexer_close();
@@ -2096,7 +2065,6 @@ void innodb_shutdown()
ut_ad(buf_pool.is_initialised() || !srv_was_started);
buf_pool.close();
- sync_check_close();
srv_sys_space.shutdown();
if (srv_tmp_space.get_sanity_check_status()) {
@@ -2133,17 +2101,14 @@ srv_get_meta_data_filename(
char* path;
/* Make sure the data_dir_path is set. */
- dict_get_and_save_data_dir_path(table, false);
-
- if (DICT_TF_HAS_DATA_DIR(table->flags)) {
- ut_a(table->data_dir_path);
+ dict_get_and_save_data_dir_path(table);
- path = fil_make_filepath(
- table->data_dir_path, table->name.m_name, CFG, true);
- } else {
- path = fil_make_filepath(NULL, table->name.m_name, CFG, false);
- }
+ const char* data_dir_path = DICT_TF_HAS_DATA_DIR(table->flags)
+ ? table->data_dir_path : nullptr;
+ ut_ad(!DICT_TF_HAS_DATA_DIR(table->flags) || data_dir_path);
+ path = fil_make_filepath(data_dir_path, table->name, CFG,
+ data_dir_path != nullptr);
ut_a(path);
len = strlen(path);
ut_a(max_len >= len);
diff --git a/storage/innobase/sync/srw_lock.cc b/storage/innobase/sync/srw_lock.cc
new file mode 100644
index 00000000000..05445f1a68c
--- /dev/null
+++ b/storage/innobase/sync/srw_lock.cc
@@ -0,0 +1,545 @@
+/*****************************************************************************
+
+Copyright (c) 2020, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+#include "srw_lock.h"
+#include "srv0srv.h"
+#include "my_cpu.h"
+#include "transactional_lock_guard.h"
+
+#ifdef NO_ELISION
+#elif defined _MSC_VER && (defined _M_IX86 || defined _M_X64)
+# include <intrin.h>
+bool have_transactional_memory;
+bool transactional_lock_enabled()
+{
+ int regs[4];
+ __cpuid(regs, 0);
+ if (regs[0] < 7)
+ return false;
+ __cpuidex(regs, 7, 0);
+ /* Restricted Transactional Memory (RTM) */
+ have_transactional_memory= regs[1] & 1U << 11;
+ return have_transactional_memory;
+}
+#elif defined __GNUC__ && (defined __i386__ || defined __x86_64__)
+# include <cpuid.h>
+bool have_transactional_memory;
+bool transactional_lock_enabled()
+{
+ if (__get_cpuid_max(0, nullptr) < 7)
+ return false;
+ unsigned eax, ebx, ecx, edx;
+ __cpuid_count(7, 0, eax, ebx, ecx, edx);
+ /* Restricted Transactional Memory (RTM) */
+ have_transactional_memory= ebx & 1U << 11;
+ return have_transactional_memory;
+}
+
+# ifdef UNIV_DEBUG
+TRANSACTIONAL_TARGET
+bool xtest() { return have_transactional_memory && _xtest(); }
+# endif
+#elif defined __powerpc64__ || defined __s390__
+# include <htmxlintrin.h>
+# include <setjmp.h>
+# include <signal.h>
+
+__attribute__((target("htm"),hot))
+bool xbegin()
+{
+ return have_transactional_memory &&
+ __TM_simple_begin() == _HTM_TBEGIN_STARTED;
+}
+
+__attribute__((target("htm"),hot))
+void xabort() { __TM_abort(); }
+
+__attribute__((target("htm"),hot))
+void xend() { __TM_end(); }
+
+bool have_transactional_memory;
+static sigjmp_buf ill_jmp;
+static void ill_handler(int sig)
+{
+ siglongjmp(ill_jmp, sig);
+}
+/**
+ Here we are testing we can do a transaction without SIGILL
+ and a 1 instruction store can succeed.
+*/
+__attribute__((noinline))
+static void test_tm(bool *r)
+{
+ if (__TM_simple_begin() == _HTM_TBEGIN_STARTED)
+ {
+ *r= true;
+ __TM_end();
+ }
+}
+bool transactional_lock_enabled()
+{
+ bool r= false;
+ sigset_t oset;
+ struct sigaction ill_act, oact_ill;
+
+ memset(&ill_act, 0, sizeof(ill_act));
+ ill_act.sa_handler = ill_handler;
+ sigfillset(&ill_act.sa_mask);
+ sigdelset(&ill_act.sa_mask, SIGILL);
+
+ sigprocmask(SIG_SETMASK, &ill_act.sa_mask, &oset);
+ sigaction(SIGILL, &ill_act, &oact_ill);
+ if (sigsetjmp(ill_jmp, 1) == 0)
+ {
+ test_tm(&r);
+ }
+ sigaction(SIGILL, &oact_ill, NULL);
+ sigprocmask(SIG_SETMASK, &oset, NULL);
+ return r;
+}
+
+# ifdef UNIV_DEBUG
+__attribute__((target("htm"),hot))
+bool xtest()
+{
+ return have_transactional_memory &&
+ _HTM_STATE (__builtin_ttest ()) == _HTM_TRANSACTIONAL;
+}
+# endif
+#endif
+
+/** @return the parameter for srw_pause() */
+static inline unsigned srw_pause_delay()
+{
+ return my_cpu_relax_multiplier / 4 * srv_spin_wait_delay;
+}
+
+/** Pause the CPU for some time, with no memory accesses. */
+static inline void srw_pause(unsigned delay)
+{
+ HMT_low();
+ while (delay--)
+ MY_RELAX_CPU();
+ HMT_medium();
+}
+
+#ifdef SUX_LOCK_GENERIC
+# ifndef PTHREAD_ADAPTIVE_MUTEX_INITIALIZER_NP
+template<> void pthread_mutex_wrapper<true>::wr_wait()
+{
+ const unsigned delay= srw_pause_delay();
+
+ for (auto spin= srv_n_spin_wait_rounds; spin; spin--)
+ {
+ srw_pause(delay);
+ if (wr_lock_try())
+ return;
+ }
+
+ pthread_mutex_lock(&lock);
+}
+# endif
+
+template void ssux_lock_impl<false>::init();
+template void ssux_lock_impl<true>::init();
+template void ssux_lock_impl<false>::destroy();
+template void ssux_lock_impl<true>::destroy();
+
+template<bool spinloop>
+inline void srw_mutex_impl<spinloop>::wait(uint32_t lk)
+{
+ pthread_mutex_lock(&mutex);
+ while (lock.load(std::memory_order_relaxed) == lk)
+ pthread_cond_wait(&cond, &mutex);
+ pthread_mutex_unlock(&mutex);
+}
+
+template<bool spinloop>
+inline void ssux_lock_impl<spinloop>::wait(uint32_t lk)
+{
+ pthread_mutex_lock(&writer.mutex);
+ while (readers.load(std::memory_order_relaxed) == lk)
+ pthread_cond_wait(&readers_cond, &writer.mutex);
+ pthread_mutex_unlock(&writer.mutex);
+}
+
+template<bool spinloop>
+void srw_mutex_impl<spinloop>::wake()
+{
+ pthread_mutex_lock(&mutex);
+ pthread_cond_signal(&cond);
+ pthread_mutex_unlock(&mutex);
+}
+template<bool spinloop>
+void ssux_lock_impl<spinloop>::wake()
+{
+ pthread_mutex_lock(&writer.mutex);
+ pthread_cond_signal(&readers_cond);
+ pthread_mutex_unlock(&writer.mutex);
+}
+#else
+static_assert(4 == sizeof(rw_lock), "ABI");
+# ifdef _WIN32
+# include <synchapi.h>
+
+template<bool spinloop>
+inline void srw_mutex_impl<spinloop>::wait(uint32_t lk)
+{ WaitOnAddress(&lock, &lk, 4, INFINITE); }
+template<bool spinloop>
+void srw_mutex_impl<spinloop>::wake() { WakeByAddressSingle(&lock); }
+
+template<bool spinloop>
+inline void ssux_lock_impl<spinloop>::wait(uint32_t lk)
+{ WaitOnAddress(&readers, &lk, 4, INFINITE); }
+template<bool spinloop>
+void ssux_lock_impl<spinloop>::wake() { WakeByAddressSingle(&readers); }
+# else
+# ifdef __linux__
+# include <linux/futex.h>
+# include <sys/syscall.h>
+# define SRW_FUTEX(a,op,n) \
+ syscall(SYS_futex, a, FUTEX_ ## op ## _PRIVATE, n, nullptr, nullptr, 0)
+# elif defined __OpenBSD__
+# include <sys/time.h>
+# include <sys/futex.h>
+# define SRW_FUTEX(a,op,n) \
+ futex((volatile uint32_t*) a, FUTEX_ ## op, n, nullptr, nullptr)
+# elif defined __FreeBSD__
+# include <sys/types.h>
+# include <sys/umtx.h>
+# define FUTEX_WAKE UMTX_OP_WAKE_PRIVATE
+# define FUTEX_WAIT UMTX_OP_WAIT_UINT_PRIVATE
+# define SRW_FUTEX(a,op,n) _umtx_op(a, FUTEX_ ## op, n, nullptr, nullptr)
+# elif defined __DragonFly__
+# include <unistd.h>
+# define FUTEX_WAKE(a,n) umtx_wakeup(a,n)
+# define FUTEX_WAIT(a,n) umtx_sleep(a,n,0)
+# define SRW_FUTEX(a,op,n) FUTEX_ ## op((volatile int*) a, int(n))
+# else
+# error "no futex support"
+# endif
+
+template<bool spinloop>
+inline void srw_mutex_impl<spinloop>::wait(uint32_t lk)
+{ SRW_FUTEX(&lock, WAIT, lk); }
+template<bool spinloop>
+void srw_mutex_impl<spinloop>::wake() { SRW_FUTEX(&lock, WAKE, 1); }
+
+template<bool spinloop>
+inline void ssux_lock_impl<spinloop>::wait(uint32_t lk)
+{ SRW_FUTEX(&readers, WAIT, lk); }
+template<bool spinloop>
+void ssux_lock_impl<spinloop>::wake() { SRW_FUTEX(&readers, WAKE, 1); }
+# endif
+#endif
+
+template void srw_mutex_impl<false>::wake();
+template void ssux_lock_impl<false>::wake();
+template void srw_mutex_impl<true>::wake();
+template void ssux_lock_impl<true>::wake();
+
+/*
+
+Unfortunately, compilers targeting IA-32 or AMD64 currently cannot
+translate the following single-bit operations into Intel 80386 instructions:
+
+ m.fetch_or(1<<b) & 1<<b LOCK BTS b, m
+ m.fetch_and(~(1<<b)) & 1<<b LOCK BTR b, m
+ m.fetch_xor(1<<b) & 1<<b LOCK BTC b, m
+
+Hence, we will manually translate fetch_or() using GCC-style inline
+assembler code or a Microsoft intrinsic function.
+
+*/
+
+#if defined __clang_major__ && __clang_major__ < 10
+/* Only clang-10 introduced support for asm goto */
+#elif defined __APPLE__
+/* At least some versions of Apple Xcode do not support asm goto */
+#elif defined __GNUC__ && (defined __i386__ || defined __x86_64__)
+# define IF_FETCH_OR_GOTO(mem, bit, label) \
+ __asm__ goto("lock btsl $" #bit ", %0\n\t" \
+ "jc %l1" : : "m" (mem) : "cc", "memory" : label);
+# define IF_NOT_FETCH_OR_GOTO(mem, bit, label) \
+ __asm__ goto("lock btsl $" #bit ", %0\n\t" \
+ "jnc %l1" : : "m" (mem) : "cc", "memory" : label);
+#elif defined _MSC_VER && (defined _M_IX86 || defined _M_X64)
+# define IF_FETCH_OR_GOTO(mem, bit, label) \
+ if (_interlockedbittestandset(reinterpret_cast<volatile long*>(&mem), bit)) \
+ goto label;
+# define IF_NOT_FETCH_OR_GOTO(mem, bit, label) \
+ if (!_interlockedbittestandset(reinterpret_cast<volatile long*>(&mem), bit))\
+ goto label;
+#endif
+
+template<bool spinloop>
+void srw_mutex_impl<spinloop>::wait_and_lock()
+{
+ uint32_t lk= 1 + lock.fetch_add(1, std::memory_order_relaxed);
+
+ if (spinloop)
+ {
+ const unsigned delay= srw_pause_delay();
+
+ for (auto spin= srv_n_spin_wait_rounds;;)
+ {
+ DBUG_ASSERT(~HOLDER & lk);
+ if (lk & HOLDER)
+ lk= lock.load(std::memory_order_relaxed);
+ else
+ {
+#ifdef IF_NOT_FETCH_OR_GOTO
+ static_assert(HOLDER == (1U << 31), "compatibility");
+ IF_NOT_FETCH_OR_GOTO(*this, 31, acquired);
+ lk|= HOLDER;
+#else
+ if (!((lk= lock.fetch_or(HOLDER, std::memory_order_relaxed)) & HOLDER))
+ goto acquired;
+#endif
+ srw_pause(delay);
+ }
+ if (!--spin)
+ break;
+ }
+ }
+
+ for (;;)
+ {
+ DBUG_ASSERT(~HOLDER & lk);
+ if (lk & HOLDER)
+ {
+ wait(lk);
+#ifdef IF_FETCH_OR_GOTO
+reload:
+#endif
+ lk= lock.load(std::memory_order_relaxed);
+ }
+ else
+ {
+#ifdef IF_FETCH_OR_GOTO
+ static_assert(HOLDER == (1U << 31), "compatibility");
+ IF_FETCH_OR_GOTO(*this, 31, reload);
+#else
+ if ((lk= lock.fetch_or(HOLDER, std::memory_order_relaxed)) & HOLDER)
+ continue;
+ DBUG_ASSERT(lk);
+#endif
+acquired:
+ std::atomic_thread_fence(std::memory_order_acquire);
+ return;
+ }
+ }
+}
+
+template void srw_mutex_impl<false>::wait_and_lock();
+template void srw_mutex_impl<true>::wait_and_lock();
+
+template<bool spinloop>
+void ssux_lock_impl<spinloop>::wr_wait(uint32_t lk)
+{
+ DBUG_ASSERT(writer.is_locked());
+ DBUG_ASSERT(lk);
+ DBUG_ASSERT(lk < WRITER);
+
+ if (spinloop)
+ {
+ const unsigned delay= srw_pause_delay();
+
+ for (auto spin= srv_n_spin_wait_rounds; spin; spin--)
+ {
+ srw_pause(delay);
+ lk= readers.load(std::memory_order_acquire);
+ if (lk == WRITER)
+ return;
+ DBUG_ASSERT(lk > WRITER);
+ }
+ }
+
+ lk|= WRITER;
+
+ do
+ {
+ DBUG_ASSERT(lk > WRITER);
+ wait(lk);
+ lk= readers.load(std::memory_order_acquire);
+ }
+ while (lk != WRITER);
+}
+
+template void ssux_lock_impl<true>::wr_wait(uint32_t);
+template void ssux_lock_impl<false>::wr_wait(uint32_t);
+
+template<bool spinloop>
+void ssux_lock_impl<spinloop>::rd_wait()
+{
+ for (;;)
+ {
+ writer.wr_lock();
+ bool acquired= rd_lock_try();
+ writer.wr_unlock();
+ if (acquired)
+ break;
+ }
+}
+
+template void ssux_lock_impl<true>::rd_wait();
+template void ssux_lock_impl<false>::rd_wait();
+
+#if defined _WIN32 || defined SUX_LOCK_GENERIC
+template<> void srw_lock_<true>::rd_wait()
+{
+ const unsigned delay= srw_pause_delay();
+
+ for (auto spin= srv_n_spin_wait_rounds; spin; spin--)
+ {
+ srw_pause(delay);
+ if (rd_lock_try())
+ return;
+ }
+
+ IF_WIN(AcquireSRWLockShared(&lk), rw_rdlock(&lk));
+}
+
+template<> void srw_lock_<true>::wr_wait()
+{
+ const unsigned delay= srw_pause_delay();
+
+ for (auto spin= srv_n_spin_wait_rounds; spin; spin--)
+ {
+ srw_pause(delay);
+ if (wr_lock_try())
+ return;
+ }
+
+ IF_WIN(AcquireSRWLockExclusive(&lk), rw_wrlock(&lk));
+}
+#endif
+
+#ifdef UNIV_PFS_RWLOCK
+template void srw_lock_impl<false>::psi_rd_lock(const char*, unsigned);
+template void srw_lock_impl<false>::psi_wr_lock(const char*, unsigned);
+template void srw_lock_impl<true>::psi_rd_lock(const char*, unsigned);
+template void srw_lock_impl<true>::psi_wr_lock(const char*, unsigned);
+
+template<bool spinloop>
+void srw_lock_impl<spinloop>::psi_rd_lock(const char *file, unsigned line)
+{
+ PSI_rwlock_locker_state state;
+ const bool nowait= lock.rd_lock_try();
+ if (PSI_rwlock_locker *locker= PSI_RWLOCK_CALL(start_rwlock_rdwait)
+ (&state, pfs_psi,
+ nowait ? PSI_RWLOCK_TRYREADLOCK : PSI_RWLOCK_READLOCK, file, line))
+ {
+ if (!nowait)
+ lock.rd_lock();
+ PSI_RWLOCK_CALL(end_rwlock_rdwait)(locker, 0);
+ }
+ else if (!nowait)
+ lock.rd_lock();
+}
+
+template<bool spinloop>
+void srw_lock_impl<spinloop>::psi_wr_lock(const char *file, unsigned line)
+{
+ PSI_rwlock_locker_state state;
+ const bool nowait= lock.wr_lock_try();
+ if (PSI_rwlock_locker *locker= PSI_RWLOCK_CALL(start_rwlock_wrwait)
+ (&state, pfs_psi,
+ nowait ? PSI_RWLOCK_TRYWRITELOCK : PSI_RWLOCK_WRITELOCK, file, line))
+ {
+ if (!nowait)
+ lock.wr_lock();
+ PSI_RWLOCK_CALL(end_rwlock_rdwait)(locker, 0);
+ }
+ else if (!nowait)
+ lock.wr_lock();
+}
+
+void ssux_lock::psi_rd_lock(const char *file, unsigned line)
+{
+ PSI_rwlock_locker_state state;
+ const bool nowait= lock.rd_lock_try();
+ if (PSI_rwlock_locker *locker= PSI_RWLOCK_CALL(start_rwlock_rdwait)
+ (&state, pfs_psi,
+ nowait ? PSI_RWLOCK_TRYSHAREDLOCK : PSI_RWLOCK_SHAREDLOCK, file, line))
+ {
+ if (!nowait)
+ lock.rd_lock();
+ PSI_RWLOCK_CALL(end_rwlock_rdwait)(locker, 0);
+ }
+ else if (!nowait)
+ lock.rd_lock();
+}
+
+void ssux_lock::psi_u_lock(const char *file, unsigned line)
+{
+ PSI_rwlock_locker_state state;
+ if (PSI_rwlock_locker *locker= PSI_RWLOCK_CALL(start_rwlock_wrwait)
+ (&state, pfs_psi, PSI_RWLOCK_SHAREDEXCLUSIVELOCK, file, line))
+ {
+ lock.u_lock();
+ PSI_RWLOCK_CALL(end_rwlock_rdwait)(locker, 0);
+ }
+ else
+ lock.u_lock();
+}
+
+void ssux_lock::psi_wr_lock(const char *file, unsigned line)
+{
+ PSI_rwlock_locker_state state;
+ const bool nowait= lock.wr_lock_try();
+ if (PSI_rwlock_locker *locker= PSI_RWLOCK_CALL(start_rwlock_wrwait)
+ (&state, pfs_psi,
+ nowait ? PSI_RWLOCK_TRYEXCLUSIVELOCK : PSI_RWLOCK_EXCLUSIVELOCK,
+ file, line))
+ {
+ if (!nowait)
+ lock.wr_lock();
+ PSI_RWLOCK_CALL(end_rwlock_rdwait)(locker, 0);
+ }
+ else if (!nowait)
+ lock.wr_lock();
+}
+
+void ssux_lock::psi_u_wr_upgrade(const char *file, unsigned line)
+{
+ PSI_rwlock_locker_state state;
+ DBUG_ASSERT(lock.writer.is_locked());
+ uint32_t lk= 1;
+ const bool nowait=
+ lock.readers.compare_exchange_strong(lk, ssux_lock_impl<false>::WRITER,
+ std::memory_order_acquire,
+ std::memory_order_relaxed);
+ if (PSI_rwlock_locker *locker= PSI_RWLOCK_CALL(start_rwlock_wrwait)
+ (&state, pfs_psi,
+ nowait ? PSI_RWLOCK_TRYEXCLUSIVELOCK : PSI_RWLOCK_EXCLUSIVELOCK,
+ file, line))
+ {
+ if (!nowait)
+ lock.u_wr_upgrade();
+ PSI_RWLOCK_CALL(end_rwlock_rdwait)(locker, 0);
+ }
+ else if (!nowait)
+ lock.u_wr_upgrade();
+}
+#else /* UNIV_PFS_RWLOCK */
+template void ssux_lock_impl<false>::rd_lock();
+template void ssux_lock_impl<false>::rd_unlock();
+template void ssux_lock_impl<false>::u_unlock();
+template void ssux_lock_impl<false>::wr_unlock();
+#endif /* UNIV_PFS_RWLOCK */
diff --git a/storage/innobase/trx/trx0i_s.cc b/storage/innobase/trx/trx0i_s.cc
index dc240387f21..2dc39118d3d 100644
--- a/storage/innobase/trx/trx0i_s.cc
+++ b/storage/innobase/trx/trx0i_s.cc
@@ -41,8 +41,6 @@ Created July 17, 2007 Vasil Dimov
#include "rem0rec.h"
#include "row0row.h"
#include "srv0srv.h"
-#include "sync0rw.h"
-#include "sync0sync.h"
#include "trx0sys.h"
#include "que0que.h"
#include "trx0purge.h"
@@ -139,8 +137,7 @@ struct i_s_table_cache_t {
/** This structure describes the intermediate buffer */
struct trx_i_s_cache_t {
- rw_lock_t rw_lock; /*!< read-write lock protecting
- the rest of this structure */
+ srw_lock rw_lock; /*!< read-write lock protecting this */
Atomic_relaxed<ulonglong> last_read;
/*!< last time the cache was read;
measured in nanoseconds */
@@ -159,7 +156,7 @@ struct trx_i_s_cache_t {
ha_storage_t* storage; /*!< storage for external volatile
data that may become unavailable
when we release
- lock_sys.mutex */
+ lock_sys.latch */
ulint mem_allocd; /*!< the amount of memory
allocated with mem_alloc*() */
bool is_truncated; /*!< this is true if the memory
@@ -180,7 +177,7 @@ trx_i_s_cache_t* trx_i_s_cache = &trx_i_s_cache_static;
@retval 0xFFFF for table locks */
static uint16_t wait_lock_get_heap_no(const lock_t *lock)
{
- return lock_get_type(lock) == LOCK_REC
+ return !lock->is_table()
? static_cast<uint16_t>(lock_rec_find_set_bit(lock))
: uint16_t{0xFFFF};
}
@@ -425,23 +422,30 @@ fill_trx_row(
{
const char* s;
- ut_ad(lock_mutex_own());
+ lock_sys.assert_locked();
- row->trx_id = trx_get_id_for_print(trx);
+ const lock_t* wait_lock = trx->lock.wait_lock;
+
+ row->trx_id = trx->id;
row->trx_started = trx->start_time;
- row->trx_state = trx_get_que_state_str(trx);
+ if (trx->in_rollback) {
+ row->trx_state = "ROLLING BACK";
+ } else if (trx->state == TRX_STATE_COMMITTED_IN_MEMORY) {
+ row->trx_state = "COMMITTING";
+ } else if (wait_lock) {
+ row->trx_state = "LOCK WAIT";
+ } else {
+ row->trx_state = "RUNNING";
+ }
+
row->requested_lock_row = requested_lock_row;
ut_ad(requested_lock_row == NULL
|| i_s_locks_row_validate(requested_lock_row));
- if (trx->lock.wait_lock != NULL) {
+ ut_ad(!wait_lock == !requested_lock_row);
- ut_a(requested_lock_row != NULL);
- row->trx_wait_started = trx->lock.wait_started;
- } else {
- ut_a(requested_lock_row == NULL);
- row->trx_wait_started = 0;
- }
+ const my_hrtime_t suspend_time= trx->lock.suspend_time;
+ row->trx_wait_started = wait_lock ? hrtime_to_time(suspend_time) : 0;
row->trx_weight = static_cast<uintmax_t>(TRX_WEIGHT(trx));
@@ -482,15 +486,14 @@ thd_done:
row->trx_tables_locked = lock_number_of_tables_locked(&trx->lock);
- /* These are protected by both trx->mutex or lock_sys.mutex,
- or just lock_sys.mutex. For reading, it suffices to hold
- lock_sys.mutex. */
+ /* These are protected by lock_sys.latch (which we are holding)
+ and sometimes also trx->mutex. */
row->trx_lock_structs = UT_LIST_GET_LEN(trx->lock.trx_locks);
row->trx_lock_memory_bytes = mem_heap_get_size(trx->lock.lock_heap);
- row->trx_rows_locked = lock_number_of_rows_locked(&trx->lock);
+ row->trx_rows_locked = trx->lock.n_rec_locks;
row->trx_rows_modified = trx->undo_no;
@@ -596,7 +599,7 @@ fill_lock_data(
trx_i_s_cache_t* cache) /*!< in/out: cache where to store
volatile data */
{
- ut_a(lock_get_type(lock) == LOCK_REC);
+ ut_a(!lock->is_table());
switch (heap_no) {
case PAGE_HEAP_NO_INFIMUM:
@@ -615,7 +618,6 @@ fill_lock_data(
const buf_block_t* block;
const page_t* page;
const rec_t* rec;
- const dict_index_t* index;
ulint n_fields;
mem_heap_t* heap;
rec_offs offsets_onstack[REC_OFFS_NORMAL_SIZE];
@@ -644,7 +646,8 @@ fill_lock_data(
rec = page_find_rec_with_heap_no(page, heap_no);
- index = lock_rec_get_index(lock);
+ const dict_index_t* index = lock->index;
+ ut_ad(index->is_primary() || !dict_index_is_online_ddl(index));
n_fields = dict_index_get_n_unique(index);
@@ -687,6 +690,15 @@ fill_lock_data(
return(TRUE);
}
+/** @return the table of a lock */
+static const dict_table_t *lock_get_table(const lock_t &lock)
+{
+ if (lock.is_table())
+ return lock.un_member.tab_lock.table;
+ ut_ad(lock.index->is_primary() || !dict_index_is_online_ddl(lock.index));
+ return lock.index->table;
+}
+
/*******************************************************************//**
Fills i_s_locks_row_t object. Returns its first argument.
If memory can not be allocated then FALSE is returned.
@@ -701,12 +713,9 @@ static bool fill_locks_row(
volatile strings */
{
row->lock_trx_id = lock->trx->id;
- const auto lock_type = lock_get_type(lock);
- ut_ad(lock_type == LOCK_REC || lock_type == LOCK_TABLE);
-
- const bool is_gap_lock = lock_type == LOCK_REC
- && (lock->type_mode & LOCK_GAP);
- switch (lock->type_mode & LOCK_MODE_MASK) {
+ const bool is_gap_lock = lock->is_gap();
+ ut_ad(!is_gap_lock || !lock->is_table());
+ switch (lock->mode()) {
case LOCK_S:
row->lock_mode = uint8_t(1 + is_gap_lock);
break;
@@ -727,8 +736,10 @@ static bool fill_locks_row(
row->lock_mode = 0;
}
+ const dict_table_t* table= lock_get_table(*lock);
+
row->lock_table = ha_storage_put_str_memlim(
- cache->storage, lock_get_table_name(lock).m_name,
+ cache->storage, table->name.m_name,
MAX_ALLOWED_FOR_STORAGE(cache));
/* memory could not be allocated */
@@ -737,9 +748,9 @@ static bool fill_locks_row(
return false;
}
- if (lock_type == LOCK_REC) {
+ if (!lock->is_table()) {
row->lock_index = ha_storage_put_str_memlim(
- cache->storage, lock_rec_get_index_name(lock),
+ cache->storage, lock->index->name,
MAX_ALLOWED_FOR_STORAGE(cache));
/* memory could not be allocated */
@@ -765,7 +776,7 @@ static bool fill_locks_row(
row->lock_data = NULL;
}
- row->lock_table_id = lock_get_table_id(lock);
+ row->lock_table_id = table->id;
row->hash_chain.value = row;
ut_ad(i_s_locks_row_validate(row));
@@ -820,26 +831,19 @@ fold_lock(
#else
ulint ret;
- switch (lock_get_type(lock)) {
- case LOCK_REC:
+ if (!lock->is_table()) {
ut_a(heap_no != 0xFFFF);
ret = ut_fold_ulint_pair((ulint) lock->trx->id,
lock->un_member.rec_lock.page_id.
fold());
ret = ut_fold_ulint_pair(ret, heap_no);
-
- break;
- case LOCK_TABLE:
+ } else {
/* this check is actually not necessary for continuing
correct operation, but something must have gone wrong if
it fails. */
ut_a(heap_no == 0xFFFF);
- ret = (ulint) lock_get_table_id(lock);
-
- break;
- default:
- ut_error;
+ ret = (ulint) lock_get_table(*lock)->id;
}
return(ret);
@@ -863,26 +867,20 @@ locks_row_eq_lock(
#ifdef TEST_NO_LOCKS_ROW_IS_EVER_EQUAL_TO_LOCK_T
return(0);
#else
- switch (lock_get_type(lock)) {
- case LOCK_REC:
+ if (!lock->is_table()) {
ut_a(heap_no != 0xFFFF);
return(row->lock_trx_id == lock->trx->id
&& row->lock_page == lock->un_member.rec_lock.page_id
&& row->lock_rec == heap_no);
-
- case LOCK_TABLE:
+ } else {
/* this check is actually not necessary for continuing
correct operation, but something must have gone wrong if
it fails. */
ut_a(heap_no == 0xFFFF);
return(row->lock_trx_id == lock->trx->id
- && row->lock_table_id == lock_get_table_id(lock));
-
- default:
- ut_error;
- return(FALSE);
+ && row->lock_table_id == lock_get_table(*lock)->id);
}
#endif
}
@@ -1049,25 +1047,22 @@ add_trx_relevant_locks_to_cache(
requested lock row, or NULL or
undefined */
{
- ut_ad(lock_mutex_own());
+ lock_sys.assert_locked();
/* If transaction is waiting we add the wait lock and all locks
from another transactions that are blocking the wait lock. */
- if (trx->lock.que_state == TRX_QUE_LOCK_WAIT) {
+ if (const lock_t *wait_lock = trx->lock.wait_lock) {
const lock_t* curr_lock;
i_s_locks_row_t* blocking_lock_row;
lock_queue_iterator_t iter;
- ut_a(trx->lock.wait_lock != NULL);
-
uint16_t wait_lock_heap_no
- = wait_lock_get_heap_no(trx->lock.wait_lock);
+ = wait_lock_get_heap_no(wait_lock);
/* add the requested lock */
- *requested_lock_row
- = add_lock_to_cache(cache, trx->lock.wait_lock,
- wait_lock_heap_no);
+ *requested_lock_row = add_lock_to_cache(cache, wait_lock,
+ wait_lock_heap_no);
/* memory could not be allocated */
if (*requested_lock_row == NULL) {
@@ -1078,18 +1073,16 @@ add_trx_relevant_locks_to_cache(
/* then iterate over the locks before the wait lock and
add the ones that are blocking it */
- lock_queue_iterator_reset(&iter, trx->lock.wait_lock,
- ULINT_UNDEFINED);
+ lock_queue_iterator_reset(&iter, wait_lock, ULINT_UNDEFINED);
for (curr_lock = lock_queue_iterator_get_prev(&iter);
curr_lock != NULL;
curr_lock = lock_queue_iterator_get_prev(&iter)) {
- if (lock_has_to_wait(trx->lock.wait_lock,
- curr_lock)) {
+ if (lock_has_to_wait(wait_lock, curr_lock)) {
/* add the lock that is
- blocking trx->lock.wait_lock */
+ blocking wait_lock */
blocking_lock_row
= add_lock_to_cache(
cache, curr_lock,
@@ -1139,9 +1132,6 @@ static bool can_cache_be_updated(trx_i_s_cache_t* cache)
we are currently holding an exclusive rw lock on the cache.
So it is not possible for last_read to be updated while we are
reading it. */
-
- ut_ad(rw_lock_own(&cache->rw_lock, RW_LOCK_X));
-
return my_interval_timer() - cache->last_read > CACHE_MIN_IDLE_TIME_NS;
}
@@ -1217,7 +1207,7 @@ static void fetch_data_into_cache_low(trx_i_s_cache_t *cache, const trx_t *trx)
static void fetch_data_into_cache(trx_i_s_cache_t *cache)
{
- ut_ad(lock_mutex_own());
+ LockMutexGuard g{SRW_LOCK_CALL};
trx_i_s_cache_clear(cache);
/* Capture the state of transactions */
@@ -1225,10 +1215,10 @@ static void fetch_data_into_cache(trx_i_s_cache_t *cache)
if (!cache->is_truncated && trx.state != TRX_STATE_NOT_STARTED &&
&trx != (purge_sys.query ? purge_sys.query->trx : nullptr))
{
- mutex_enter(&trx.mutex);
+ trx.mutex_lock();
if (trx.state != TRX_STATE_NOT_STARTED)
fetch_data_into_cache_low(cache, &trx);
- mutex_exit(&trx.mutex);
+ trx.mutex_unlock();
}
});
cache->is_truncated= false;
@@ -1250,10 +1240,7 @@ trx_i_s_possibly_fetch_data_into_cache(
}
/* We need to read trx_sys and record/table lock queues */
-
- lock_mutex_enter();
fetch_data_into_cache(cache);
- lock_mutex_exit();
/* update cache last read time */
cache->last_read = my_interval_timer();
@@ -1281,15 +1268,14 @@ trx_i_s_cache_init(
trx_i_s_cache_t* cache) /*!< out: cache to init */
{
/* The latching is done in the following order:
- acquire trx_i_s_cache_t::rw_lock, X
- acquire lock mutex
- release lock mutex
+ acquire trx_i_s_cache_t::rw_lock, rwlock
+ acquire exclusive lock_sys.latch
+ release exclusive lock_sys.latch
release trx_i_s_cache_t::rw_lock
- acquire trx_i_s_cache_t::rw_lock, S
+ acquire trx_i_s_cache_t::rw_lock, rdlock
release trx_i_s_cache_t::rw_lock */
- rw_lock_create(trx_i_s_cache_lock_key, &cache->rw_lock,
- SYNC_TRX_I_S_RWLOCK);
+ cache->rw_lock.SRW_LOCK_INIT(trx_i_s_cache_lock_key);
cache->last_read = 0;
@@ -1315,7 +1301,7 @@ trx_i_s_cache_free(
/*===============*/
trx_i_s_cache_t* cache) /*!< in, own: cache to free */
{
- rw_lock_free(&cache->rw_lock);
+ cache->rw_lock.destroy();
cache->locks_hash.free();
ha_storage_free(cache->storage);
@@ -1331,7 +1317,7 @@ trx_i_s_cache_start_read(
/*=====================*/
trx_i_s_cache_t* cache) /*!< in: cache */
{
- rw_lock_s_lock(&cache->rw_lock);
+ cache->rw_lock.rd_lock(SRW_LOCK_CALL);
}
/*******************************************************************//**
@@ -1342,7 +1328,7 @@ trx_i_s_cache_end_read(
trx_i_s_cache_t* cache) /*!< in: cache */
{
cache->last_read = my_interval_timer();
- rw_lock_s_unlock(&cache->rw_lock);
+ cache->rw_lock.rd_unlock();
}
/*******************************************************************//**
@@ -1352,7 +1338,7 @@ trx_i_s_cache_start_write(
/*======================*/
trx_i_s_cache_t* cache) /*!< in: cache */
{
- rw_lock_x_lock(&cache->rw_lock);
+ cache->rw_lock.wr_lock(SRW_LOCK_CALL);
}
/*******************************************************************//**
@@ -1362,9 +1348,7 @@ trx_i_s_cache_end_write(
/*====================*/
trx_i_s_cache_t* cache) /*!< in: cache */
{
- ut_ad(rw_lock_own(&cache->rw_lock, RW_LOCK_X));
-
- rw_lock_x_unlock(&cache->rw_lock);
+ cache->rw_lock.wr_unlock();
}
/*******************************************************************//**
@@ -1377,9 +1361,6 @@ cache_select_table(
trx_i_s_cache_t* cache, /*!< in: whole cache */
enum i_s_table table) /*!< in: which table */
{
- ut_ad(rw_lock_own_flagged(&cache->rw_lock,
- RW_LOCK_FLAG_X | RW_LOCK_FLAG_S));
-
switch (table) {
case I_S_INNODB_TRX:
return &cache->innodb_trx;
diff --git a/storage/innobase/trx/trx0purge.cc b/storage/innobase/trx/trx0purge.cc
index 38438108480..6bdfaa95241 100644
--- a/storage/innobase/trx/trx0purge.cc
+++ b/storage/innobase/trx/trx0purge.cc
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 1996, 2017, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2017, 2021, MariaDB Corporation.
+Copyright (c) 2017, 2022, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -26,17 +26,14 @@ Created 3/26/1996 Heikki Tuuri
#include "trx0purge.h"
#include "fsp0fsp.h"
-#include "fut0fut.h"
#include "mach0data.h"
#include "mtr0log.h"
-#include "os0thread.h"
#include "que0que.h"
#include "row0purge.h"
#include "row0upd.h"
#include "srv0mon.h"
#include "srv0srv.h"
#include "srv0start.h"
-#include "sync0sync.h"
#include "trx0rec.h"
#include "trx0roll.h"
#include "trx0rseg.h"
@@ -54,10 +51,6 @@ ulong srv_max_purge_lag_delay = 0;
/** The global data structure coordinating a purge */
purge_sys_t purge_sys;
-/** A dummy undo record used as a return value when we have a whole undo log
-which needs no purge */
-trx_undo_rec_t trx_purge_dummy_rec;
-
#ifdef UNIV_DEBUG
my_bool srv_purge_view_update_only_debug;
#endif /* UNIV_DEBUG */
@@ -74,9 +67,9 @@ TrxUndoRsegsIterator::TrxUndoRsegsIterator()
/** Sets the next rseg to purge in purge_sys.
Executed in the purge coordinator thread.
@return whether anything is to be purged */
-inline bool TrxUndoRsegsIterator::set_next()
+TRANSACTIONAL_INLINE inline bool TrxUndoRsegsIterator::set_next()
{
- mutex_enter(&purge_sys.pq_mutex);
+ mysql_mutex_lock(&purge_sys.pq_mutex);
/* Only purge consumes events from the priority queue, user
threads only produce the events. */
@@ -99,31 +92,45 @@ inline bool TrxUndoRsegsIterator::set_next()
} else {
/* Queue is empty, reset iterator. */
purge_sys.rseg = NULL;
- mutex_exit(&purge_sys.pq_mutex);
+ mysql_mutex_unlock(&purge_sys.pq_mutex);
m_rsegs = NullElement;
m_iter = m_rsegs.begin();
return false;
}
purge_sys.rseg = *m_iter++;
- mutex_exit(&purge_sys.pq_mutex);
- mutex_enter(&purge_sys.rseg->mutex);
-
- ut_a(purge_sys.rseg->last_page_no != FIL_NULL);
- ut_ad(purge_sys.rseg->last_trx_no() == m_rsegs.trx_no);
+ mysql_mutex_unlock(&purge_sys.pq_mutex);
- /* We assume in purge of externally stored fields that space id is
- in the range of UNDO tablespace space ids */
+ /* We assume in purge of externally stored fields that space
+ id is in the range of UNDO tablespace space ids */
ut_ad(purge_sys.rseg->space->id == TRX_SYS_SPACE
|| srv_is_undo_tablespace(purge_sys.rseg->space->id));
- ut_a(purge_sys.tail.trx_no <= purge_sys.rseg->last_trx_no());
+ trx_id_t last_trx_no;
+ {
+#ifdef SUX_LOCK_GENERIC
+ purge_sys.rseg->latch.rd_lock(SRW_LOCK_CALL);
+#else
+ transactional_shared_lock_guard<srw_spin_lock> rg
+ {purge_sys.rseg->latch};
+#endif
+ last_trx_no = purge_sys.rseg->last_trx_no();
- purge_sys.tail.trx_no = purge_sys.rseg->last_trx_no();
- purge_sys.hdr_offset = purge_sys.rseg->last_offset();
- purge_sys.hdr_page_no = purge_sys.rseg->last_page_no;
+ purge_sys.hdr_offset = purge_sys.rseg->last_offset();
+ purge_sys.hdr_page_no = purge_sys.rseg->last_page_no;
+
+#ifdef SUX_LOCK_GENERIC
+ purge_sys.rseg->latch.rd_unlock();
+#endif
+ }
- mutex_exit(&purge_sys.rseg->mutex);
+ /* Only the purge coordinator task will access this object
+ purge_sys.rseg_iter, or any of purge_sys.hdr_page_no,
+ purge_sys.tail, purge_sys.head, or modify purge_sys.view. */
+ ut_ad(last_trx_no == m_rsegs.trx_no);
+ ut_a(purge_sys.hdr_page_no != FIL_NULL);
+ ut_a(purge_sys.tail.trx_no <= last_trx_no);
+ purge_sys.tail.trx_no = last_trx_no;
return(true);
}
@@ -145,8 +152,7 @@ purge_graph_build()
trx->op_info = "purge trx";
mem_heap_t* heap = mem_heap_create(512);
- que_fork_t* fork = que_fork_create(
- NULL, NULL, QUE_FORK_PURGE, heap);
+ que_fork_t* fork = que_fork_create(heap);
fork->trx = trx;
for (auto i = innodb_purge_threads_MAX; i; i--) {
@@ -165,6 +171,7 @@ void purge_sys_t::create()
ut_ad(!heap);
ut_ad(!enabled());
m_paused= 0;
+ m_SYS_paused= 0;
query= purge_graph_build();
next_stored= false;
rseg= NULL;
@@ -172,8 +179,9 @@ void purge_sys_t::create()
offset= 0;
hdr_page_no= 0;
hdr_offset= 0;
- rw_lock_create(trx_purge_latch_key, &latch, SYNC_PURGE_LATCH);
- mutex_create(LATCH_ID_PURGE_SYS_PQ, &pq_mutex);
+ latch.SRW_LOCK_INIT(trx_purge_latch_key);
+ end_latch.init();
+ mysql_mutex_init(purge_sys_pq_mutex_key, &pq_mutex, nullptr);
truncate.current= NULL;
truncate.last= NULL;
heap= mem_heap_create(4096);
@@ -193,12 +201,41 @@ void purge_sys_t::close()
ut_ad(trx->state == TRX_STATE_ACTIVE);
trx->state= TRX_STATE_NOT_STARTED;
trx->free();
- rw_lock_free(&latch);
- mutex_free(&pq_mutex);
+ latch.destroy();
+ end_latch.destroy();
+ mysql_mutex_destroy(&pq_mutex);
mem_heap_free(heap);
heap= nullptr;
}
+/** Determine if the history of a transaction is purgeable.
+@param trx_id transaction identifier
+@return whether the history is purgeable */
+TRANSACTIONAL_TARGET bool purge_sys_t::is_purgeable(trx_id_t trx_id) const
+{
+ bool purgeable;
+#if !defined SUX_LOCK_GENERIC && !defined NO_ELISION
+ purgeable= false;
+ if (xbegin())
+ {
+ if (!latch.is_write_locked())
+ {
+ purgeable= view.changes_visible(trx_id);
+ xend();
+ }
+ else
+ xabort();
+ }
+ else
+#endif
+ {
+ latch.rd_lock(SRW_LOCK_CALL);
+ purgeable= view.changes_visible(trx_id);
+ latch.rd_unlock();
+ }
+ return purgeable;
+}
+
/*================ UNDO LOG HISTORY LIST =============================*/
/** Prepend the history list with an undo log.
@@ -214,17 +251,22 @@ trx_purge_add_undo_to_history(const trx_t* trx, trx_undo_t*& undo, mtr_t* mtr)
ut_ad(undo == trx->rsegs.m_redo.undo);
trx_rseg_t* rseg = trx->rsegs.m_redo.rseg;
ut_ad(undo->rseg == rseg);
- buf_block_t* rseg_header = trx_rsegf_get(
- rseg->space, rseg->page_no, mtr);
+ buf_block_t* rseg_header = rseg->get(mtr, nullptr);
+ /* We are in transaction commit; we cannot return an error. If the
+ database is corrupted, it is better to crash it than to
+ intentionally violate ACID by committing something that is known to
+ be corrupted. */
+ ut_ad(rseg_header);
buf_block_t* undo_page = trx_undo_set_state_at_finish(
undo, mtr);
- trx_ulogf_t* undo_header = undo_page->frame + undo->hdr_offset;
+ trx_ulogf_t* undo_header = undo_page->page.frame
+ + undo->hdr_offset;
ut_ad(mach_read_from_2(undo_header + TRX_UNDO_NEEDS_PURGE) <= 1);
ut_ad(rseg->needs_purge > trx->id);
if (UNIV_UNLIKELY(mach_read_from_4(TRX_RSEG + TRX_RSEG_FORMAT
- + rseg_header->frame))) {
+ + rseg_header->page.frame))) {
/* This database must have been upgraded from
before MariaDB 10.3.5. */
trx_rseg_format_upgrade(rseg_header, mtr);
@@ -240,19 +282,19 @@ trx_purge_add_undo_to_history(const trx_t* trx, trx_undo_t*& undo, mtr_t* mtr)
MONITOR_DEC(MONITOR_NUM_UNDO_SLOT_USED);
- uint32_t hist_size = mach_read_from_4(TRX_RSEG_HISTORY_SIZE
- + TRX_RSEG
- + rseg_header->frame);
+ uint32_t hist_size = mach_read_from_4(
+ TRX_RSEG_HISTORY_SIZE + TRX_RSEG
+ + rseg_header->page.frame);
ut_ad(undo->size == flst_get_len(TRX_UNDO_SEG_HDR
+ TRX_UNDO_PAGE_LIST
- + undo_page->frame));
+ + undo_page->page.frame));
mtr->write<4>(*rseg_header, TRX_RSEG + TRX_RSEG_HISTORY_SIZE
- + rseg_header->frame,
+ + rseg_header->page.frame,
hist_size + undo->size);
mtr->write<8>(*rseg_header, TRX_RSEG + TRX_RSEG_MAX_TRX_ID
- + rseg_header->frame,
+ + rseg_header->page.frame,
trx_sys.get_max_trx_id());
}
@@ -263,8 +305,8 @@ trx_purge_add_undo_to_history(const trx_t* trx, trx_undo_t*& undo, mtr_t* mtr)
Before any transaction-generating background threads or the
purge have been started, we can
- start transactions in row_merge_drop_temp_indexes() and
- fts_drop_orphaned_tables(), and roll back recovered transactions.
+ start transactions in row_merge_drop_temp_indexes(),
+ and roll back recovered transactions.
Arbitrary user transactions may be executed when all the undo log
related background processes (including purge) are disabled due to
@@ -279,12 +321,11 @@ trx_purge_add_undo_to_history(const trx_t* trx, trx_undo_t*& undo, mtr_t* mtr)
&& (srv_is_being_started
|| trx_rollback_is_active
|| srv_force_recovery >= SRV_FORCE_NO_BACKGROUND))
- || ((trx->mysql_thd || trx->internal)
- && srv_fast_shutdown));
+ || srv_fast_shutdown);
#ifdef WITH_WSREP
- if (wsrep_is_wsrep_xid(trx->xid)) {
- trx_rseg_update_wsrep_checkpoint(rseg_header, trx->xid, mtr);
+ if (wsrep_is_wsrep_xid(&trx->xid)) {
+ trx_rseg_update_wsrep_checkpoint(rseg_header, &trx->xid, mtr);
}
#endif
@@ -296,9 +337,15 @@ trx_purge_add_undo_to_history(const trx_t* trx, trx_undo_t*& undo, mtr_t* mtr)
}
/* Add the log as the first in the history list */
- flst_add_first(rseg_header, TRX_RSEG + TRX_RSEG_HISTORY, undo_page,
- static_cast<uint16_t>(undo->hdr_offset
- + TRX_UNDO_HISTORY_NODE), mtr);
+
+ /* We are in transaction commit; we cannot return an error
+ when detecting corruption. It is better to crash the server
+ than to intentionally violate ACID by committing something
+ that is known to be corrupted. */
+ ut_a(flst_add_first(rseg_header, TRX_RSEG + TRX_RSEG_HISTORY, undo_page,
+ static_cast<uint16_t>(undo->hdr_offset
+ + TRX_UNDO_HISTORY_NODE),
+ mtr) == DB_SUCCESS);
mtr->write<8,mtr_t::MAYBE_NOP>(*undo_page,
undo_header + TRX_UNDO_TRX_NO,
@@ -312,7 +359,7 @@ trx_purge_add_undo_to_history(const trx_t* trx, trx_undo_t*& undo, mtr_t* mtr)
trx->rw_trx_hash_element->no);
}
- trx_sys.rseg_history_len++;
+ rseg->history_size++;
if (undo->state == TRX_UNDO_CACHED) {
UT_LIST_ADD_FIRST(rseg->undo_cached, undo);
@@ -325,46 +372,60 @@ trx_purge_add_undo_to_history(const trx_t* trx, trx_undo_t*& undo, mtr_t* mtr)
undo = NULL;
}
+MY_ATTRIBUTE((nonnull, warn_unused_result))
/** Remove undo log header from the history list.
@param[in,out] rseg rollback segment header page
@param[in] log undo log segment header page
@param[in] offset byte offset in the undo log segment header page
@param[in,out] mtr mini-transaction */
-static void trx_purge_remove_log_hdr(buf_block_t *rseg, buf_block_t* log,
- uint16_t offset, mtr_t *mtr)
+static dberr_t trx_purge_remove_log_hdr(buf_block_t *rseg, buf_block_t* log,
+ uint16_t offset, mtr_t *mtr)
{
- flst_remove(rseg, TRX_RSEG + TRX_RSEG_HISTORY,
- log, static_cast<uint16_t>(offset + TRX_UNDO_HISTORY_NODE), mtr);
- trx_sys.rseg_history_len--;
+ return flst_remove(rseg, TRX_RSEG + TRX_RSEG_HISTORY, log,
+ uint16_t(offset + TRX_UNDO_HISTORY_NODE), mtr);
}
+MY_ATTRIBUTE((nonnull, warn_unused_result))
/** Free an undo log segment, and remove the header from the history list.
@param[in,out] mtr mini-transaction
@param[in,out] rseg rollback segment
-@param[in] hdr_addr file address of log_hdr */
-static
-void trx_purge_free_segment(mtr_t &mtr, trx_rseg_t* rseg, fil_addr_t hdr_addr)
+@param[in] hdr_addr file address of log_hdr
+@return error code */
+static dberr_t
+trx_purge_free_segment(mtr_t &mtr, trx_rseg_t* rseg, fil_addr_t hdr_addr)
{
mtr.commit();
log_free_check();
mtr.start();
- ut_ad(mutex_own(&rseg->mutex));
- buf_block_t *rseg_hdr= trx_rsegf_get(rseg->space, rseg->page_no, &mtr);
- buf_block_t *block=
- trx_undo_page_get(page_id_t(rseg->space->id, hdr_addr.page), &mtr);
+ const page_id_t hdr_page_id{rseg->space->id, hdr_addr.page};
+ dberr_t err;
+ buf_block_t *rseg_hdr= rseg->get(&mtr, &err);
+ if (!rseg_hdr)
+ return err;
+ buf_block_t *block= buf_page_get_gen(hdr_page_id, 0, RW_X_LATCH,
+ nullptr, BUF_GET_POSSIBLY_FREED,
+ &mtr, &err);
+ if (!block)
+ return err;
+
const uint32_t seg_size=
- flst_get_len(TRX_UNDO_SEG_HDR + TRX_UNDO_PAGE_LIST + block->frame);
+ flst_get_len(TRX_UNDO_SEG_HDR + TRX_UNDO_PAGE_LIST + block->page.frame);
+
+ err= trx_purge_remove_log_hdr(rseg_hdr, block, hdr_addr.boffset, &mtr);
+ if (UNIV_UNLIKELY(err != DB_SUCCESS))
+ return err;
+
ut_ad(rseg->curr_size >= seg_size);
rseg->curr_size-= seg_size;
+ rseg->history_size--;
- trx_purge_remove_log_hdr(rseg_hdr, block, hdr_addr.boffset, &mtr);
- byte *hist= TRX_RSEG + TRX_RSEG_HISTORY_SIZE + rseg_hdr->frame;
+ byte *hist= TRX_RSEG + TRX_RSEG_HISTORY_SIZE + rseg_hdr->page.frame;
ut_ad(mach_read_from_4(hist) >= seg_size);
mtr.write<4>(*rseg_hdr, hist, mach_read_from_4(hist) - seg_size);
while (!fseg_free_step_not_header(TRX_UNDO_SEG_HDR + TRX_UNDO_FSEG_HEADER +
- block->frame, &mtr))
+ block->page.frame, &mtr))
{
block->fix();
mtr.commit();
@@ -377,35 +438,38 @@ void trx_purge_free_segment(mtr_t &mtr, trx_rseg_t* rseg, fil_addr_t hdr_addr)
innodb_undo_log_truncate=ON will be able to reclaim the space. */
log_free_check();
mtr.start();
- ut_ad(rw_lock_s_lock_nowait(block->debug_latch, __FILE__, __LINE__));
- rw_lock_x_lock(&block->lock);
- mtr_memo_push(&mtr, block, MTR_MEMO_PAGE_X_FIX);
+ block->page.lock.x_lock();
+ mtr.memo_push(block, MTR_MEMO_PAGE_X_MODIFY);
}
while (!fseg_free_step(TRX_UNDO_SEG_HDR + TRX_UNDO_FSEG_HEADER +
- block->frame, &mtr));
+ block->page.frame, &mtr));
+ return DB_SUCCESS;
}
/** Remove unnecessary history data from a rollback segment.
@param[in,out] rseg rollback segment
-@param[in] limit truncate anything before this */
+@param[in] limit truncate anything before this
+@return error code */
static
-void
+dberr_t
trx_purge_truncate_rseg_history(
trx_rseg_t& rseg,
const purge_sys_t::iterator& limit)
{
fil_addr_t hdr_addr;
- fil_addr_t prev_hdr_addr;
mtr_t mtr;
- trx_id_t undo_trx_no;
mtr.start();
- buf_block_t* rseg_hdr = trx_rsegf_get(rseg.space, rseg.page_no, &mtr);
+ dberr_t err;
+ buf_block_t* rseg_hdr = rseg.get(&mtr, &err);
+ if (!rseg_hdr) {
+ goto func_exit;
+ }
hdr_addr = flst_get_last(TRX_RSEG + TRX_RSEG_HISTORY
- + rseg_hdr->frame);
+ + rseg_hdr->page.frame);
hdr_addr.boffset = static_cast<uint16_t>(hdr_addr.boffset
- TRX_UNDO_HISTORY_NODE);
@@ -413,18 +477,24 @@ loop:
if (hdr_addr.page == FIL_NULL) {
func_exit:
mtr.commit();
- return;
+ return err;
+ }
+
+ buf_block_t* block = buf_page_get_gen(page_id_t(rseg.space->id,
+ hdr_addr.page),
+ 0, RW_X_LATCH, nullptr,
+ BUF_GET_POSSIBLY_FREED,
+ &mtr, &err);
+ if (!block) {
+ goto func_exit;
}
- buf_block_t* block = trx_undo_page_get(page_id_t(rseg.space->id,
- hdr_addr.page),
- &mtr);
- undo_trx_no = mach_read_from_8(block->frame + hdr_addr.boffset
- + TRX_UNDO_TRX_NO);
+ const trx_id_t undo_trx_no = mach_read_from_8(
+ block->page.frame + hdr_addr.boffset + TRX_UNDO_TRX_NO);
if (undo_trx_no >= limit.trx_no) {
if (undo_trx_no == limit.trx_no) {
- trx_undo_truncate_start(
+ err = trx_undo_truncate_start(
&rseg, hdr_addr.page,
hdr_addr.boffset, limit.undo_no);
}
@@ -432,36 +502,43 @@ func_exit:
goto func_exit;
}
- prev_hdr_addr = flst_get_prev_addr(block->frame + hdr_addr.boffset
- + TRX_UNDO_HISTORY_NODE);
+ fil_addr_t prev_hdr_addr = flst_get_prev_addr(
+ block->page.frame + hdr_addr.boffset + TRX_UNDO_HISTORY_NODE);
prev_hdr_addr.boffset = static_cast<uint16_t>(prev_hdr_addr.boffset
- TRX_UNDO_HISTORY_NODE);
- if (!rseg.trx_ref_count
+ if (!rseg.is_referenced()
&& rseg.needs_purge <= (purge_sys.head.trx_no
? purge_sys.head.trx_no
: purge_sys.tail.trx_no)
&& mach_read_from_2(TRX_UNDO_SEG_HDR + TRX_UNDO_STATE
- + block->frame)
+ + block->page.frame)
== TRX_UNDO_TO_PURGE
- && !mach_read_from_2(block->frame + hdr_addr.boffset
+ && !mach_read_from_2(block->page.frame + hdr_addr.boffset
+ TRX_UNDO_NEXT_LOG)) {
/* We can free the whole log segment.
This will call trx_purge_remove_log_hdr(). */
- trx_purge_free_segment(mtr, &rseg, hdr_addr);
+ err = trx_purge_free_segment(mtr, &rseg, hdr_addr);
} else {
/* Remove the log hdr from the rseg history. */
- trx_purge_remove_log_hdr(rseg_hdr, block, hdr_addr.boffset,
- &mtr);
+ rseg.history_size--;
+ err = trx_purge_remove_log_hdr(rseg_hdr, block,
+ hdr_addr.boffset, &mtr);
}
mtr.commit();
+ if (err != DB_SUCCESS) {
+ return err;
+ }
mtr.start();
- rseg_hdr = trx_rsegf_get(rseg.space, rseg.page_no, &mtr);
-
hdr_addr = prev_hdr_addr;
+ rseg_hdr = rseg.get(&mtr, &err);
+ if (!rseg_hdr) {
+ goto func_exit;
+ }
+
goto loop;
}
@@ -473,7 +550,7 @@ static void trx_purge_cleanse_purge_queue(const fil_space_t& space)
typedef std::vector<TrxUndoRsegs> purge_elem_list_t;
purge_elem_list_t purge_elem_list;
- mutex_enter(&purge_sys.pq_mutex);
+ mysql_mutex_lock(&purge_sys.pq_mutex);
/* Remove rseg instances that are in the purge queue before we start
truncate of corresponding UNDO truncate. */
@@ -500,7 +577,7 @@ static void trx_purge_cleanse_purge_queue(const fil_space_t& space)
}
}
- mutex_exit(&purge_sys.pq_mutex);
+ mysql_mutex_unlock(&purge_sys.pq_mutex);
}
#if defined __GNUC__ && __GNUC__ == 4 && !defined __clang__
@@ -513,7 +590,7 @@ __attribute__((optimize(0)))
Removes unnecessary history data from rollback segments. NOTE that when this
function is called, the caller must not have any latches on undo log pages!
*/
-static void trx_purge_truncate_history()
+TRANSACTIONAL_TARGET static void trx_purge_truncate_history()
{
ut_ad(purge_sys.head <= purge_sys.tail);
purge_sys_t::iterator &head= purge_sys.head.trx_no
@@ -526,19 +603,18 @@ static void trx_purge_truncate_history()
head.undo_no= 0;
}
- for (ulint i= 0; i < TRX_SYS_N_RSEGS; ++i)
- {
- if (trx_rseg_t *rseg= trx_sys.rseg_array[i])
+ dberr_t err= DB_SUCCESS;
+ for (auto &rseg : trx_sys.rseg_array)
+ if (rseg.space)
{
- ut_ad(rseg->id == i);
- ut_ad(rseg->is_persistent());
- mutex_enter(&rseg->mutex);
- trx_purge_truncate_rseg_history(*rseg, head);
- mutex_exit(&rseg->mutex);
+ ut_ad(rseg.is_persistent());
+ rseg.latch.wr_lock(SRW_LOCK_CALL);
+ if (dberr_t e= trx_purge_truncate_rseg_history(rseg, head))
+ err= e;
+ rseg.latch.wr_unlock();
}
- }
- if (srv_undo_tablespaces_active < 2)
+ if (err != DB_SUCCESS || srv_undo_tablespaces_active < 2)
return;
while (srv_undo_log_truncate)
@@ -577,31 +653,32 @@ static void trx_purge_truncate_history()
DBUG_LOG("undo", "marking for truncate: " << file->name);
- for (ulint i= 0; i < TRX_SYS_N_RSEGS; ++i)
+ for (auto &rseg : trx_sys.rseg_array)
+ if (rseg.space == &space)
+ /* Once set, this rseg will not be allocated to subsequent
+ transactions, but we will wait for existing active
+ transactions to finish. */
+ rseg.set_skip_allocation();
+
+ for (auto &rseg : trx_sys.rseg_array)
{
- trx_rseg_t *rseg= trx_sys.rseg_array[i];
- if (!rseg || rseg->space != &space)
+ if (rseg.space != &space)
continue;
- ut_ad(rseg->is_persistent());
- mutex_enter(&rseg->mutex);
- /* Once set, this rseg will not be allocated to subsequent
- transactions, but we will wait for existing active
- transactions to finish and to be purged. */
- rseg->skip_allocation = true;
-
- if (rseg->trx_ref_count || rseg->needs_purge > head.trx_no)
+ rseg.latch.rd_lock(SRW_LOCK_CALL);
+ ut_ad(rseg.skip_allocation());
+ if (rseg.is_referenced() || rseg.needs_purge > head.trx_no)
{
- not_free:
- mutex_exit(&rseg->mutex);
+not_free:
+ rseg.latch.rd_unlock();
return;
}
- ut_ad(UT_LIST_GET_LEN(rseg->undo_list) == 0);
+ ut_ad(UT_LIST_GET_LEN(rseg.undo_list) == 0);
/* Check if all segments are cached and safe to remove. */
ulint cached= 0;
- for (const trx_undo_t *undo= UT_LIST_GET_FIRST(rseg->undo_cached); undo;
+ for (const trx_undo_t *undo= UT_LIST_GET_FIRST(rseg.undo_cached); undo;
undo= UT_LIST_GET_NEXT(undo_list, undo))
{
if (head.trx_no < undo->trx_id)
@@ -610,12 +687,11 @@ static void trx_purge_truncate_history()
cached+= undo->size;
}
- ut_ad(rseg->curr_size > cached);
-
- if (rseg->curr_size > cached + 1)
+ ut_ad(rseg.curr_size > cached);
+ if (rseg.curr_size > cached + 1)
goto not_free;
- mutex_exit(&rseg->mutex);
+ rseg.latch.rd_unlock();
}
ib::info() << "Truncating " << file->name;
@@ -625,7 +701,7 @@ static void trx_purge_truncate_history()
mtr_t mtr;
mtr.start();
- mtr_x_lock_space(&space, &mtr);
+ mtr.x_lock_space(&space);
/* Lock all modified pages of the tablespace.
@@ -647,37 +723,43 @@ static void trx_purge_truncate_history()
if (bpage->id().space() == space.id &&
bpage->oldest_modification() != 1)
{
- ut_ad(bpage->state() == BUF_BLOCK_FILE_PAGE);
+ ut_ad(bpage->frame);
auto block= reinterpret_cast<buf_block_t*>(bpage);
- block->fix();
- ut_ad(rw_lock_s_lock_nowait(block->debug_latch, __FILE__, __LINE__));
+ if (!bpage->lock.x_lock_try())
+ {
+ rescan:
+ /* Let buf_pool_t::release_freed_page() proceed. */
+ mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+ mysql_mutex_lock(&buf_pool.mutex);
+ mysql_mutex_lock(&buf_pool.flush_list_mutex);
+ mysql_mutex_unlock(&buf_pool.mutex);
+ bpage= UT_LIST_GET_LAST(buf_pool.flush_list);
+ continue;
+ }
buf_pool.flush_hp.set(prev);
mysql_mutex_unlock(&buf_pool.flush_list_mutex);
#ifdef BTR_CUR_HASH_ADAPT
ut_ad(!block->index); /* There is no AHI on undo tablespaces. */
#endif
- rw_lock_x_lock(&block->lock);
+ bpage->fix();
+ ut_ad(!bpage->is_io_fixed());
mysql_mutex_lock(&buf_pool.flush_list_mutex);
- ut_ad(bpage->io_fix() == BUF_IO_NONE);
if (bpage->oldest_modification() > 1)
{
- bpage->clear_oldest_modification(false);
+ bpage->reset_oldest_modification();
mtr.memo_push(block, MTR_MEMO_PAGE_X_FIX);
}
else
{
- rw_lock_x_unlock(&block->lock);
- block->unfix();
+ bpage->unfix();
+ bpage->lock.x_unlock();
}
if (prev != buf_pool.flush_hp.get())
- {
/* Rescan, because we may have lost the position. */
- bpage= UT_LIST_GET_LAST(buf_pool.flush_list);
- continue;
- }
+ goto rescan;
}
bpage= prev;
@@ -685,72 +767,63 @@ static void trx_purge_truncate_history()
mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+ /* Re-initialize tablespace, in a single mini-transaction. */
+ const ulint size= SRV_UNDO_TABLESPACE_SIZE_IN_PAGES;
+
/* Adjust the tablespace metadata. */
- if (!fil_truncate_prepare(space.id))
+ mysql_mutex_lock(&fil_system.mutex);
+ space.set_stopping();
+ space.is_being_truncated= true;
+ if (space.crypt_data)
{
- ib::error() << "Failed to find UNDO tablespace " << file->name;
- mtr.commit();
- return;
+ space.reacquire();
+ mysql_mutex_unlock(&fil_system.mutex);
+ fil_space_crypt_close_tablespace(&space);
+ space.release();
+ }
+ else
+ mysql_mutex_unlock(&fil_system.mutex);
+
+ for (auto i= 6000; space.referenced();
+ std::this_thread::sleep_for(std::chrono::milliseconds(10)))
+ {
+ if (!--i)
+ {
+ mtr.commit();
+ ib::error() << "Failed to freeze UNDO tablespace " << file->name;
+ return;
+ }
}
- /* Re-initialize tablespace, in a single mini-transaction. */
- const ulint size= SRV_UNDO_TABLESPACE_SIZE_IN_PAGES;
/* Associate the undo tablespace with mtr.
During mtr::commit_shrink(), InnoDB can use the undo
tablespace object to clear all freed ranges */
mtr.set_named_space(&space);
mtr.trim_pages(page_id_t(space.id, size));
- fsp_header_init(&space, size, &mtr);
- mutex_enter(&fil_system.mutex);
+ ut_a(fsp_header_init(&space, size, &mtr) == DB_SUCCESS);
+ mysql_mutex_lock(&fil_system.mutex);
space.size= file->size= size;
- mutex_exit(&fil_system.mutex);
-
- buf_block_t *sys_header= trx_sysf_get(&mtr);
+ mysql_mutex_unlock(&fil_system.mutex);
- for (ulint i= 0; i < TRX_SYS_N_RSEGS; ++i)
+ for (auto &rseg : trx_sys.rseg_array)
{
- trx_rseg_t *rseg= trx_sys.rseg_array[i];
- if (!rseg || rseg->space != &space)
+ if (rseg.space != &space)
continue;
- ut_ad(rseg->id == i);
- ut_ad(rseg->is_persistent());
- ut_ad(!rseg->trx_ref_count);
- ut_ad(rseg->needs_purge <= head.trx_no);
- ut_d(const auto old_page= rseg->page_no);
+ ut_ad(!rseg.is_referenced());
+ ut_ad(rseg.needs_purge <= head.trx_no);
- buf_block_t *rblock= trx_rseg_header_create(&space, i,
+ buf_block_t *rblock= trx_rseg_header_create(&space,
+ &rseg - trx_sys.rseg_array,
trx_sys.get_max_trx_id(),
- sys_header, &mtr);
- ut_ad(rblock);
- rseg->page_no= rblock ? rblock->page.id().page_no() : FIL_NULL;
- ut_ad(old_page == rseg->page_no);
-
- /* Before re-initialization ensure that we free the existing
- structure. There can't be any active transactions. */
- ut_a(UT_LIST_GET_LEN(rseg->undo_list) == 0);
-
- for (trx_undo_t *undo= UT_LIST_GET_FIRST(rseg->undo_cached), *next_undo;
- undo; undo= next_undo)
- {
- next_undo= UT_LIST_GET_NEXT(undo_list, undo);
- UT_LIST_REMOVE(rseg->undo_cached, undo);
- MONITOR_DEC(MONITOR_NUM_UNDO_SLOT_CACHED);
- ut_free(undo);
- }
-
+ &mtr, &err);
+ ut_a(rblock);
/* These were written by trx_rseg_header_create(). */
- ut_ad(!mach_read_from_4(TRX_RSEG + TRX_RSEG_FORMAT + rblock->frame));
+ ut_ad(!mach_read_from_4(TRX_RSEG + TRX_RSEG_FORMAT +
+ rblock->page.frame));
ut_ad(!mach_read_from_4(TRX_RSEG + TRX_RSEG_HISTORY_SIZE +
- rblock->frame));
- /* Initialize the undo log lists according to
- the rseg header */
- rseg->curr_size= 1;
- rseg->trx_ref_count= 0;
- rseg->needs_purge= 0;
- rseg->skip_allocation= false;
- rseg->last_page_no= FIL_NULL;
- rseg->last_commit_and_offset= 0;
+ rblock->page.frame));
+ rseg.reinit(rblock->page.id().page_no());
}
mtr.commit_shrink(space);
@@ -788,80 +861,81 @@ static void trx_purge_rseg_get_next_history_log(
ulint* n_pages_handled)/*!< in/out: number of UNDO pages
handled */
{
- fil_addr_t prev_log_addr;
- trx_id_t trx_no;
- mtr_t mtr;
-
- mutex_enter(&purge_sys.rseg->mutex);
+ fil_addr_t prev_log_addr;
+ mtr_t mtr;
- ut_a(purge_sys.rseg->last_page_no != FIL_NULL);
-
- purge_sys.tail.trx_no = purge_sys.rseg->last_trx_no() + 1;
- purge_sys.tail.undo_no = 0;
- purge_sys.next_stored = false;
-
- mtr.start();
-
- const buf_block_t* undo_page = trx_undo_page_get_s_latched(
- page_id_t(purge_sys.rseg->space->id,
- purge_sys.rseg->last_page_no), &mtr);
-
- const trx_ulogf_t* log_hdr = undo_page->frame
- + purge_sys.rseg->last_offset();
-
- /* Increase the purge page count by one for every handled log */
-
- (*n_pages_handled)++;
+ mtr.start();
- prev_log_addr = flst_get_prev_addr(log_hdr + TRX_UNDO_HISTORY_NODE);
- prev_log_addr.boffset = static_cast<uint16_t>(prev_log_addr.boffset
- - TRX_UNDO_HISTORY_NODE);
+ purge_sys.rseg->latch.wr_lock(SRW_LOCK_CALL);
+ ut_a(purge_sys.rseg->last_page_no != FIL_NULL);
- const bool empty = prev_log_addr.page == FIL_NULL;
+ purge_sys.tail.trx_no= purge_sys.rseg->last_trx_no() + 1;
+ purge_sys.tail.undo_no= 0;
+ purge_sys.next_stored= false;
- if (empty) {
- /* No logs left in the history list */
- purge_sys.rseg->last_page_no = FIL_NULL;
- }
+ if (const buf_block_t* undo_page=
+ buf_page_get_gen(page_id_t(purge_sys.rseg->space->id,
+ purge_sys.rseg->last_page_no),
+ 0, RW_S_LATCH, nullptr,
+ BUF_GET_POSSIBLY_FREED, &mtr))
+ {
+ const trx_ulogf_t *log_hdr=
+ undo_page->page.frame + purge_sys.rseg->last_offset();
+ /* Increase the purge page count by one for every handled log */
+ ++*n_pages_handled;
+ prev_log_addr= flst_get_prev_addr(log_hdr + TRX_UNDO_HISTORY_NODE);
+ prev_log_addr.boffset = static_cast<uint16_t>(prev_log_addr.boffset -
+ TRX_UNDO_HISTORY_NODE);
+ }
+ else
+ prev_log_addr.page= FIL_NULL;
- mutex_exit(&purge_sys.rseg->mutex);
- mtr.commit();
+ const bool empty= prev_log_addr.page == FIL_NULL;
- if (empty) {
- return;
- }
+ if (empty)
+ /* No logs left in the history list */
+ purge_sys.rseg->last_page_no= FIL_NULL;
- /* Read the previous log header. */
- mtr.start();
+ purge_sys.rseg->latch.wr_unlock();
+ mtr.commit();
- log_hdr = trx_undo_page_get_s_latched(
- page_id_t(purge_sys.rseg->space->id, prev_log_addr.page),
- &mtr)->frame
- + prev_log_addr.boffset;
+ if (empty)
+ return;
- trx_no = mach_read_from_8(log_hdr + TRX_UNDO_TRX_NO);
- ut_ad(mach_read_from_2(log_hdr + TRX_UNDO_NEEDS_PURGE) <= 1);
+ /* Read the previous log header. */
+ mtr.start();
- mtr.commit();
+ trx_id_t trx_no= 0;
- mutex_enter(&purge_sys.rseg->mutex);
+ if (const buf_block_t* undo_page=
+ buf_page_get_gen(page_id_t(purge_sys.rseg->space->id, prev_log_addr.page),
+ 0, RW_S_LATCH, nullptr, BUF_GET_POSSIBLY_FREED, &mtr))
+ {
+ const byte *log_hdr= undo_page->page.frame + prev_log_addr.boffset;
- purge_sys.rseg->last_page_no = prev_log_addr.page;
- purge_sys.rseg->set_last_commit(prev_log_addr.boffset, trx_no);
+ trx_no= mach_read_from_8(log_hdr + TRX_UNDO_TRX_NO);
+ ut_ad(mach_read_from_2(log_hdr + TRX_UNDO_NEEDS_PURGE) <= 1);
+ }
- /* Purge can also produce events, however these are already ordered
- in the rollback segment and any user generated event will be greater
- than the events that Purge produces. ie. Purge can never produce
- events from an empty rollback segment. */
+ mtr.commit();
- mutex_enter(&purge_sys.pq_mutex);
+ if (UNIV_UNLIKELY(!trx_no))
+ return;
- purge_sys.purge_queue.push(*purge_sys.rseg);
+ purge_sys.rseg->latch.wr_lock(SRW_LOCK_CALL);
+ purge_sys.rseg->last_page_no= prev_log_addr.page;
+ purge_sys.rseg->set_last_commit(prev_log_addr.boffset, trx_no);
- mutex_exit(&purge_sys.pq_mutex);
+ /* Purge can also produce events, however these are already ordered
+ in the rollback segment and any user generated event will be greater
+ than the events that Purge produces. ie. Purge can never produce
+ events from an empty rollback segment. */
- mutex_exit(&purge_sys.rseg->mutex);
+ mysql_mutex_lock(&purge_sys.pq_mutex);
+ purge_sys.purge_queue.push(*purge_sys.rseg);
+ mysql_mutex_unlock(&purge_sys.pq_mutex);
+ purge_sys.rseg->latch.wr_unlock();
}
/** Position the purge sys "iterator" on the undo record to use for purging. */
@@ -877,11 +951,11 @@ static void trx_purge_read_undo_rec()
if (purge_sys.rseg->needs_purge) {
mtr_t mtr;
mtr.start();
- buf_block_t* undo_page;
+ const buf_block_t* undo_page;
if (trx_undo_rec_t* undo_rec = trx_undo_get_first_rec(
*purge_sys.rseg->space, purge_sys.hdr_page_no,
purge_sys.hdr_offset, RW_S_LATCH,
- undo_page, &mtr)) {
+ undo_page, &mtr, nullptr)) {
offset = page_offset(undo_rec);
undo_no = trx_undo_rec_get_undo_no(undo_rec);
@@ -909,10 +983,7 @@ Chooses the next undo log to purge and updates the info in purge_sys. This
function is used to initialize purge_sys when the next record to purge is
not known, and also to update the purge system info on the next record when
purge has handled the whole undo log for a transaction. */
-static
-void
-trx_purge_choose_next_log(void)
-/*===========================*/
+TRANSACTIONAL_TARGET static void trx_purge_choose_next_log()
{
ut_ad(!purge_sys.next_stored);
@@ -920,13 +991,15 @@ trx_purge_choose_next_log(void)
trx_purge_read_undo_rec();
} else {
/* There is nothing to do yet. */
- os_thread_yield();
+ std::this_thread::yield();
}
}
/***********************************************************************//**
Gets the next record to purge and updates the info in the purge system.
-@return copy of an undo log record or pointer to the dummy undo log record */
+@return copy of an undo log record
+@retval -1 if there is nothing to purge
+@retval nullptr on corruption */
static
trx_undo_rec_t*
trx_purge_get_next_rec(
@@ -940,8 +1013,7 @@ trx_purge_get_next_rec(
ut_ad(purge_sys.next_stored);
ut_ad(purge_sys.tail.trx_no < purge_sys.low_limit_no());
- const ulint space = purge_sys.rseg->space->id;
- const uint32_t page_no = purge_sys.page_no;
+ const page_id_t page_id{purge_sys.rseg->space->id, purge_sys.page_no};
const uint16_t offset = purge_sys.offset;
if (offset == 0) {
@@ -953,15 +1025,21 @@ trx_purge_get_next_rec(
/* Look for the next undo log and record to purge */
trx_purge_choose_next_log();
-
- return(&trx_purge_dummy_rec);
+ return reinterpret_cast<trx_undo_rec_t*>(-1);
}
- mtr_start(&mtr);
+ mtr.start();
+
+ const buf_block_t* undo_page
+ = buf_page_get_gen(page_id, 0, RW_S_LATCH, nullptr,
+ BUF_GET_POSSIBLY_FREED, &mtr);
+ if (UNIV_UNLIKELY(!undo_page)) {
+corrupted:
+ mtr.commit();
+ return nullptr;
+ }
- buf_block_t* undo_page = trx_undo_page_get_s_latched(
- page_id_t(space, page_no), &mtr);
- buf_block_t* rec2_page = undo_page;
+ const buf_block_t* rec2_page = undo_page;
const trx_undo_rec_t* rec2 = trx_undo_page_get_next_rec(
undo_page, offset, purge_sys.hdr_page_no, purge_sys.hdr_offset);
@@ -983,8 +1061,12 @@ trx_purge_get_next_rec(
mtr_start(&mtr);
- undo_page = trx_undo_page_get_s_latched(
- page_id_t(space, page_no), &mtr);
+ undo_page = buf_page_get_gen(page_id, 0, RW_S_LATCH,
+ nullptr, BUF_GET_POSSIBLY_FREED,
+ &mtr);
+ if (UNIV_UNLIKELY(!undo_page)) {
+ goto corrupted;
+ }
} else {
purge_sys.offset = page_offset(rec2);
purge_sys.page_no = rec2_page->page.id().page_no();
@@ -996,19 +1078,19 @@ trx_purge_get_next_rec(
}
}
- trx_undo_rec_t* rec_copy = trx_undo_rec_copy(undo_page->frame + offset,
- heap);
+ trx_undo_rec_t* rec_copy = trx_undo_rec_copy(undo_page->page.frame
+ + offset, heap);
- mtr_commit(&mtr);
-
- return(rec_copy);
+ mtr.commit();
+ return rec_copy;
}
/********************************************************************//**
Fetches the next undo log record from the history list to purge. It must be
released with the corresponding release function.
-@return copy of an undo log record or pointer to trx_purge_dummy_rec,
-if the whole undo log can skipped in purge; NULL if none left */
+@return copy of an undo log record
+@retval -1 if the whole undo log can be skipped in purge
+@retval nullptr if nothing is left, or on corruption */
static MY_ATTRIBUTE((warn_unused_result))
trx_undo_rec_t*
trx_purge_fetch_next_rec(
@@ -1024,29 +1106,28 @@ trx_purge_fetch_next_rec(
if (!purge_sys.next_stored) {
DBUG_PRINT("ib_purge",
("no logs left in the history list"));
- return(NULL);
+ return nullptr;
}
}
if (purge_sys.tail.trx_no >= purge_sys.low_limit_no()) {
-
- return(NULL);
+ return nullptr;
}
/* fprintf(stderr, "Thread %lu purging trx %llu undo record %llu\n",
- os_thread_get_curr_id(), iter->trx_no, iter->undo_no); */
+ pthread_self(), iter->trx_no, iter->undo_no); */
*roll_ptr = trx_undo_build_roll_ptr(
/* row_purge_record_func() will later set
ROLL_PTR_INSERT_FLAG for TRX_UNDO_INSERT_REC */
false,
- purge_sys.rseg->id,
+ trx_sys.rseg_id(purge_sys.rseg, true),
purge_sys.page_no, purge_sys.offset);
/* The following call will advance the stored values of the
purge iterator. */
- return(trx_purge_get_next_rec(n_pages_handled, heap));
+ return trx_purge_get_next_rec(n_pages_handled, heap);
}
/** Run a purge batch.
@@ -1097,7 +1178,6 @@ trx_purge_attach_undo_recs(ulint n_purge_threads)
i = 0;
- const ulint batch_size = srv_purge_batch_size;
std::unordered_map<table_id_t, purge_node_t*> table_id_map;
mem_heap_empty(purge_sys.heap);
@@ -1105,8 +1185,6 @@ trx_purge_attach_undo_recs(ulint n_purge_threads)
purge_node_t* node;
trx_purge_rec_t purge_rec;
- ut_a(!thr->is_active);
-
/* Get the purge node. */
node = (purge_node_t*) thr->child;
ut_a(que_node_get_type(node) == QUE_NODE_PURGE);
@@ -1125,7 +1203,8 @@ trx_purge_attach_undo_recs(ulint n_purge_threads)
if (purge_rec.undo_rec == NULL) {
break;
- } else if (purge_rec.undo_rec == &trx_purge_dummy_rec) {
+ } else if (purge_rec.undo_rec
+ == reinterpret_cast<trx_undo_rec_t*>(-1)) {
continue;
}
@@ -1150,7 +1229,7 @@ trx_purge_attach_undo_recs(ulint n_purge_threads)
node->undo_recs.push(purge_rec);
- if (n_pages_handled >= batch_size) {
+ if (n_pages_handled >= srv_purge_batch_size) {
break;
}
}
@@ -1176,7 +1255,7 @@ trx_purge_dml_delay(void)
/* If purge lag is set then calculate the new DML delay. */
if (srv_max_purge_lag > 0) {
- double ratio = static_cast<double>(trx_sys.rseg_history_len) /
+ double ratio = static_cast<double>(trx_sys.history_size()) /
static_cast<double>(srv_max_purge_lag);
if (ratio > 1.0) {
@@ -1202,14 +1281,14 @@ extern tpool::waitable_task purge_worker_task;
/** Wait for pending purge jobs to complete. */
static void trx_purge_wait_for_workers_to_complete()
{
- bool notify_wait = purge_worker_task.is_running();
+ const bool notify_wait{purge_worker_task.is_running()};
if (notify_wait)
- tpool::tpool_wait_begin();
+ tpool::tpool_wait_begin();
purge_worker_task.wait();
- if(notify_wait)
+ if (notify_wait)
tpool::tpool_wait_end();
/* There should be no outstanding tasks as long
@@ -1217,12 +1296,33 @@ static void trx_purge_wait_for_workers_to_complete()
ut_ad(srv_get_task_queue_length() == 0);
}
+/** Update end_view at the end of a purge batch. */
+TRANSACTIONAL_INLINE void purge_sys_t::clone_end_view()
+{
+ /* This is invoked only by the purge coordinator,
+ which is the only thread that can modify our inputs head, tail, view.
+ Therefore, we only need to protect end_view from concurrent reads. */
+
+ /* Limit the end_view similar to what trx_purge_truncate_history() does. */
+ const trx_id_t trx_no= head.trx_no ? head.trx_no : tail.trx_no;
+#ifdef SUX_LOCK_GENERIC
+ end_latch.wr_lock();
+#else
+ transactional_lock_guard<srw_spin_lock_low> g(end_latch);
+#endif
+ end_view= view;
+ end_view.clamp_low_limit_id(trx_no);
+#ifdef SUX_LOCK_GENERIC
+ end_latch.wr_unlock();
+#endif
+}
+
/**
Run a purge batch.
@param n_tasks number of purge tasks to submit to the queue
@param truncate whether to truncate the history at the end of the batch
@return number of undo log pages handled in the batch */
-ulint trx_purge(ulint n_tasks, bool truncate)
+TRANSACTIONAL_TARGET ulint trx_purge(ulint n_tasks, bool truncate)
{
que_thr_t* thr = NULL;
ulint n_pages_handled;
@@ -1256,6 +1356,8 @@ ulint trx_purge(ulint n_tasks, bool truncate)
trx_purge_wait_for_workers_to_complete();
+ purge_sys.clone_end_view();
+
if (truncate) {
trx_purge_truncate_history();
}
diff --git a/storage/innobase/trx/trx0rec.cc b/storage/innobase/trx/trx0rec.cc
index 33a3962047f..dc24f083d05 100644
--- a/storage/innobase/trx/trx0rec.cc
+++ b/storage/innobase/trx/trx0rec.cc
@@ -38,6 +38,7 @@ Created 3/26/1996 Heikki Tuuri
#include "trx0rseg.h"
#include "row0row.h"
#include "row0mysql.h"
+#include "row0ins.h"
/** The search tuple corresponding to TRX_UNDO_INSERT_METADATA. */
const dtuple_t trx_undo_metadata = {
@@ -58,10 +59,11 @@ const dtuple_t trx_undo_metadata = {
@return bytes left */
static ulint trx_undo_left(const buf_block_t *undo_block, const byte *ptr)
{
- ut_ad(ptr >= &undo_block->frame[TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_HDR_SIZE]);
+ ut_ad(ptr >=
+ &undo_block->page.frame[TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_HDR_SIZE]);
/* The 10 is supposed to be an extra safety margin (and needed for
compatibility with older versions) */
- lint left= srv_page_size - (ptr - undo_block->frame) -
+ lint left= srv_page_size - (ptr - undo_block->page.frame) -
(10 + FIL_PAGE_DATA_END);
ut_ad(left >= 0);
return left < 0 ? 0 : static_cast<ulint>(left);
@@ -81,14 +83,14 @@ trx_undo_page_set_next_prev_and_add(
written on this undo page. */
mtr_t* mtr) /*!< in: mtr */
{
- ut_ad(page_align(ptr) == undo_block->frame);
+ ut_ad(page_align(ptr) == undo_block->page.frame);
if (UNIV_UNLIKELY(trx_undo_left(undo_block, ptr) < 2))
return 0;
byte *ptr_to_first_free= my_assume_aligned<2>(TRX_UNDO_PAGE_HDR +
TRX_UNDO_PAGE_FREE +
- undo_block->frame);
+ undo_block->page.frame);
const uint16_t first_free= mach_read_from_2(ptr_to_first_free);
@@ -96,13 +98,14 @@ trx_undo_page_set_next_prev_and_add(
memcpy(ptr, ptr_to_first_free, 2);
ptr += 2;
- const uint16_t end_of_rec= static_cast<uint16_t>(ptr - undo_block->frame);
+ const uint16_t end_of_rec= static_cast<uint16_t>
+ (ptr - undo_block->page.frame);
/* Update the offset to first free undo record */
mach_write_to_2(ptr_to_first_free, end_of_rec);
/* Write offset of the next undo log record */
- memcpy(undo_block->frame + first_free, ptr_to_first_free, 2);
- const byte *start= undo_block->frame + first_free + 2;
+ memcpy(undo_block->page.frame + first_free, ptr_to_first_free, 2);
+ const byte *start= undo_block->page.frame + first_free + 2;
mtr->undo_append(*undo_block, start, ptr - start - 2);
return first_free;
@@ -371,19 +374,24 @@ trx_undo_report_insert_virtual(
return(true);
}
-/**********************************************************************//**
-Reports in the undo log of an insert of a clustered index record.
+/** Reports in the undo log of an insert of a clustered index record.
+@param undo_block undo log page
+@param trx transaction
+@param index clustered index
+@param clust_entry index entry which will be inserted into the
+ clustered index
+@param mtr mini-transaction
+@param write_empty write empty table undo log record
@return offset of the inserted entry on the page if succeed, 0 if fail */
static
uint16_t
trx_undo_page_report_insert(
-/*========================*/
- buf_block_t* undo_block, /*!< in: undo log page */
- trx_t* trx, /*!< in: transaction */
- dict_index_t* index, /*!< in: clustered index */
- const dtuple_t* clust_entry, /*!< in: index entry which will be
- inserted to the clustered index */
- mtr_t* mtr) /*!< in: mtr */
+ buf_block_t* undo_block,
+ trx_t* trx,
+ dict_index_t* index,
+ const dtuple_t* clust_entry,
+ mtr_t* mtr,
+ bool write_empty)
{
ut_ad(index->is_primary());
/* MariaDB 10.3.1+ in trx_undo_page_init() always initializes
@@ -391,13 +399,13 @@ trx_undo_page_report_insert(
TRX_UNDO_INSERT == 1 into insert_undo pages,
or TRX_UNDO_UPDATE == 2 into update_undo pages. */
ut_ad(mach_read_from_2(TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_TYPE
- + undo_block->frame) <= 2);
+ + undo_block->page.frame) <= 2);
uint16_t first_free = mach_read_from_2(my_assume_aligned<2>
(TRX_UNDO_PAGE_HDR
+ TRX_UNDO_PAGE_FREE
- + undo_block->frame));
- byte* ptr = undo_block->frame + first_free;
+ + undo_block->page.frame));
+ byte* ptr = undo_block->page.frame + first_free;
if (trx_undo_left(undo_block, ptr) < 2 + 1 + 11 + 11) {
/* Not enough space for writing the general parameters */
@@ -411,15 +419,23 @@ trx_undo_page_report_insert(
*ptr++ = TRX_UNDO_INSERT_REC;
ptr += mach_u64_write_much_compressed(ptr, trx->undo_no);
ptr += mach_u64_write_much_compressed(ptr, index->table->id);
+
+ if (write_empty) {
+ /* Table is in bulk operation */
+ undo_block->page.frame[first_free + 2] = TRX_UNDO_EMPTY;
+ goto done;
+ }
+
/*----------------------------------------*/
/* Store then the fields required to uniquely determine the record
to be inserted in the clustered index */
if (UNIV_UNLIKELY(clust_entry->info_bits != 0)) {
ut_ad(clust_entry->is_metadata());
ut_ad(index->is_instant());
- ut_ad(undo_block->frame[first_free + 2]
+ ut_ad(undo_block->page.frame[first_free + 2]
== TRX_UNDO_INSERT_REC);
- undo_block->frame[first_free + 2] = TRX_UNDO_INSERT_METADATA;
+ undo_block->page.frame[first_free + 2]
+ = TRX_UNDO_INSERT_METADATA;
goto done;
}
@@ -463,10 +479,10 @@ done:
/**********************************************************************//**
Reads from an undo log record the general parameters.
@return remaining part of undo log record after reading these values */
-byte*
+const byte*
trx_undo_rec_get_pars(
/*==================*/
- trx_undo_rec_t* undo_rec, /*!< in: undo log record */
+ const trx_undo_rec_t* undo_rec, /*!< in: undo log record */
ulint* type, /*!< out: undo record type:
TRX_UNDO_INSERT_REC, ... */
ulint* cmpl_info, /*!< out: compiler info, relevant only
@@ -476,41 +492,34 @@ trx_undo_rec_get_pars(
undo_no_t* undo_no, /*!< out: undo log record number */
table_id_t* table_id) /*!< out: table id */
{
- const byte* ptr;
ulint type_cmpl;
- ptr = undo_rec + 2;
-
- type_cmpl = mach_read_from_1(ptr);
- ptr++;
+ type_cmpl = undo_rec[2];
+ const byte *ptr = undo_rec + 3;
*updated_extern = !!(type_cmpl & TRX_UNDO_UPD_EXTERN);
type_cmpl &= ~TRX_UNDO_UPD_EXTERN;
*type = type_cmpl & (TRX_UNDO_CMPL_INFO_MULT - 1);
ut_ad(*type >= TRX_UNDO_RENAME_TABLE);
- ut_ad(*type <= TRX_UNDO_DEL_MARK_REC);
+ ut_ad(*type <= TRX_UNDO_EMPTY);
*cmpl_info = type_cmpl / TRX_UNDO_CMPL_INFO_MULT;
*undo_no = mach_read_next_much_compressed(&ptr);
*table_id = mach_read_next_much_compressed(&ptr);
ut_ad(*table_id);
- return(const_cast<byte*>(ptr));
+ return ptr;
}
/** Read from an undo log record a non-virtual column value.
-@param[in,out] ptr pointer to remaining part of the undo record
-@param[in,out] field stored field
-@param[in,out] len length of the field, or UNIV_SQL_NULL
-@param[in,out] orig_len original length of the locally stored part
+@param ptr pointer to remaining part of the undo record
+@param field stored field
+@param len length of the field, or UNIV_SQL_NULL
+@param orig_len original length of the locally stored part
of an externally stored column, or 0
@return remaining part of undo log record after reading these values */
-byte*
-trx_undo_rec_get_col_val(
- const byte* ptr,
- const byte** field,
- uint32_t* len,
- uint32_t* orig_len)
+const byte *trx_undo_rec_get_col_val(const byte *ptr, const byte **field,
+ uint32_t *len, uint32_t *orig_len)
{
*len = mach_read_next_compressed(&ptr);
*orig_len = 0;
@@ -548,16 +557,16 @@ trx_undo_rec_get_col_val(
}
}
- return(const_cast<byte*>(ptr));
+ return ptr;
}
/*******************************************************************//**
Builds a row reference from an undo log record.
@return pointer to remaining part of undo record */
-byte*
+const byte*
trx_undo_rec_get_row_ref(
/*=====================*/
- byte* ptr, /*!< in: remaining part of a copy of an undo log
+ const byte* ptr, /*!< in: remaining part of a copy of an undo log
record, at the start of the row reference;
NOTE that this copy of the undo log record must
be preserved as long as the row reference is
@@ -568,20 +577,16 @@ trx_undo_rec_get_row_ref(
mem_heap_t* heap) /*!< in: memory heap from which the memory
needed is allocated */
{
- ulint ref_len;
- ulint i;
-
- ut_ad(index && ptr && ref && heap);
- ut_a(dict_index_is_clust(index));
+ ut_ad(index->is_primary());
- ref_len = dict_index_get_n_unique(index);
+ const ulint ref_len = dict_index_get_n_unique(index);
dtuple_t* tuple = dtuple_create(heap, ref_len);
*ref = tuple;
dict_index_copy_types(tuple, index, ref_len);
- for (i = 0; i < ref_len; i++) {
+ for (ulint i = 0; i < ref_len; i++) {
const byte* field;
uint32_t len, orig_len;
@@ -592,29 +597,21 @@ trx_undo_rec_get_row_ref(
dfield_set_data(dfield, field, len);
}
- return(ptr);
+ return ptr;
}
-/*******************************************************************//**
-Skips a row reference from an undo log record.
+/** Skip a row reference from an undo log record.
+@param ptr part of an update undo log record
+@param index clustered index
@return pointer to remaining part of undo record */
-static
-byte*
-trx_undo_rec_skip_row_ref(
-/*======================*/
- byte* ptr, /*!< in: remaining part in update undo log
- record, at the start of the row reference */
- dict_index_t* index) /*!< in: clustered index */
+static const byte *trx_undo_rec_skip_row_ref(const byte *ptr,
+ const dict_index_t *index)
{
- ulint ref_len;
- ulint i;
-
- ut_ad(index && ptr);
- ut_a(dict_index_is_clust(index));
+ ut_ad(index->is_primary());
- ref_len = dict_index_get_n_unique(index);
+ ulint ref_len = dict_index_get_n_unique(index);
- for (i = 0; i < ref_len; i++) {
+ for (ulint i = 0; i < ref_len; i++) {
const byte* field;
uint32_t len, orig_len;
@@ -796,14 +793,14 @@ trx_undo_page_report_modify(
TRX_UNDO_INSERT == 1 into insert_undo pages,
or TRX_UNDO_UPDATE == 2 into update_undo pages. */
ut_ad(mach_read_from_2(TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_TYPE
- + undo_block->frame) <= 2);
+ + undo_block->page.frame) <= 2);
- byte* ptr_to_first_free = my_assume_aligned<2>(TRX_UNDO_PAGE_HDR
- + TRX_UNDO_PAGE_FREE
- + undo_block->frame);
+ byte* ptr_to_first_free = my_assume_aligned<2>(
+ TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_FREE
+ + undo_block->page.frame);
const uint16_t first_free = mach_read_from_2(ptr_to_first_free);
- byte *ptr = undo_block->frame + first_free;
+ byte *ptr = undo_block->page.frame + first_free;
if (trx_undo_left(undo_block, ptr) < 50) {
/* NOTE: the value 50 must be big enough so that the general
@@ -1392,12 +1389,12 @@ already_logged:
mach_write_to_2(ptr, first_free);
const uint16_t new_free = static_cast<uint16_t>(
- ptr + 2 - undo_block->frame);
- mach_write_to_2(undo_block->frame + first_free, new_free);
+ ptr + 2 - undo_block->page.frame);
+ mach_write_to_2(undo_block->page.frame + first_free, new_free);
mach_write_to_2(ptr_to_first_free, new_free);
- const byte* start = &undo_block->frame[first_free + 2];
+ const byte* start = &undo_block->page.frame[first_free + 2];
mtr->undo_append(*undo_block, start, ptr - start);
return(first_free);
}
@@ -1678,11 +1675,11 @@ trx_undo_page_report_rename(trx_t* trx, const dict_table_t* table,
{
byte* ptr_first_free = my_assume_aligned<2>(TRX_UNDO_PAGE_HDR
+ TRX_UNDO_PAGE_FREE
- + block->frame);
+ + block->page.frame);
const uint16_t first_free = mach_read_from_2(ptr_first_free);
ut_ad(first_free >= TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_HDR_SIZE);
ut_ad(first_free <= srv_page_size - FIL_PAGE_DATA_END);
- byte* const start = block->frame + first_free;
+ byte* const start = block->page.frame + first_free;
size_t len = strlen(table->name.m_name);
const size_t fixed = 2 + 1 + 11 + 11 + 2;
ut_ad(len <= NAME_CHAR_LEN * 5 * 2 + 1);
@@ -1704,7 +1701,7 @@ trx_undo_page_report_rename(trx_t* trx, const dict_table_t* table,
memcpy(ptr, table->name.m_name, len);
ptr += len;
mach_write_to_2(ptr, first_free);
- mach_write_to_2(ptr_first_free, ptr + 2 - block->frame);
+ mach_write_to_2(ptr_first_free, ptr + 2 - block->page.frame);
memcpy(start, ptr_first_free, 2);
mtr->undo_append(*block, start + 2, ptr - start - 2);
return first_free;
@@ -1745,9 +1742,8 @@ dberr_t trx_undo_report_rename(trx_t* trx, const dict_table_t* table)
} else {
mtr.commit();
mtr.start();
- block = trx_undo_add_page(undo, &mtr);
+ block = trx_undo_add_page(undo, &mtr, &err);
if (!block) {
- err = DB_OUT_OF_FILE_SPACE;
break;
}
}
@@ -1758,6 +1754,42 @@ dberr_t trx_undo_report_rename(trx_t* trx, const dict_table_t* table)
return err;
}
+TRANSACTIONAL_TARGET ATTRIBUTE_NOINLINE
+/** @return whether the transaction holds an exclusive lock on a table */
+static bool trx_has_lock_x(const trx_t &trx, dict_table_t& table)
+{
+ if (table.is_temporary())
+ return true;
+
+ uint32_t n;
+
+#if !defined NO_ELISION && !defined SUX_LOCK_GENERIC
+ if (xbegin())
+ {
+ if (table.lock_mutex_is_locked())
+ xabort();
+ n= table.n_lock_x_or_s;
+ xend();
+ }
+ else
+#endif
+ {
+ table.lock_mutex_lock();
+ n= table.n_lock_x_or_s;
+ table.lock_mutex_unlock();
+ }
+
+ /* This thread is executing trx. No other thread can modify our table locks
+ (only record locks might be created, in an implicit-to-explicit conversion).
+ Hence, no mutex is needed here. */
+ if (n)
+ for (const lock_t *lock : trx.lock.table_locks)
+ if (lock && lock->type_mode == (LOCK_X | LOCK_TABLE))
+ return true;
+
+ return false;
+}
+
/***********************************************************************//**
Writes information to an undo log about an insert, update, or a delete marking
of a clustered index record. This information is used in a rollback of the
@@ -1788,7 +1820,6 @@ trx_undo_report_row_operation(
undo log record */
{
trx_t* trx;
- mtr_t mtr;
#ifdef UNIV_DEBUG
int loop_count = 0;
#endif /* UNIV_DEBUG */
@@ -1804,6 +1835,39 @@ trx_undo_report_row_operation(
ut_ad(trx_state_eq(trx, TRX_STATE_ACTIVE));
ut_ad(!trx->in_rollback);
+ /* We must determine if this is the first time when this
+ transaction modifies this table. */
+ auto m = trx->mod_tables.emplace(index->table, trx->undo_no);
+ ut_ad(m.first->second.valid(trx->undo_no));
+
+ if (m.second && index->table->is_active_ddl()) {
+ trx->apply_online_log= true;
+ }
+
+ bool bulk = !rec;
+
+ if (!bulk) {
+ /* An UPDATE or DELETE must not be covered by an
+ earlier start_bulk_insert(). */
+ ut_ad(!m.first->second.is_bulk_insert());
+ } else if (m.first->second.is_bulk_insert()) {
+ /* Above, the emplace() tried to insert an object with
+ !is_bulk_insert(). Only an explicit start_bulk_insert()
+ (below) can set the flag. */
+ ut_ad(!m.second);
+ /* We already wrote a TRX_UNDO_EMPTY record. */
+ ut_ad(thr->run_node);
+ ut_ad(que_node_get_type(thr->run_node) == QUE_NODE_INSERT);
+ ut_ad(trx->bulk_insert);
+ return DB_SUCCESS;
+ } else if (m.second && trx->bulk_insert
+ && trx_has_lock_x(*trx, *index->table)) {
+ m.first->second.start_bulk_insert();
+ } else {
+ bulk = false;
+ }
+
+ mtr_t mtr;
mtr.start();
trx_undo_t** pundo;
trx_rseg_t* rseg;
@@ -1825,10 +1889,11 @@ trx_undo_report_row_operation(
buf_block_t* undo_block = trx_undo_assign_low(trx, rseg, pundo,
&err, &mtr);
trx_undo_t* undo = *pundo;
-
ut_ad((err == DB_SUCCESS) == (undo_block != NULL));
if (UNIV_UNLIKELY(undo_block == NULL)) {
- goto err_exit;
+err_exit:
+ mtr.commit();
+ return err;
}
ut_ad(undo != NULL);
@@ -1836,7 +1901,8 @@ trx_undo_report_row_operation(
do {
uint16_t offset = !rec
? trx_undo_page_report_insert(
- undo_block, trx, index, clust_entry, &mtr)
+ undo_block, trx, index, clust_entry, &mtr,
+ bulk)
: trx_undo_page_report_modify(
undo_block, trx, index, rec, offsets, update,
cmpl_info, clust_entry, &mtr);
@@ -1844,8 +1910,8 @@ trx_undo_report_row_operation(
if (UNIV_UNLIKELY(offset == 0)) {
const uint16_t first_free = mach_read_from_2(
TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_FREE
- + undo_block->frame);
- memset(undo_block->frame + first_free, 0,
+ + undo_block->page.frame);
+ memset(undo_block->page.frame + first_free, 0,
(srv_page_size - FIL_PAGE_DATA_END)
- first_free);
@@ -1862,7 +1928,7 @@ trx_undo_report_row_operation(
tree latch, which is the rseg
mutex. We must commit the mini-transaction
first, because it may be holding lower-level
- latches, such as SYNC_FSP and SYNC_FSP_PAGE. */
+ latches, such as SYNC_FSP_PAGE. */
mtr.commit();
mtr.start();
@@ -1870,11 +1936,19 @@ trx_undo_report_row_operation(
mtr.set_log_mode(MTR_LOG_NO_REDO);
}
- mutex_enter(&rseg->mutex);
- trx_undo_free_last_page(undo, &mtr);
- mutex_exit(&rseg->mutex);
+ rseg->latch.wr_lock(SRW_LOCK_CALL);
+ err = trx_undo_free_last_page(undo, &mtr);
+ rseg->latch.wr_unlock();
+
+ if (m.second) {
+ /* We are not going to modify
+ this table after all. */
+ trx->mod_tables.erase(m.first);
+ }
- err = DB_UNDO_RECORD_TOO_BIG;
+ if (err == DB_SUCCESS) {
+ err = DB_UNDO_RECORD_TOO_BIG;
+ }
goto err_exit;
} else {
/* Write log for clearing the unused
@@ -1904,28 +1978,24 @@ trx_undo_report_row_operation(
ut_ad(!undo->empty());
if (!is_temp) {
- const undo_no_t limit = undo->top_undo_no;
- /* Determine if this is the first time
- when this transaction modifies a
- system-versioned column in this table. */
- trx_mod_table_time_t& time
- = trx->mod_tables.insert(
- trx_mod_tables_t::value_type(
- index->table, limit))
- .first->second;
- ut_ad(time.valid(limit));
+ trx_mod_table_time_t& time = m.first->second;
+ ut_ad(time.valid(undo->top_undo_no));
if (!time.is_versioned()
&& index->table->versioned_by_id()
&& (!rec /* INSERT */
|| (update
&& update->affects_versioned()))) {
- time.set_versioned(limit);
+ time.set_versioned(undo->top_undo_no);
}
}
- *roll_ptr = trx_undo_build_roll_ptr(
- !rec, rseg->id, undo->top_page_no, offset);
+ if (!bulk) {
+ *roll_ptr = trx_undo_build_roll_ptr(
+ !rec, trx_sys.rseg_id(rseg, !is_temp),
+ undo->top_page_no, offset);
+ }
+
return(DB_SUCCESS);
}
@@ -1940,12 +2010,16 @@ trx_undo_report_row_operation(
mtr.set_log_mode(MTR_LOG_NO_REDO);
}
- undo_block = trx_undo_add_page(undo, &mtr);
+ undo_block = trx_undo_add_page(undo, &mtr, &err);
DBUG_EXECUTE_IF("ib_err_ins_undo_page_add_failure",
undo_block = NULL;);
} while (UNIV_LIKELY(undo_block != NULL));
+ if (err != DB_OUT_OF_FILE_SPACE) {
+ goto err_exit;
+ }
+
ib_errf(trx->mysql_thd, IB_LOG_LEVEL_ERROR,
DB_OUT_OF_FILE_SPACE,
//ER_INNODB_UNDO_LOG_FULL,
@@ -1956,12 +2030,7 @@ trx_undo_report_row_operation(
undo->rseg->space == fil_system.sys_space
? "system" : is_temp ? "temporary" : "undo");
- /* Did not succeed: out of space */
- err = DB_OUT_OF_FILE_SPACE;
-
-err_exit:
- mtr_commit(&mtr);
- return(err);
+ goto err_exit;
}
/*============== BUILDING PREVIOUS VERSION OF A RECORD ===============*/
@@ -1972,112 +2041,113 @@ err_exit:
static
trx_undo_rec_t*
trx_undo_get_undo_rec_low(
- roll_ptr_t roll_ptr,
- mem_heap_t* heap)
+ roll_ptr_t roll_ptr,
+ mem_heap_t* heap)
{
- trx_undo_rec_t* undo_rec;
- ulint rseg_id;
- uint32_t page_no;
- uint16_t offset;
- trx_rseg_t* rseg;
- bool is_insert;
- mtr_t mtr;
+ ulint rseg_id;
+ uint32_t page_no;
+ uint16_t offset;
+ bool is_insert;
+ mtr_t mtr;
- trx_undo_decode_roll_ptr(roll_ptr, &is_insert, &rseg_id, &page_no,
- &offset);
- ut_ad(page_no > FSP_FIRST_INODE_PAGE_NO);
- ut_ad(offset >= TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_HDR_SIZE);
- rseg = trx_sys.rseg_array[rseg_id];
- ut_ad(rseg->is_persistent());
+ trx_undo_decode_roll_ptr(roll_ptr, &is_insert, &rseg_id, &page_no, &offset);
+ ut_ad(page_no > FSP_FIRST_INODE_PAGE_NO);
+ ut_ad(offset >= TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_HDR_SIZE);
+ trx_rseg_t *rseg= &trx_sys.rseg_array[rseg_id];
+ ut_ad(rseg->is_persistent());
- mtr.start();
+ mtr.start();
- buf_block_t* undo_page = trx_undo_page_get_s_latched(
- page_id_t(rseg->space->id, page_no), &mtr);
+ const buf_block_t* undo_page=
+ buf_page_get(page_id_t(rseg->space->id, page_no), 0, RW_S_LATCH, &mtr);
- undo_rec = trx_undo_rec_copy(undo_page->frame + offset, heap);
+ trx_undo_rec_t *undo_rec= undo_page
+ ? trx_undo_rec_copy(undo_page->page.frame + offset, heap)
+ : nullptr;
- mtr.commit();
+ mtr.commit();
+ return undo_rec;
+}
- return(undo_rec);
+/** Copy an undo record to heap, to check if a secondary index record
+can be safely purged.
+@param trx_id DB_TRX_ID corresponding to roll_ptr
+@param name table name
+@param roll_ptr DB_ROLL_PTR pointing to the undo log record
+@param heap memory heap for allocation
+@return copy of the record
+@retval nullptr if the version is visible to purge_sys.view */
+static trx_undo_rec_t *trx_undo_get_rec_if_purgeable(trx_id_t trx_id,
+ const table_name_t &name,
+ roll_ptr_t roll_ptr,
+ mem_heap_t* heap)
+{
+ {
+ purge_sys_t::view_guard check;
+ if (!check.view().changes_visible(trx_id))
+ return trx_undo_get_undo_rec_low(roll_ptr, heap);
+ }
+ return nullptr;
}
/** Copy an undo record to heap.
-@param[in] roll_ptr roll pointer to record
-@param[in,out] heap memory heap where copied
-@param[in] trx_id id of the trx that generated
- the roll pointer: it points to an
- undo log of this transaction
-@param[in] name table name
-@param[out] undo_rec own: copy of the record
-@retval true if the undo log has been
-truncated and we cannot fetch the old version
-@retval false if the undo log record is available
-NOTE: the caller must have latches on the clustered index page. */
-static MY_ATTRIBUTE((warn_unused_result))
-bool
-trx_undo_get_undo_rec(
- roll_ptr_t roll_ptr,
- mem_heap_t* heap,
- trx_id_t trx_id,
- const table_name_t& name,
- trx_undo_rec_t** undo_rec)
+@param trx_id DB_TRX_ID corresponding to roll_ptr
+@param name table name
+@param roll_ptr DB_ROLL_PTR pointing to the undo log record
+@param heap memory heap for allocation
+@return copy of the record
+@retval nullptr if the undo log is not available */
+static trx_undo_rec_t *trx_undo_get_undo_rec(trx_id_t trx_id,
+ const table_name_t &name,
+ roll_ptr_t roll_ptr,
+ mem_heap_t *heap)
{
- rw_lock_s_lock(&purge_sys.latch);
-
- bool missing_history = purge_sys.changes_visible(trx_id, name);
- if (!missing_history) {
- *undo_rec = trx_undo_get_undo_rec_low(roll_ptr, heap);
- }
-
- rw_lock_s_unlock(&purge_sys.latch);
-
- return(missing_history);
+ {
+ purge_sys_t::end_view_guard check;
+ if (!check.view().changes_visible(trx_id))
+ return trx_undo_get_undo_rec_low(roll_ptr, heap);
+ }
+ return nullptr;
}
-#ifdef UNIV_DEBUG
-#define ATTRIB_USED_ONLY_IN_DEBUG
-#else /* UNIV_DEBUG */
-#define ATTRIB_USED_ONLY_IN_DEBUG MY_ATTRIBUTE((unused))
-#endif /* UNIV_DEBUG */
-
-/*******************************************************************//**
-Build a previous version of a clustered index record. The caller must
-hold a latch on the index page of the clustered index record.
-@retval true if previous version was built, or if it was an insert
-or the table has been rebuilt
-@retval false if the previous version is earlier than purge_view,
-or being purged, which means that it may have been removed */
-bool
+/** Build a previous version of a clustered index record. The caller
+must hold a latch on the index page of the clustered index record.
+@param rec version of a clustered index record
+@param index clustered index
+@param offsets rec_get_offsets(rec, index)
+@param heap memory heap from which the memory needed is
+ allocated
+@param old_vers previous version or NULL if rec is the
+ first inserted version, or if history data
+ has been deleted (an error), or if the purge
+ could have removed the version
+ though it has not yet done so
+@param v_heap memory heap used to create vrow
+ dtuple if it is not yet created. This heap
+ diffs from "heap" above in that it could be
+ prebuilt->old_vers_heap for selection
+@param vrow virtual column info, if any
+@param v_status status that determines whether this function is
+ invoked by the purge thread, and whether we read
+ the "after image" of the undo log
+@param undo_block undo log block which was cached during
+ online dml apply or nullptr
+@return error code
+@retval DB_SUCCESS if previous version was successfully built,
+or if it was an insert or the undo record refers to the table before rebuild
+@retval DB_MISSING_HISTORY if the history is missing */
+TRANSACTIONAL_TARGET
+dberr_t
trx_undo_prev_version_build(
-/*========================*/
- const rec_t* index_rec ATTRIB_USED_ONLY_IN_DEBUG,
- /*!< in: clustered index record in the
- index tree */
- mtr_t* index_mtr ATTRIB_USED_ONLY_IN_DEBUG,
- /*!< in: mtr which contains the latch to
- index_rec page and purge_view */
- const rec_t* rec, /*!< in: version of a clustered index record */
- dict_index_t* index, /*!< in: clustered index */
- rec_offs* offsets,/*!< in/out: rec_get_offsets(rec, index) */
- mem_heap_t* heap, /*!< in: memory heap from which the memory
- needed is allocated */
- rec_t** old_vers,/*!< out, own: previous version, or NULL if
- rec is the first inserted version, or if
- history data has been deleted (an error),
- or if the purge COULD have removed the version
- though it has not yet done so */
- mem_heap_t* v_heap, /* !< in: memory heap used to create vrow
- dtuple if it is not yet created. This heap
- diffs from "heap" above in that it could be
- prebuilt->old_vers_heap for selection */
- dtuple_t** vrow, /*!< out: virtual column info, if any */
+ const rec_t *rec,
+ dict_index_t *index,
+ rec_offs *offsets,
+ mem_heap_t *heap,
+ rec_t **old_vers,
+ mem_heap_t *v_heap,
+ dtuple_t **vrow,
ulint v_status)
- /*!< in: status determine if it is going
- into this function by purge thread or not.
- And if we read "after image" of undo log */
{
- trx_undo_rec_t* undo_rec = NULL;
dtuple_t* entry;
trx_id_t rec_trx_id;
ulint type;
@@ -2086,19 +2156,13 @@ trx_undo_prev_version_build(
trx_id_t trx_id;
roll_ptr_t roll_ptr;
upd_t* update;
- byte* ptr;
byte info_bits;
ulint cmpl_info;
bool dummy_extern;
byte* buf;
ut_ad(!index->table->is_temporary());
- ut_ad(!rw_lock_own(&purge_sys.latch, RW_LOCK_S));
- ut_ad(index_mtr->memo_contains_page_flagged(index_rec,
- MTR_MEMO_PAGE_S_FIX
- | MTR_MEMO_PAGE_X_FIX));
ut_ad(rec_offs_validate(rec, index, offsets));
- ut_a(index->is_primary());
roll_ptr = row_get_rec_roll_ptr(rec, index, offsets);
@@ -2106,34 +2170,31 @@ trx_undo_prev_version_build(
if (trx_undo_roll_ptr_is_insert(roll_ptr)) {
/* The record rec is the first inserted version */
- return(true);
+ return DB_SUCCESS;
}
rec_trx_id = row_get_rec_trx_id(rec, index, offsets);
ut_ad(!index->table->skip_alter_undo);
- if (trx_undo_get_undo_rec(
- roll_ptr, heap, rec_trx_id, index->table->name,
- &undo_rec)) {
- if (v_status & TRX_UNDO_PREV_IN_PURGE) {
- /* We are fetching the record being purged */
- undo_rec = trx_undo_get_undo_rec_low(roll_ptr, heap);
- } else {
- /* The undo record may already have been purged,
- during purge or semi-consistent read. */
- return(false);
- }
+ trx_undo_rec_t* undo_rec = v_status == TRX_UNDO_CHECK_PURGEABILITY
+ ? trx_undo_get_rec_if_purgeable(rec_trx_id, index->table->name,
+ roll_ptr, heap)
+ : trx_undo_get_undo_rec(rec_trx_id, index->table->name,
+ roll_ptr, heap);
+ if (!undo_rec) {
+ return DB_MISSING_HISTORY;
}
- ptr = trx_undo_rec_get_pars(undo_rec, &type, &cmpl_info,
- &dummy_extern, &undo_no, &table_id);
+ const byte *ptr =
+ trx_undo_rec_get_pars(undo_rec, &type, &cmpl_info,
+ &dummy_extern, &undo_no, &table_id);
if (table_id != index->table->id) {
/* The table should have been rebuilt, but purge has
not yet removed the undo log records for the
now-dropped old table (table_id). */
- return(true);
+ return DB_SUCCESS;
}
ptr = trx_undo_update_rec_get_sys_cols(ptr, &trx_id, &roll_ptr,
@@ -2181,26 +2242,9 @@ trx_undo_prev_version_build(
delete-marked record by trx_id, no transactions need to access
the BLOB. */
- /* the row_upd_changes_disowned_external(update) call could be
- omitted, but the synchronization on purge_sys.latch is likely
- more expensive. */
-
- if ((update->info_bits & REC_INFO_DELETED_FLAG)
- && row_upd_changes_disowned_external(update)) {
- bool missing_extern;
-
- rw_lock_s_lock(&purge_sys.latch);
-
- missing_extern = purge_sys.changes_visible(
- trx_id, index->table->name);
-
- rw_lock_s_unlock(&purge_sys.latch);
-
- if (missing_extern) {
- /* treat as a fresh insert, not to
- cause assertion error at the caller. */
- return(true);
- }
+ if (update->info_bits & REC_INFO_DELETED_FLAG
+ && purge_sys.is_purgeable(trx_id)) {
+ return DB_SUCCESS;
}
/* We have to set the appropriate extern storage bits in the
@@ -2215,8 +2259,8 @@ trx_undo_prev_version_build(
following call is safe. */
if (!row_upd_index_replace_new_col_vals(entry, *index, update,
heap)) {
- ut_a(v_status & TRX_UNDO_PREV_IN_PURGE);
- return false;
+ return (v_status & TRX_UNDO_PREV_IN_PURGE)
+ ? DB_MISSING_HISTORY : DB_CORRUPTION;
}
/* Get number of externally stored columns in updated record */
@@ -2313,7 +2357,7 @@ trx_undo_prev_version_build(
v_status & TRX_UNDO_PREV_IN_PURGE);
}
- return(true);
+ return DB_SUCCESS;
}
/** Read virtual column value from undo log
diff --git a/storage/innobase/trx/trx0roll.cc b/storage/innobase/trx/trx0roll.cc
index fbe5a7e9b0a..4de7ab29243 100644
--- a/storage/innobase/trx/trx0roll.cc
+++ b/storage/innobase/trx/trx0roll.cc
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 1996, 2017, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2016, 2021, MariaDB Corporation.
+Copyright (c) 2016, 2022, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -60,6 +60,7 @@ const trx_t* trx_roll_crash_recv_trx;
inline bool trx_t::rollback_finish()
{
mod_tables.clear();
+ apply_online_log= false;
if (UNIV_LIKELY(error_state == DB_SUCCESS))
{
commit();
@@ -84,7 +85,7 @@ inline bool trx_t::rollback_finish()
undo= nullptr;
}
commit_low();
- lock.que_state= TRX_QUE_RUNNING;
+ commit_cleanup();
return false;
}
@@ -134,23 +135,26 @@ inline void trx_t::rollback_low(trx_savept_t *savept)
}
else
{
+ /* There must not be partial rollback if transaction was chosen as deadlock
+ victim. Galera transaction abort can be invoked during partial rollback. */
+ ut_ad(!(lock.was_chosen_as_deadlock_victim & 1));
ut_a(error_state == DB_SUCCESS);
const undo_no_t limit= savept->least_undo_no;
+ apply_online_log= false;
for (trx_mod_tables_t::iterator i= mod_tables.begin();
- i != mod_tables.end(); )
+ i != mod_tables.end(); )
{
trx_mod_tables_t::iterator j= i++;
ut_ad(j->second.valid());
if (j->second.rollback(limit))
mod_tables.erase(j);
+ else if (!apply_online_log)
+ apply_online_log= j->first->is_active_ddl();
}
- lock.que_state= TRX_QUE_RUNNING;
MONITOR_INC(MONITOR_TRX_ROLLBACK_SAVEPOINT);
}
mem_heap_free(heap);
-
- MONITOR_DEC(MONITOR_TRX_ACTIVE);
}
/** Initiate rollback.
@@ -158,7 +162,7 @@ inline void trx_t::rollback_low(trx_savept_t *savept)
@return error code or DB_SUCCESS */
dberr_t trx_t::rollback(trx_savept_t *savept)
{
- ut_ad(!trx_mutex_own(this));
+ ut_ad(!mutex_is_owner());
if (state == TRX_STATE_NOT_STARTED)
{
error_state= DB_SUCCESS;
@@ -210,9 +214,13 @@ dberr_t trx_rollback_for_mysql(trx_t* trx)
case TRX_STATE_NOT_STARTED:
trx->will_lock = false;
ut_ad(trx->mysql_thd);
+ /* Galera transaction abort can be invoked from MDL acquision
+ code, so trx->lock.was_chosen_as_deadlock_victim can be set
+ even if trx->state is TRX_STATE_NOT_STARTED. */
+ ut_ad(!(trx->lock.was_chosen_as_deadlock_victim & 1));
#ifdef WITH_WSREP
trx->wsrep= false;
- trx->lock.was_chosen_as_wsrep_victim= false;
+ trx->lock.was_chosen_as_deadlock_victim= false;
#endif
return(DB_SUCCESS);
@@ -245,12 +253,10 @@ dberr_t trx_rollback_for_mysql(trx_t* trx)
== trx->rsegs.m_redo.rseg);
mtr_t mtr;
mtr.start();
- mutex_enter(&trx->rsegs.m_redo.rseg->mutex);
if (trx_undo_t* undo = trx->rsegs.m_redo.undo) {
trx_undo_set_state_at_prepare(trx, undo, true,
&mtr);
}
- mutex_exit(&trx->rsegs.m_redo.rseg->mutex);
/* Write the redo log for the XA ROLLBACK
state change to the global buffer. It is
not necessary to flush the redo log. If
@@ -302,11 +308,11 @@ trx_rollback_last_sql_stat_for_mysql(
if (trx->fts_trx != NULL) {
fts_savepoint_rollback_last_stmt(trx);
+ fts_savepoint_laststmt_refresh(trx);
}
- /* The following call should not be needed,
- but we play it safe: */
- trx_mark_sql_stat_end(trx);
+ trx->last_sql_stat_start.least_undo_no = trx->undo_no;
+ trx->end_bulk_insert();
trx->op_info = "";
@@ -362,24 +368,16 @@ trx_roll_savepoint_free(
ut_free(savep);
}
-/*******************************************************************//**
-Frees savepoint structs starting from savep. */
-void
-trx_roll_savepoints_free(
-/*=====================*/
- trx_t* trx, /*!< in: transaction handle */
- trx_named_savept_t* savep) /*!< in: free all savepoints starting
- with this savepoint i*/
+/** Discard all savepoints starting from a particular savepoint.
+@param savept first savepoint to discard */
+void trx_t::savepoints_discard(trx_named_savept_t *savept)
{
- while (savep != NULL) {
- trx_named_savept_t* next_savep;
-
- next_savep = UT_LIST_GET_NEXT(trx_savepoints, savep);
-
- trx_roll_savepoint_free(trx, savep);
-
- savep = next_savep;
- }
+ while (savept)
+ {
+ auto next= UT_LIST_GET_NEXT(trx_savepoints, savept);
+ trx_roll_savepoint_free(this, savept);
+ savept= next;
+ }
}
/*******************************************************************//**
@@ -412,8 +410,7 @@ trx_rollback_to_savepoint_for_mysql_low(
/* Free all savepoints strictly later than savep. */
- trx_roll_savepoints_free(
- trx, UT_LIST_GET_NEXT(trx_savepoints, savep));
+ trx->savepoints_discard(UT_LIST_GET_NEXT(trx_savepoints, savep));
*mysql_binlog_cache_pos = savep->mysql_binlog_cache_pos;
@@ -428,9 +425,6 @@ trx_rollback_to_savepoint_for_mysql_low(
trx_mark_sql_stat_end(trx);
trx->op_info = "";
-#ifdef WITH_WSREP
- trx->lock.was_chosen_as_wsrep_victim = false;
-#endif
return(err);
}
@@ -531,12 +525,15 @@ trx_savepoint_for_mysql(
savep->name = mem_strdup(savepoint_name);
- savep->savept = trx_savept_take(trx);
+ savep->savept.least_undo_no = trx->undo_no;
+ trx->last_sql_stat_start.least_undo_no = trx->undo_no;
savep->mysql_binlog_cache_pos = binlog_cache_pos;
UT_LIST_ADD_LAST(trx->trx_savepoints, savep);
+ trx->end_bulk_insert();
+
return(DB_SUCCESS);
}
@@ -561,24 +558,13 @@ trx_release_savepoint_for_mysql(
if (savep != NULL) {
trx_roll_savepoint_free(trx, savep);
+ return DB_SUCCESS;
+ } else if (trx->last_sql_stat_start.least_undo_no == 0) {
+ /* Bulk insert could have discarded savepoints */
+ return DB_SUCCESS;
}
- return(savep != NULL ? DB_SUCCESS : DB_NO_SAVEPOINT);
-}
-
-/*******************************************************************//**
-Returns a transaction savepoint taken at this point in time.
-@return savepoint */
-trx_savept_t
-trx_savept_take(
-/*============*/
- trx_t* trx) /*!< in: transaction */
-{
- trx_savept_t savept;
-
- savept.least_undo_no = trx->undo_no;
-
- return(savept);
+ return DB_NO_SAVEPOINT;
}
/*******************************************************************//**
@@ -599,7 +585,7 @@ trx_rollback_active(
heap = mem_heap_create(512);
- fork = que_fork_create(NULL, NULL, QUE_FORK_RECOVERY, heap);
+ fork = que_fork_create(heap);
fork->trx = trx;
thr = que_thr_create(fork, heap, NULL);
@@ -615,8 +601,7 @@ trx_rollback_active(
trx_roll_crash_recv_trx = trx;
- const bool dictionary_locked = trx_get_dict_operation(trx)
- != TRX_DICT_OP_NONE;
+ const bool dictionary_locked = trx->dict_operation;
if (dictionary_locked) {
row_mysql_lock_data_dictionary(trx);
@@ -632,26 +617,10 @@ trx_rollback_active(
if (UNIV_UNLIKELY(!trx->rollback_finish())) {
ut_ad(!dictionary_locked);
- goto func_exit;
- }
-
- ut_a(trx->lock.que_state == TRX_QUE_RUNNING);
-
- if (!dictionary_locked || !trx->table_id) {
- } else if (dict_table_t* table = dict_table_open_on_id(
- trx->table_id, TRUE, DICT_TABLE_OP_NORMAL)) {
- ib::info() << "Dropping table " << table->name
- << ", with id " << trx->table_id
- << " in recovery";
-
- dict_table_close_and_drop(trx, table);
-
- trx_commit_for_mysql(trx);
+ } else {
+ ib::info() << "Rolled back recovered transaction " << trx_id;
}
- ib::info() << "Rolled back recovered transaction " << trx_id;
-
-func_exit:
if (dictionary_locked) {
row_mysql_unlock_data_dictionary(trx);
}
@@ -673,7 +642,7 @@ struct trx_roll_count_callback_arg
static my_bool trx_roll_count_callback(rw_trx_hash_element_t *element,
trx_roll_count_callback_arg *arg)
{
- mutex_enter(&element->mutex);
+ element->mutex.wr_lock();
if (trx_t *trx= element->trx)
{
if (trx->is_recovered && trx_state_eq(trx, TRX_STATE_ACTIVE))
@@ -682,7 +651,7 @@ static my_bool trx_roll_count_callback(rw_trx_hash_element_t *element,
arg->n_rows+= trx->undo_no;
}
}
- mutex_exit(&element->mutex);
+ element->mutex.wr_unlock();
return 0;
}
@@ -690,9 +659,9 @@ static my_bool trx_roll_count_callback(rw_trx_hash_element_t *element,
void trx_roll_report_progress()
{
time_t now = time(NULL);
- mutex_enter(&recv_sys.mutex);
+ mysql_mutex_lock(&recv_sys.mutex);
bool report = recv_sys.report(now);
- mutex_exit(&recv_sys.mutex);
+ mysql_mutex_unlock(&recv_sys.mutex);
if (report) {
trx_roll_count_callback_arg arg;
@@ -720,19 +689,18 @@ void trx_roll_report_progress()
static my_bool trx_rollback_recovered_callback(rw_trx_hash_element_t *element,
std::vector<trx_t*> *trx_list)
{
- mutex_enter(&element->mutex);
+ element->mutex.wr_lock();
if (trx_t *trx= element->trx)
{
- mutex_enter(&trx->mutex);
+ trx->mutex_lock();
if (trx_state_eq(trx, TRX_STATE_ACTIVE) && trx->is_recovered)
trx_list->push_back(trx);
- mutex_exit(&trx->mutex);
+ trx->mutex_unlock();
}
- mutex_exit(&element->mutex);
+ element->mutex.wr_unlock();
return 0;
}
-
/**
Rollback any incomplete transactions which were encountered in crash recovery.
@@ -752,7 +720,8 @@ void trx_rollback_recovered(bool all)
{
std::vector<trx_t*> trx_list;
- ut_a(srv_force_recovery < SRV_FORCE_NO_TRX_UNDO);
+ ut_a(srv_force_recovery <
+ ulong(all ? SRV_FORCE_NO_TRX_UNDO : SRV_FORCE_NO_DDL_UNDO));
/*
Collect list of recovered ACTIVE transaction ids first. Once collected, no
@@ -768,17 +737,16 @@ void trx_rollback_recovered(bool all)
trx_list.pop_back();
ut_ad(trx);
- ut_d(trx_mutex_enter(trx));
+ ut_d(trx->mutex_lock());
ut_ad(trx->is_recovered);
ut_ad(trx_state_eq(trx, TRX_STATE_ACTIVE));
- ut_d(trx_mutex_exit(trx));
+ ut_d(trx->mutex_unlock());
if (srv_shutdown_state != SRV_SHUTDOWN_NONE && !srv_undo_sources &&
srv_fast_shutdown)
goto discard;
- if (all || trx_get_dict_operation(trx) != TRX_DICT_OP_NONE
- || trx->has_stats_table_lock())
+ if (all || trx->dict_operation || trx->has_stats_table_lock())
{
trx_rollback_active(trx);
if (trx->error_state != DB_SUCCESS)
@@ -818,19 +786,11 @@ Rollback or clean up any incomplete transactions which were
encountered in crash recovery. If the transaction already was
committed, then we clean up a possible insert undo log. If the
transaction was not yet committed, then we roll it back.
-Note: this is done in a background thread.
-@return a dummy parameter */
-extern "C"
-os_thread_ret_t
-DECLARE_THREAD(trx_rollback_all_recovered)(void*)
+Note: this is done in a background thread. */
+void trx_rollback_all_recovered(void*)
{
- my_thread_init();
ut_ad(!srv_read_only_mode);
-#ifdef UNIV_PFS_THREAD
- pfs_register_thread(trx_rollback_clean_thread_key);
-#endif /* UNIV_PFS_THREAD */
-
if (trx_sys.rw_trx_hash.size()) {
ib::info() << "Starting in background the rollback of"
" recovered transactions";
@@ -840,12 +800,6 @@ DECLARE_THREAD(trx_rollback_all_recovered)(void*)
}
trx_rollback_is_active = false;
-
- my_thread_end();
- /* We count the number of threads in os_thread_exit(). A created
- thread should always use that to exit and not use return() to exit. */
-
- return os_thread_exit();
}
/****************************************************************//**
@@ -864,10 +818,9 @@ trx_roll_graph_build(
que_fork_t* fork;
que_thr_t* thr;
- ut_ad(trx_mutex_own(trx));
-
+ ut_ad(trx->mutex_is_owner());
heap = mem_heap_create(512);
- fork = que_fork_create(NULL, NULL, QUE_FORK_ROLLBACK, heap);
+ fork = que_fork_create(heap);
fork->trx = trx;
thr = que_thr_create(fork, heap, NULL);
@@ -890,10 +843,9 @@ trx_rollback_start(
partial undo), 0 if we are rolling back
the entire transaction */
{
- ut_ad(trx_mutex_own(trx));
-
/* Initialize the rollback field in the transaction */
+ ut_ad(trx->mutex_is_owner());
ut_ad(!trx->roll_limit);
ut_ad(!trx->in_rollback);
@@ -910,8 +862,6 @@ trx_rollback_start(
trx->graph = roll_graph;
- trx->lock.que_state = TRX_QUE_ROLLING_BACK;
-
return(que_fork_start_command(roll_graph));
}
@@ -958,20 +908,19 @@ trx_rollback_step(
trx = thr_get_trx(thr);
- trx_mutex_enter(trx);
-
node->state = ROLL_NODE_WAIT;
ut_a(node->undo_thr == NULL);
roll_limit = node->savept ? node->savept->least_undo_no : 0;
+ trx->mutex_lock();
+
trx_commit_or_rollback_prepare(trx);
node->undo_thr = trx_rollback_start(trx, roll_limit);
- trx_mutex_exit(trx);
-
+ trx->mutex_unlock();
} else {
ut_ad(node->state == ROLL_NODE_WAIT);
diff --git a/storage/innobase/trx/trx0rseg.cc b/storage/innobase/trx/trx0rseg.cc
index 35690fb1775..6d95dcf06f1 100644
--- a/storage/innobase/trx/trx0rseg.cc
+++ b/storage/innobase/trx/trx0rseg.cc
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2017, 2021, MariaDB Corporation.
+Copyright (c) 2017, 2022, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -32,12 +32,24 @@ Created 3/26/1996 Heikki Tuuri
#include "srv0mon.h"
#ifdef WITH_WSREP
-#include <mysql/service_wsrep.h>
+# include <mysql/service_wsrep.h>
-#ifdef UNIV_DEBUG
+/** The offset to WSREP XID headers, after TRX_RSEG */
+# define TRX_RSEG_WSREP_XID_INFO TRX_RSEG_MAX_TRX_ID + 16 + 512
+
+/** WSREP XID format (1 if present and valid, 0 if not present) */
+# define TRX_RSEG_WSREP_XID_FORMAT TRX_RSEG_WSREP_XID_INFO
+/** WSREP XID GTRID length */
+# define TRX_RSEG_WSREP_XID_GTRID_LEN TRX_RSEG_WSREP_XID_INFO + 4
+/** WSREP XID bqual length */
+# define TRX_RSEG_WSREP_XID_BQUAL_LEN TRX_RSEG_WSREP_XID_INFO + 8
+/** WSREP XID data (XIDDATASIZE bytes) */
+# define TRX_RSEG_WSREP_XID_DATA TRX_RSEG_WSREP_XID_INFO + 12
+
+# ifdef UNIV_DEBUG
/** The latest known WSREP XID sequence number */
static long long wsrep_seqno = -1;
-#endif /* UNIV_DEBUG */
+# endif /* UNIV_DEBUG */
/** The latest known WSREP XID UUID */
static unsigned char wsrep_uuid[16];
@@ -57,28 +69,28 @@ trx_rseg_write_wsrep_checkpoint(
mtr->write<4,mtr_t::MAYBE_NOP>(*rseg_header,
TRX_RSEG + TRX_RSEG_WSREP_XID_FORMAT
- + rseg_header->frame,
+ + rseg_header->page.frame,
uint32_t(xid->formatID));
mtr->write<4,mtr_t::MAYBE_NOP>(*rseg_header,
TRX_RSEG + TRX_RSEG_WSREP_XID_GTRID_LEN
- + rseg_header->frame,
+ + rseg_header->page.frame,
uint32_t(xid->gtrid_length));
mtr->write<4,mtr_t::MAYBE_NOP>(*rseg_header,
TRX_RSEG + TRX_RSEG_WSREP_XID_BQUAL_LEN
- + rseg_header->frame,
+ + rseg_header->page.frame,
uint32_t(xid->bqual_length));
const ulint xid_length = static_cast<ulint>(xid->gtrid_length
+ xid->bqual_length);
mtr->memcpy<mtr_t::MAYBE_NOP>(*rseg_header,
TRX_RSEG + TRX_RSEG_WSREP_XID_DATA
- + rseg_header->frame,
+ + rseg_header->page.frame,
xid->data, xid_length);
if (xid_length < XIDDATASIZE
&& memcmp(TRX_RSEG + TRX_RSEG_WSREP_XID_DATA
- + rseg_header->frame, field_ref_zero,
+ + rseg_header->page.frame, field_ref_zero,
XIDDATASIZE - xid_length)) {
mtr->memset(rseg_header,
TRX_RSEG + TRX_RSEG_WSREP_XID_DATA + xid_length,
@@ -114,52 +126,37 @@ trx_rseg_update_wsrep_checkpoint(
trx_rseg_write_wsrep_checkpoint(rseg_header, xid, mtr);
}
-/** Clear the WSREP XID information from rollback segment header.
-@param[in,out] block rollback segment header
-@param[in,out] mtr mini-transaction */
-static void trx_rseg_clear_wsrep_checkpoint(buf_block_t *block, mtr_t *mtr)
+static dberr_t trx_rseg_update_wsrep_checkpoint(const XID* xid, mtr_t* mtr)
{
- mtr->memset(block, TRX_RSEG + TRX_RSEG_WSREP_XID_INFO,
- TRX_RSEG_WSREP_XID_DATA + XIDDATASIZE - TRX_RSEG_WSREP_XID_INFO,
- 0);
-}
-
-static void
-trx_rseg_update_wsrep_checkpoint(const XID* xid, mtr_t* mtr)
-{
- const byte* xid_uuid = wsrep_xid_uuid(xid);
- /* We must make check against wsrep_uuid here, the
- trx_rseg_update_wsrep_checkpoint() writes over wsrep_uuid with
- xid contents in debug mode and the memcmp() will never give nonzero
- result. */
- const bool must_clear_rsegs = memcmp(wsrep_uuid, xid_uuid,
- sizeof wsrep_uuid);
- const trx_rseg_t* rseg = trx_sys.rseg_array[0];
-
- buf_block_t* rseg_header = trx_rsegf_get(rseg->space, rseg->page_no,
- mtr);
- if (UNIV_UNLIKELY(mach_read_from_4(TRX_RSEG + TRX_RSEG_FORMAT
- + rseg_header->frame))) {
- trx_rseg_format_upgrade(rseg_header, mtr);
- }
-
- trx_rseg_update_wsrep_checkpoint(rseg_header, xid, mtr);
-
- if (must_clear_rsegs) {
- /* Because the UUID part of the WSREP XID differed
- from current_xid_uuid, the WSREP group UUID was
- changed, and we must reset the XID in all rollback
- segment headers. */
- for (ulint rseg_id = 1; rseg_id < TRX_SYS_N_RSEGS; ++rseg_id) {
- if (const trx_rseg_t* rseg =
- trx_sys.rseg_array[rseg_id]) {
- trx_rseg_clear_wsrep_checkpoint(
- trx_rsegf_get(rseg->space,
- rseg->page_no, mtr),
- mtr);
- }
- }
- }
+ dberr_t err;
+ buf_block_t *rseg_header = trx_sys.rseg_array[0].get(mtr, &err);
+
+ if (UNIV_UNLIKELY(!rseg_header))
+ return err;
+
+ /* We must make check against wsrep_uuid here, the
+ trx_rseg_update_wsrep_checkpoint() writes over wsrep_uuid with xid
+ contents in debug mode and the memcmp() will never give nonzero
+ result. */
+ const bool must_clear_rsegs=
+ memcmp(wsrep_uuid, wsrep_xid_uuid(xid), sizeof wsrep_uuid);
+
+ if (UNIV_UNLIKELY(mach_read_from_4(TRX_RSEG + TRX_RSEG_FORMAT +
+ rseg_header->page.frame)))
+ trx_rseg_format_upgrade(rseg_header, mtr);
+
+ trx_rseg_update_wsrep_checkpoint(rseg_header, xid, mtr);
+
+ if (must_clear_rsegs)
+ /* Because the UUID part of the WSREP XID differed from
+ current_xid_uuid, the WSREP group UUID was changed, and we must
+ reset the XID in all rollback segment headers. */
+ for (ulint rseg_id= 1; rseg_id < TRX_SYS_N_RSEGS; ++rseg_id)
+ if (buf_block_t* block= trx_sys.rseg_array[rseg_id].get(mtr, &err))
+ mtr->memset(block, TRX_RSEG + TRX_RSEG_WSREP_XID_INFO,
+ TRX_RSEG_WSREP_XID_DATA + XIDDATASIZE -
+ TRX_RSEG_WSREP_XID_INFO, 0);
+ return err;
}
/** Update WSREP checkpoint XID in first rollback segment header
@@ -186,7 +183,7 @@ bool trx_rseg_read_wsrep_checkpoint(const buf_block_t *rseg_header, XID &xid)
{
int formatID = static_cast<int>(
mach_read_from_4(TRX_RSEG + TRX_RSEG_WSREP_XID_FORMAT
- + rseg_header->frame));
+ + rseg_header->page.frame));
if (formatID == 0) {
return false;
}
@@ -194,14 +191,14 @@ bool trx_rseg_read_wsrep_checkpoint(const buf_block_t *rseg_header, XID &xid)
xid.formatID = formatID;
xid.gtrid_length = static_cast<int>(
mach_read_from_4(TRX_RSEG + TRX_RSEG_WSREP_XID_GTRID_LEN
- + rseg_header->frame));
+ + rseg_header->page.frame));
xid.bqual_length = static_cast<int>(
mach_read_from_4(TRX_RSEG + TRX_RSEG_WSREP_XID_BQUAL_LEN
- + rseg_header->frame));
+ + rseg_header->page.frame));
memcpy(xid.data, TRX_RSEG + TRX_RSEG_WSREP_XID_DATA
- + rseg_header->frame, XIDDATASIZE);
+ + rseg_header->page.frame, XIDDATASIZE);
return true;
}
@@ -250,6 +247,9 @@ bool trx_rseg_read_wsrep_checkpoint(XID& xid)
rseg_id++, mtr.commit()) {
mtr.start();
const buf_block_t* sys = trx_sysf_get(&mtr, false);
+ if (UNIV_UNLIKELY(!sys)) {
+ continue;
+ }
const uint32_t page_no = trx_sysf_rseg_get_page_no(
sys, rseg_id);
@@ -257,11 +257,17 @@ bool trx_rseg_read_wsrep_checkpoint(XID& xid)
continue;
}
- const buf_block_t* rseg_header = trx_rsegf_get_new(
- trx_sysf_rseg_get_space(sys, rseg_id), page_no, &mtr);
+ const buf_block_t* rseg_header = buf_page_get_gen(
+ page_id_t(trx_sysf_rseg_get_space(sys, rseg_id),
+ page_no),
+ 0, RW_S_LATCH, nullptr, BUF_GET, &mtr);
+
+ if (!rseg_header) {
+ continue;
+ }
if (mach_read_from_4(TRX_RSEG + TRX_RSEG_FORMAT
- + rseg_header->frame)) {
+ + rseg_header->page.frame)) {
continue;
}
@@ -282,6 +288,17 @@ bool trx_rseg_read_wsrep_checkpoint(XID& xid)
}
#endif /* WITH_WSREP */
+buf_block_t *trx_rseg_t::get(mtr_t *mtr, dberr_t *err) const
+{
+ if (!space)
+ {
+ if (err) *err= DB_TABLESPACE_NOT_FOUND;
+ return nullptr;
+ }
+ return buf_page_get_gen(page_id(), 0, RW_X_LATCH, nullptr,
+ BUF_GET, mtr, err);
+}
+
/** Upgrade a rollback segment header page to MariaDB 10.3 format.
@param[in,out] rseg_header rollback segment header page
@param[in,out] mtr mini-transaction */
@@ -297,126 +314,96 @@ void trx_rseg_format_upgrade(buf_block_t *rseg_header, mtr_t *mtr)
}
/** Create a rollback segment header.
-@param[in,out] space system, undo, or temporary tablespace
-@param[in] rseg_id rollback segment identifier
-@param[in] max_trx_id new value of TRX_RSEG_MAX_TRX_ID
-@param[in,out] sys_header the TRX_SYS page (NULL for temporary rseg)
-@param[in,out] mtr mini-transaction
+@param[in,out] space system, undo, or temporary tablespace
+@param[in] rseg_id rollback segment identifier
+@param[in] max_trx_id new value of TRX_RSEG_MAX_TRX_ID
+@param[in,out] mtr mini-transaction
+@param[out] err error code
@return the created rollback segment
-@retval NULL on failure */
-buf_block_t*
-trx_rseg_header_create(
- fil_space_t* space,
- ulint rseg_id,
- trx_id_t max_trx_id,
- buf_block_t* sys_header,
- mtr_t* mtr)
+@retval nullptr on failure */
+buf_block_t *trx_rseg_header_create(fil_space_t *space, ulint rseg_id,
+ trx_id_t max_trx_id, mtr_t *mtr,
+ dberr_t *err)
{
- buf_block_t* block;
-
- ut_ad(mtr->memo_contains(*space));
- ut_ad(!sys_header == (space == fil_system.temp_space));
-
- /* Allocate a new file segment for the rollback segment */
- block = fseg_create(space, TRX_RSEG + TRX_RSEG_FSEG_HEADER, mtr);
-
- if (block == NULL) {
- /* No space left */
- return block;
- }
-
- buf_block_dbg_add_level(block, SYNC_RSEG_HEADER_NEW);
-
- ut_ad(0 == mach_read_from_4(TRX_RSEG_FORMAT + TRX_RSEG
- + block->frame));
- ut_ad(0 == mach_read_from_4(TRX_RSEG_HISTORY_SIZE + TRX_RSEG
- + block->frame));
- ut_ad(0 == mach_read_from_4(TRX_RSEG_MAX_TRX_ID + TRX_RSEG
- + block->frame));
-
- /* Initialize the history list */
- flst_init(block, TRX_RSEG_HISTORY + TRX_RSEG, mtr);
-
- mtr->write<8,mtr_t::MAYBE_NOP>(*block,
- TRX_RSEG + TRX_RSEG_MAX_TRX_ID
- + block->frame, max_trx_id);
-
- /* Reset the undo log slots */
- mtr->memset(block, TRX_RSEG_UNDO_SLOTS + TRX_RSEG,
- TRX_RSEG_N_SLOTS * 4, 0xff);
-
- if (sys_header) {
- /* Add the rollback segment info to the free slot in
- the trx system header */
-
- mtr->write<4,mtr_t::MAYBE_NOP>(
- *sys_header,
- TRX_SYS + TRX_SYS_RSEGS + TRX_SYS_RSEG_SPACE
- + rseg_id * TRX_SYS_RSEG_SLOT_SIZE
- + sys_header->frame, space->id);
- mtr->write<4,mtr_t::MAYBE_NOP>(
- *sys_header,
- TRX_SYS + TRX_SYS_RSEGS + TRX_SYS_RSEG_PAGE_NO
- + rseg_id * TRX_SYS_RSEG_SLOT_SIZE
- + sys_header->frame, block->page.id().page_no());
- }
-
- return block;
+ ut_ad(mtr->memo_contains(*space));
+ buf_block_t *block=
+ fseg_create(space, TRX_RSEG + TRX_RSEG_FSEG_HEADER, mtr, err);
+ if (block)
+ {
+ ut_ad(0 == mach_read_from_4(TRX_RSEG_FORMAT + TRX_RSEG +
+ block->page.frame));
+ ut_ad(0 == mach_read_from_4(TRX_RSEG_HISTORY_SIZE + TRX_RSEG +
+ block->page.frame));
+ ut_ad(0 == mach_read_from_4(TRX_RSEG_MAX_TRX_ID + TRX_RSEG +
+ block->page.frame));
+
+ /* Initialize the history list */
+ flst_init(block, TRX_RSEG_HISTORY + TRX_RSEG, mtr);
+
+ mtr->write<8,mtr_t::MAYBE_NOP>(*block, TRX_RSEG + TRX_RSEG_MAX_TRX_ID +
+ block->page.frame, max_trx_id);
+
+ /* Reset the undo log slots */
+ mtr->memset(block, TRX_RSEG_UNDO_SLOTS + TRX_RSEG, TRX_RSEG_N_SLOTS * 4,
+ 0xff);
+ }
+ return block;
}
-/** Free a rollback segment in memory. */
-void
-trx_rseg_mem_free(trx_rseg_t* rseg)
+void trx_rseg_t::destroy()
{
- trx_undo_t* undo;
- trx_undo_t* next_undo;
-
- mutex_free(&rseg->mutex);
-
- /* There can't be any active transactions. */
- ut_a(UT_LIST_GET_LEN(rseg->undo_list) == 0);
+ latch.destroy();
- for (undo = UT_LIST_GET_FIRST(rseg->undo_cached);
- undo != NULL;
- undo = next_undo) {
+ /* There can't be any active transactions. */
+ ut_a(!UT_LIST_GET_LEN(undo_list));
- next_undo = UT_LIST_GET_NEXT(undo_list, undo);
-
- UT_LIST_REMOVE(rseg->undo_cached, undo);
-
- MONITOR_DEC(MONITOR_NUM_UNDO_SLOT_CACHED);
-
- ut_free(undo);
- }
-
- ut_free(rseg);
+ for (trx_undo_t *next, *undo= UT_LIST_GET_FIRST(undo_cached); undo;
+ undo= next)
+ {
+ next= UT_LIST_GET_NEXT(undo_list, undo);
+ UT_LIST_REMOVE(undo_cached, undo);
+ ut_free(undo);
+ }
}
-/** Create a rollback segment object.
-@param[in] id rollback segment id
-@param[in] space space where the segment is placed
-@param[in] page_no page number of the segment header */
-static
-trx_rseg_t*
-trx_rseg_mem_create(ulint id, fil_space_t* space, uint32_t page_no)
+void trx_rseg_t::init(fil_space_t *space, uint32_t page)
{
- trx_rseg_t* rseg = static_cast<trx_rseg_t*>(
- ut_zalloc_nokey(sizeof *rseg));
+ latch.SRW_LOCK_INIT(trx_rseg_latch_key);
+ ut_ad(!this->space);
+ this->space= space;
+ page_no= page;
+ last_page_no= FIL_NULL;
+ curr_size= 1;
+
+ UT_LIST_INIT(undo_list, &trx_undo_t::undo_list);
+ UT_LIST_INIT(undo_cached, &trx_undo_t::undo_list);
+}
- rseg->id = id;
- rseg->space = space;
- rseg->page_no = page_no;
- rseg->last_page_no = FIL_NULL;
- rseg->curr_size = 1;
+void trx_rseg_t::reinit(uint32_t page)
+{
+ ut_ad(is_persistent());
+ ut_ad(page_no == page);
+ ut_a(!UT_LIST_GET_LEN(undo_list));
+ ut_ad(!history_size || UT_LIST_GET_FIRST(undo_cached));
- mutex_create(rseg->is_persistent()
- ? LATCH_ID_REDO_RSEG : LATCH_ID_NOREDO_RSEG,
- &rseg->mutex);
+ history_size= 0;
+ page_no= page;
- UT_LIST_INIT(rseg->undo_list, &trx_undo_t::undo_list);
- UT_LIST_INIT(rseg->undo_cached, &trx_undo_t::undo_list);
+ for (trx_undo_t *next, *undo= UT_LIST_GET_FIRST(undo_cached); undo;
+ undo= next)
+ {
+ next= UT_LIST_GET_NEXT(undo_list, undo);
+ UT_LIST_REMOVE(undo_cached, undo);
+ MONITOR_DEC(MONITOR_NUM_UNDO_SLOT_CACHED);
+ ut_free(undo);
+ }
- return(rseg);
+ ut_ad(!is_referenced());
+ needs_purge= 0;
+ last_commit_and_offset= 0;
+ last_page_no= FIL_NULL;
+ curr_size= 1;
+ ref.store(0, std::memory_order_release);
}
/** Read the undo log lists.
@@ -451,99 +438,96 @@ static dberr_t trx_undo_lists_init(trx_rseg_t *rseg,
@return error code */
static dberr_t trx_rseg_mem_restore(trx_rseg_t *rseg, mtr_t *mtr)
{
- buf_block_t* rseg_hdr = trx_rsegf_get_new(
- rseg->space->id, rseg->page_no, mtr);
-
- if (!mach_read_from_4(TRX_RSEG + TRX_RSEG_FORMAT + rseg_hdr->frame)) {
- trx_id_t id = mach_read_from_8(TRX_RSEG + TRX_RSEG_MAX_TRX_ID
- + rseg_hdr->frame);
- if (id > rseg->needs_purge) {
- rseg->needs_purge = id;
- }
+ if (!rseg->space)
+ return DB_TABLESPACE_NOT_FOUND;
+ dberr_t err;
+ const buf_block_t *rseg_hdr=
+ buf_page_get_gen(rseg->page_id(), 0, RW_S_LATCH, nullptr, BUF_GET, mtr,
+ &err);
+ if (!rseg_hdr)
+ return err;
+
+ if (!mach_read_from_4(TRX_RSEG + TRX_RSEG_FORMAT + rseg_hdr->page.frame))
+ {
+ trx_id_t id= mach_read_from_8(TRX_RSEG + TRX_RSEG_MAX_TRX_ID +
+ rseg_hdr->page.frame);
- const byte* binlog_name = TRX_RSEG + TRX_RSEG_BINLOG_NAME
- + rseg_hdr->frame;
- if (*binlog_name) {
- lsn_t lsn = mach_read_from_8(my_assume_aligned<8>(
- FIL_PAGE_LSN
- + rseg_hdr
- ->frame));
- compile_time_assert(TRX_RSEG_BINLOG_NAME_LEN == sizeof
- trx_sys.recovered_binlog_filename);
- if (lsn > trx_sys.recovered_binlog_lsn) {
- trx_sys.recovered_binlog_lsn = lsn;
- trx_sys.recovered_binlog_offset
- = mach_read_from_8(
- TRX_RSEG
- + TRX_RSEG_BINLOG_OFFSET
- + rseg_hdr->frame);
- memcpy(trx_sys.recovered_binlog_filename,
- binlog_name,
- TRX_RSEG_BINLOG_NAME_LEN);
- }
+ if (id > rseg->needs_purge)
+ rseg->needs_purge= id;
+
+ const byte *binlog_name=
+ TRX_RSEG + TRX_RSEG_BINLOG_NAME + rseg_hdr->page.frame;
+ if (*binlog_name)
+ {
+ lsn_t lsn= mach_read_from_8(my_assume_aligned<8>
+ (FIL_PAGE_LSN + rseg_hdr->page.frame));
+ static_assert(TRX_RSEG_BINLOG_NAME_LEN ==
+ sizeof trx_sys.recovered_binlog_filename, "compatibility");
+ if (lsn > trx_sys.recovered_binlog_lsn)
+ {
+ trx_sys.recovered_binlog_lsn= lsn;
+ trx_sys.recovered_binlog_offset=
+ mach_read_from_8(TRX_RSEG + TRX_RSEG_BINLOG_OFFSET +
+ rseg_hdr->page.frame);
+ memcpy(trx_sys.recovered_binlog_filename, binlog_name,
+ TRX_RSEG_BINLOG_NAME_LEN);
+ }
#ifdef WITH_WSREP
- trx_rseg_read_wsrep_checkpoint(
- rseg_hdr, trx_sys.recovered_wsrep_xid);
+ trx_rseg_read_wsrep_checkpoint(rseg_hdr, trx_sys.recovered_wsrep_xid);
#endif
- }
- }
-
- if (srv_operation == SRV_OPERATION_RESTORE) {
- /* mariabackup --prepare only deals with
- the redo log and the data files, not with
- transactions or the data dictionary. */
- return DB_SUCCESS;
- }
-
- /* Initialize the undo log lists according to the rseg header */
-
- rseg->curr_size = mach_read_from_4(TRX_RSEG + TRX_RSEG_HISTORY_SIZE
- + rseg_hdr->frame)
- + 1;
- if (dberr_t err = trx_undo_lists_init(rseg, rseg_hdr)) {
- return err;
- }
-
- if (auto len = flst_get_len(TRX_RSEG + TRX_RSEG_HISTORY
- + rseg_hdr->frame)) {
- trx_sys.rseg_history_len += len;
-
- fil_addr_t node_addr = flst_get_last(TRX_RSEG
- + TRX_RSEG_HISTORY
- + rseg_hdr->frame);
- node_addr.boffset = static_cast<uint16_t>(
- node_addr.boffset - TRX_UNDO_HISTORY_NODE);
-
- rseg->last_page_no = node_addr.page;
-
- const buf_block_t* block = trx_undo_page_get(
- page_id_t(rseg->space->id, node_addr.page), mtr);
-
- trx_id_t trx_id, id;
- trx_id = mach_read_from_8(block->frame + node_addr.boffset
- + TRX_UNDO_TRX_ID);
- id = mach_read_from_8(block->frame + node_addr.boffset
- + TRX_UNDO_TRX_NO);
- trx_id = std::max(trx_id, id);
-
- if (trx_id > rseg->needs_purge) {
- rseg->needs_purge = trx_id;
- }
+ }
+ }
- rseg->set_last_commit(node_addr.boffset, id);
- ut_ad(mach_read_from_2(block->frame + node_addr.boffset
- + TRX_UNDO_NEEDS_PURGE) <= 1);
+ if (srv_operation == SRV_OPERATION_RESTORE)
+ /* mariabackup --prepare only deals with
+ the redo log and the data files, not with
+ transactions or the data dictionary. */
+ return DB_SUCCESS;
- if (rseg->last_page_no != FIL_NULL) {
+ /* Initialize the undo log lists according to the rseg header */
- /* There is no need to cover this operation by the purge
- mutex because we are still bootstrapping. */
- purge_sys.purge_queue.push(*rseg);
- }
- }
+ rseg->curr_size = mach_read_from_4(TRX_RSEG + TRX_RSEG_HISTORY_SIZE +
+ rseg_hdr->page.frame) + 1;
+ err= trx_undo_lists_init(rseg, rseg_hdr);
+ if (err != DB_SUCCESS);
+ else if (auto len= flst_get_len(TRX_RSEG + TRX_RSEG_HISTORY +
+ rseg_hdr->page.frame))
+ {
+ rseg->history_size+= len;
+
+ fil_addr_t node_addr= flst_get_last(TRX_RSEG + TRX_RSEG_HISTORY +
+ rseg_hdr->page.frame);
+ node_addr.boffset= static_cast<uint16_t>(node_addr.boffset -
+ TRX_UNDO_HISTORY_NODE);
+ rseg->last_page_no= node_addr.page;
+
+ const buf_block_t* block=
+ buf_page_get_gen(page_id_t(rseg->space->id, node_addr.page),
+ 0, RW_S_LATCH, nullptr, BUF_GET, mtr, &err);
+ if (!block)
+ return err;
+
+ trx_id_t id= mach_read_from_8(block->page.frame + node_addr.boffset +
+ TRX_UNDO_TRX_ID);
+ if (id > rseg->needs_purge)
+ rseg->needs_purge= id;
+ id= mach_read_from_8(block->page.frame + node_addr.boffset +
+ TRX_UNDO_TRX_NO);
+ if (id > rseg->needs_purge)
+ rseg->needs_purge= id;
+
+ rseg->set_last_commit(node_addr.boffset, id);
+ ut_ad(mach_read_from_2(block->page.frame + node_addr.boffset +
+ TRX_UNDO_NEEDS_PURGE) <= 1);
+
+ if (rseg->last_page_no != FIL_NULL)
+ /* There is no need to cover this operation by the purge
+ mutex because we are still bootstrapping. */
+ purge_sys.purge_queue.push(*rseg);
+ }
- return DB_SUCCESS;
+ return err;
}
/** Read binlog metadata from the TRX_SYS page, in case we are upgrading
@@ -592,8 +576,8 @@ dberr_t trx_rseg_array_init()
information from the TRX_SYS page. */
max_trx_id = mach_read_from_8(
TRX_SYS + TRX_SYS_TRX_ID_STORE
- + sys->frame);
- trx_rseg_init_binlog_info(sys->frame);
+ + sys->page.frame);
+ trx_rseg_init_binlog_info(sys->page.frame);
#ifdef WITH_WSREP
wsrep_sys_xid.set(&trx_sys.recovered_wsrep_xid);
#endif
@@ -602,20 +586,16 @@ dberr_t trx_rseg_array_init()
const uint32_t page_no = trx_sysf_rseg_get_page_no(
sys, rseg_id);
if (page_no != FIL_NULL) {
- trx_rseg_t* rseg = trx_rseg_mem_create(
- rseg_id,
- fil_space_get(trx_sysf_rseg_get_space(
- sys, rseg_id)),
- page_no);
- ut_ad(rseg->is_persistent());
- ut_ad(rseg->id == rseg_id);
- ut_ad(!trx_sys.rseg_array[rseg_id]);
- trx_sys.rseg_array[rseg_id] = rseg;
- err = trx_rseg_mem_restore(rseg, &mtr);
- if (rseg->needs_purge > max_trx_id) {
- max_trx_id = rseg->needs_purge;
+ trx_rseg_t& rseg = trx_sys.rseg_array[rseg_id];
+ rseg.init(fil_space_get(
+ trx_sysf_rseg_get_space(
+ sys, rseg_id)),
+ page_no);
+ ut_ad(rseg.is_persistent());
+ err = trx_rseg_mem_restore(&rseg, &mtr);
+ if (rseg.needs_purge > max_trx_id) {
+ max_trx_id = rseg.needs_purge;
}
-
if (err != DB_SUCCESS) {
mtr.commit();
break;
@@ -640,15 +620,10 @@ dberr_t trx_rseg_array_init()
}
if (err != DB_SUCCESS) {
- for (ulint rseg_id = 0; rseg_id < TRX_SYS_N_RSEGS; rseg_id++) {
- if (trx_rseg_t*& rseg = trx_sys.rseg_array[rseg_id]) {
- while (trx_undo_t* u= UT_LIST_GET_FIRST(
- rseg->undo_list)) {
- UT_LIST_REMOVE(rseg->undo_list, u);
- ut_free(u);
- }
- trx_rseg_mem_free(rseg);
- rseg = NULL;
+ for (auto& rseg : trx_sys.rseg_array) {
+ while (auto u = UT_LIST_GET_FIRST(rseg.undo_list)) {
+ UT_LIST_REMOVE(rseg.undo_list, u);
+ ut_free(u);
}
}
return err;
@@ -679,64 +654,27 @@ dberr_t trx_rseg_array_init()
return DB_SUCCESS;
}
-/** Create a persistent rollback segment.
-@param[in] space_id system or undo tablespace id
-@return pointer to new rollback segment
-@retval NULL on failure */
-trx_rseg_t*
-trx_rseg_create(ulint space_id)
-{
- trx_rseg_t* rseg = NULL;
- mtr_t mtr;
-
- mtr.start();
-
- fil_space_t* space = mtr_x_lock_space(space_id, &mtr);
- ut_ad(space->purpose == FIL_TYPE_TABLESPACE);
-
- if (buf_block_t* sys_header = trx_sysf_get(&mtr)) {
- ulint rseg_id = trx_sys_rseg_find_free(sys_header);
- if (buf_block_t* rblock = rseg_id == ULINT_UNDEFINED
- ? NULL
- : trx_rseg_header_create(space, rseg_id, 0, sys_header,
- &mtr)) {
- ut_ad(trx_sysf_rseg_get_space(sys_header, rseg_id)
- == space_id);
- rseg = trx_rseg_mem_create(rseg_id, space,
- rblock->page.id().
- page_no());
- ut_ad(rseg->id == rseg_id);
- ut_ad(rseg->is_persistent());
- ut_ad(!trx_sys.rseg_array[rseg->id]);
- trx_sys.rseg_array[rseg->id] = rseg;
- }
- }
-
- mtr.commit();
-
- return(rseg);
-}
-
/** Create the temporary rollback segments. */
-void
-trx_temp_rseg_create()
+dberr_t trx_temp_rseg_create(mtr_t *mtr)
{
- mtr_t mtr;
-
- for (ulong i = 0; i < TRX_SYS_N_RSEGS; i++) {
- mtr.start();
- mtr.set_log_mode(MTR_LOG_NO_REDO);
- mtr_x_lock_space(fil_system.temp_space, &mtr);
-
- buf_block_t* rblock = trx_rseg_header_create(
- fil_system.temp_space, i, 0, NULL, &mtr);
- trx_rseg_t* rseg = trx_rseg_mem_create(
- i, fil_system.temp_space, rblock->page.id().page_no());
- ut_ad(!rseg->is_persistent());
- ut_ad(!trx_sys.temp_rsegs[i]);
- trx_sys.temp_rsegs[i] = rseg;
- mtr.commit();
- }
+ for (ulong i= 0; i < array_elements(trx_sys.temp_rsegs); i++)
+ {
+ mtr->start();
+ mtr->set_log_mode(MTR_LOG_NO_REDO);
+ mtr->x_lock_space(fil_system.temp_space);
+ dberr_t err;
+ buf_block_t *rblock=
+ trx_rseg_header_create(fil_system.temp_space, i, 0, mtr, &err);
+ if (UNIV_UNLIKELY(!rblock))
+ {
+ mtr->commit();
+ return err;
+ }
+ trx_sys.temp_rsegs[i].init(fil_system.temp_space,
+ rblock->page.id().page_no());
+ mtr->commit();
+ }
+ return DB_SUCCESS;
}
/** Update the offset information about the end of the binlog entry
@@ -761,10 +699,10 @@ void trx_rseg_update_binlog_offset(buf_block_t *rseg_header, const trx_t *trx,
mtr->write<8,mtr_t::MAYBE_NOP>(*rseg_header,
TRX_RSEG + TRX_RSEG_BINLOG_OFFSET
- + rseg_header->frame,
+ + rseg_header->page.frame,
trx->mysql_log_offset);
- void* name = TRX_RSEG + TRX_RSEG_BINLOG_NAME + rseg_header->frame;
+ void* name = TRX_RSEG + TRX_RSEG_BINLOG_NAME + rseg_header->page.frame;
if (memcmp(trx->mysql_log_file_name, name, len)) {
mtr->memcpy(*rseg_header, name, trx->mysql_log_file_name, len);
diff --git a/storage/innobase/trx/trx0sys.cc b/storage/innobase/trx/trx0sys.cc
index bcde969eb41..d344f3a0c83 100644
--- a/storage/innobase/trx/trx0sys.cc
+++ b/storage/innobase/trx/trx0sys.cc
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 1996, 2017, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2017, 2021, MariaDB Corporation.
+Copyright (c) 2017, 2022, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -44,40 +44,6 @@ Created 3/26/1996 Heikki Tuuri
/** The transaction system */
trx_sys_t trx_sys;
-/** Check whether transaction id is valid.
-@param[in] id transaction id to check
-@param[in] name table name */
-void
-ReadViewBase::check_trx_id_sanity(
- trx_id_t id,
- const table_name_t& name)
-{
- if (id >= trx_sys.get_max_trx_id()) {
-
- ib::warn() << "A transaction id"
- << " in a record of table "
- << name
- << " is newer than the"
- << " system-wide maximum.";
- ut_ad(0);
- THD *thd = current_thd;
- if (thd != NULL) {
- char table_name[MAX_FULL_NAME_LEN + 1];
-
- innobase_format_name(
- table_name, sizeof(table_name),
- name.m_name);
-
- push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
- ER_SIGNAL_WARN,
- "InnoDB: Transaction id"
- " in a record of table"
- " %s is newer than system-wide"
- " maximum.", table_name);
- }
- }
-}
-
#ifdef UNIV_DEBUG
/* Flag to control TRX_RSEG_N_SLOTS behavior debugging. */
uint trx_rseg_n_slots_debug = 0;
@@ -136,100 +102,155 @@ trx_sysf_get_n_rseg_slots()
mtr.commit();
}
-/*****************************************************************//**
-Creates the file page for the transaction system. This function is called only
-at the database creation, before trx_sys_init. */
-static
-void
-trx_sysf_create(
-/*============*/
- mtr_t* mtr) /*!< in: mtr */
+/** Initialize the transaction system when creating the database. */
+dberr_t trx_sys_create_sys_pages(mtr_t *mtr)
{
- ulint slot_no;
- buf_block_t* block;
-
- ut_ad(mtr);
-
- /* Note that below we first reserve the file space x-latch, and
- then enter the kernel: we must do it in this order to conform
- to the latching order rules. */
-
- mtr_x_lock_space(fil_system.sys_space, mtr);
- compile_time_assert(TRX_SYS_SPACE == 0);
-
- /* Create the trx sys file block in a new allocated file segment */
- block = fseg_create(fil_system.sys_space,
- TRX_SYS + TRX_SYS_FSEG_HEADER,
- mtr);
- buf_block_dbg_add_level(block, SYNC_TRX_SYS_HEADER);
-
- ut_a(block->page.id() == page_id_t(0, TRX_SYS_PAGE_NO));
-
- mtr->write<2>(*block, FIL_PAGE_TYPE + block->frame,
- FIL_PAGE_TYPE_TRX_SYS);
-
- ut_ad(!mach_read_from_4(block->frame
- + TRX_SYS_DOUBLEWRITE
- + TRX_SYS_DOUBLEWRITE_MAGIC));
-
- /* Reset the rollback segment slots. Old versions of InnoDB
- (before MySQL 5.5) define TRX_SYS_N_RSEGS as 256 and expect
- that the whole array is initialized. */
- compile_time_assert(256 >= TRX_SYS_N_RSEGS);
- compile_time_assert(TRX_SYS + TRX_SYS_RSEGS
- + 256 * TRX_SYS_RSEG_SLOT_SIZE
- <= UNIV_PAGE_SIZE_MIN - FIL_PAGE_DATA_END);
- mtr->memset(block, TRX_SYS + TRX_SYS_RSEGS,
- 256 * TRX_SYS_RSEG_SLOT_SIZE, 0xff);
- /* Initialize all of the page. This part used to be uninitialized. */
- mtr->memset(block, TRX_SYS + TRX_SYS_RSEGS
- + 256 * TRX_SYS_RSEG_SLOT_SIZE,
- srv_page_size
- - (FIL_PAGE_DATA_END + TRX_SYS + TRX_SYS_RSEGS
- + 256 * TRX_SYS_RSEG_SLOT_SIZE),
- 0);
-
- /* Create the first rollback segment in the SYSTEM tablespace */
- slot_no = trx_sys_rseg_find_free(block);
- buf_block_t* rblock = trx_rseg_header_create(fil_system.sys_space,
- slot_no, 0, block, mtr);
-
- ut_a(slot_no == TRX_SYS_SYSTEM_RSEG_ID);
- ut_a(rblock->page.id() == page_id_t(0, FSP_FIRST_RSEG_PAGE_NO));
+ mtr->start();
+ mtr->x_lock_space(fil_system.sys_space);
+ static_assert(TRX_SYS_SPACE == 0, "compatibility");
+
+ /* Create the trx sys file block in a new allocated file segment */
+ dberr_t err;
+ buf_block_t *block= fseg_create(fil_system.sys_space,
+ TRX_SYS + TRX_SYS_FSEG_HEADER, mtr, &err);
+ if (UNIV_UNLIKELY(!block))
+ {
+ error:
+ mtr->commit();
+ return err;
+ }
+ ut_a(block->page.id() == page_id_t(0, TRX_SYS_PAGE_NO));
+
+ mtr->write<2>(*block, FIL_PAGE_TYPE + block->page.frame,
+ FIL_PAGE_TYPE_TRX_SYS);
+
+ /* Reset the rollback segment slots. Old versions of InnoDB
+ (before MySQL 5.5) define TRX_SYS_N_RSEGS as 256 and expect
+ that the whole array is initialized. */
+ static_assert(256 >= TRX_SYS_N_RSEGS, "");
+ static_assert(TRX_SYS + TRX_SYS_RSEGS + 256 * TRX_SYS_RSEG_SLOT_SIZE <=
+ UNIV_PAGE_SIZE_MIN - FIL_PAGE_DATA_END, "");
+ mtr->write<4>(*block, TRX_SYS + TRX_SYS_RSEGS + TRX_SYS_RSEG_PAGE_NO +
+ block->page.frame, FSP_FIRST_RSEG_PAGE_NO);
+ mtr->memset(block, TRX_SYS + TRX_SYS_RSEGS + TRX_SYS_RSEG_SLOT_SIZE,
+ 255 * TRX_SYS_RSEG_SLOT_SIZE, 0xff);
+
+ buf_block_t *r= trx_rseg_header_create(fil_system.sys_space, 0, 0,
+ mtr, &err);
+ if (UNIV_UNLIKELY(!r))
+ goto error;
+ ut_a(r->page.id() == page_id_t(0, FSP_FIRST_RSEG_PAGE_NO));
+ mtr->commit();
+
+ return trx_lists_init_at_db_start();
}
-/** Create the instance */
-void
-trx_sys_t::create()
+void trx_sys_t::create()
{
- ut_ad(this == &trx_sys);
- ut_ad(!is_initialised());
- m_initialised = true;
- trx_list.create();
- rseg_history_len= 0;
+ ut_ad(this == &trx_sys);
+ ut_ad(!is_initialised());
+ m_initialised= true;
+ trx_list.create();
+ rw_trx_hash.init();
+}
- rw_trx_hash.init();
+size_t trx_sys_t::history_size()
+{
+ ut_ad(is_initialised());
+ size_t size= 0;
+ for (auto &rseg : rseg_array)
+ {
+ rseg.latch.rd_lock(SRW_LOCK_CALL);
+ size+= rseg.history_size;
+ }
+ for (auto &rseg : rseg_array)
+ rseg.latch.rd_unlock();
+ return size;
}
-/*****************************************************************//**
-Creates and initializes the transaction system at the database creation. */
-void
-trx_sys_create_sys_pages(void)
-/*==========================*/
+bool trx_sys_t::history_exceeds(size_t threshold)
{
- mtr_t mtr;
+ ut_ad(is_initialised());
+ size_t size= 0;
+ bool exceeds= false;
+ size_t i;
+ for (i= 0; i < array_elements(rseg_array); i++)
+ {
+ rseg_array[i].latch.rd_lock(SRW_LOCK_CALL);
+ size+= rseg_array[i].history_size;
+ if (size > threshold)
+ {
+ exceeds= true;
+ i++;
+ break;
+ }
+ }
+ while (i)
+ rseg_array[--i].latch.rd_unlock();
+ return exceeds;
+}
- mtr_start(&mtr);
+TPOOL_SUPPRESS_TSAN bool trx_sys_t::history_exists()
+{
+ ut_ad(is_initialised());
+ for (auto &rseg : rseg_array)
+ if (rseg.history_size)
+ return true;
+ return false;
+}
- trx_sysf_create(&mtr);
+TPOOL_SUPPRESS_TSAN size_t trx_sys_t::history_size_approx() const
+{
+ ut_ad(is_initialised());
+ size_t size= 0;
+ for (auto &rseg : rseg_array)
+ size+= rseg.history_size;
+ return size;
+}
- mtr_commit(&mtr);
+/** Create a persistent rollback segment.
+@param space_id system or undo tablespace id
+@return pointer to new rollback segment
+@retval nullptr on failure */
+static trx_rseg_t *trx_rseg_create(ulint space_id)
+{
+ trx_rseg_t *rseg= nullptr;
+ mtr_t mtr;
+
+ mtr.start();
+
+ if (fil_space_t *space= mtr.x_lock_space(space_id))
+ {
+ ut_ad(space->purpose == FIL_TYPE_TABLESPACE);
+ if (buf_block_t *sys_header= trx_sysf_get(&mtr))
+ {
+ ulint rseg_id= trx_sys_rseg_find_free(sys_header);
+ dberr_t err;
+ if (buf_block_t *rblock= rseg_id == ULINT_UNDEFINED
+ ? nullptr : trx_rseg_header_create(space, rseg_id, 0, &mtr, &err))
+ {
+ rseg= &trx_sys.rseg_array[rseg_id];
+ rseg->init(space, rblock->page.id().page_no());
+ ut_ad(rseg->is_persistent());
+ mtr.write<4,mtr_t::MAYBE_NOP>
+ (*sys_header, TRX_SYS + TRX_SYS_RSEGS + TRX_SYS_RSEG_SPACE +
+ rseg_id * TRX_SYS_RSEG_SLOT_SIZE + sys_header->page.frame,
+ space_id);
+ mtr.write<4,mtr_t::MAYBE_NOP>
+ (*sys_header, TRX_SYS + TRX_SYS_RSEGS + TRX_SYS_RSEG_PAGE_NO +
+ rseg_id * TRX_SYS_RSEG_SLOT_SIZE + sys_header->page.frame,
+ rseg->page_no);
+ }
+ }
+ }
+
+ mtr.commit();
+ return rseg;
}
/** Create the rollback segments.
@return whether the creation succeeded */
-bool
-trx_sys_create_rsegs()
+bool trx_sys_create_rsegs()
{
/* srv_available_undo_logs reflects the number of persistent
rollback segments that have been initialized in the
@@ -309,14 +330,11 @@ trx_sys_t::close()
/* There can't be any active transactions. */
- for (ulint i = 0; i < TRX_SYS_N_RSEGS; ++i) {
- if (trx_rseg_t* rseg = rseg_array[i]) {
- trx_rseg_mem_free(rseg);
- }
-
- if (trx_rseg_t* rseg = temp_rsegs[i]) {
- trx_rseg_mem_free(rseg);
- }
+ for (ulint i = 0; i < array_elements(temp_rsegs); ++i) {
+ temp_rsegs[i].destroy();
+ }
+ for (ulint i = 0; i < array_elements(rseg_array); ++i) {
+ rseg_array[i].destroy();
}
ut_a(trx_list.empty());
diff --git a/storage/innobase/trx/trx0trx.cc b/storage/innobase/trx/trx0trx.cc
index 4ccf67c3bf9..ec5d14cc60a 100644
--- a/storage/innobase/trx/trx0trx.cc
+++ b/storage/innobase/trx/trx0trx.cc
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2015, 2021, MariaDB Corporation.
+Copyright (c) 2015, 2022, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -47,6 +47,7 @@ Created 3/26/1996 Heikki Tuuri
#include "trx0xa.h"
#include "ut0pool.h"
#include "ut0vec.h"
+#include "log.h"
#include <set>
#include <new>
@@ -64,12 +65,6 @@ const byte timestamp_max_bytes[7] = {
static const ulint MAX_DETAILED_ERROR_LEN = 256;
-/** Set of table_id */
-typedef std::set<
- table_id_t,
- std::less<table_id_t>,
- ut_allocator<table_id_t> > table_id_set;
-
/*************************************************************//**
Set detailed error message for the transaction. */
void
@@ -119,9 +114,7 @@ trx_init(
trx->lock.n_rec_locks = 0;
- trx->dict_operation = TRX_DICT_OP_NONE;
-
- trx->table_id = 0;
+ trx->dict_operation = false;
trx->error_state = DB_SUCCESS;
@@ -139,9 +132,9 @@ trx_init(
trx->will_lock = false;
- trx->ddl = false;
+ trx->bulk_insert = false;
- trx->internal = false;
+ trx->apply_online_log = false;
ut_d(trx->start_file = 0);
@@ -149,8 +142,6 @@ trx_init(
trx->magic_n = TRX_MAGIC_N;
- trx->lock.que_state = TRX_QUE_RUNNING;
-
trx->last_sql_stat_start.least_undo_no = 0;
ut_ad(!trx->read_view.is_open());
@@ -160,7 +151,6 @@ trx_init(
trx->lock.table_cached = 0;
#ifdef WITH_WSREP
ut_ad(!trx->wsrep);
- ut_ad(!trx->wsrep_UK_scan);
#endif /* WITH_WSREP */
}
@@ -187,18 +177,16 @@ struct TrxFactory {
trx->rw_trx_hash_pins = 0;
trx_init(trx);
- trx->dict_operation_lock_mode = 0;
-
- trx->xid = UT_NEW_NOKEY(xid_t());
+ trx->dict_operation_lock_mode = false;
trx->detailed_error = reinterpret_cast<char*>(
ut_zalloc_nokey(MAX_DETAILED_ERROR_LEN));
trx->lock.lock_heap = mem_heap_create_typed(
1024, MEM_HEAP_FOR_LOCK_HEAP);
+ pthread_cond_init(&trx->lock.cond, nullptr);
- lock_trx_lock_list_init(&trx->lock.trx_locks);
-
+ UT_LIST_INIT(trx->lock.trx_locks, &lock_t::trx_locks);
UT_LIST_INIT(trx->lock.evicted_tables,
&dict_table_t::table_LRU);
@@ -206,7 +194,7 @@ struct TrxFactory {
trx->trx_savepoints,
&trx_named_savept_t::trx_savepoints);
- mutex_create(LATCH_ID_TRX, &trx->mutex);
+ trx->mutex_init();
}
/** Release resources held by the transaction object.
@@ -230,20 +218,21 @@ struct TrxFactory {
ut_a(trx->lock.wait_lock == NULL);
ut_a(trx->lock.wait_thr == NULL);
- ut_a(trx->dict_operation_lock_mode == 0);
+ ut_a(!trx->dict_operation_lock_mode);
if (trx->lock.lock_heap != NULL) {
mem_heap_free(trx->lock.lock_heap);
trx->lock.lock_heap = NULL;
}
+ pthread_cond_destroy(&trx->lock.cond);
+
ut_a(UT_LIST_GET_LEN(trx->lock.trx_locks) == 0);
ut_ad(UT_LIST_GET_LEN(trx->lock.evicted_tables) == 0);
- UT_DELETE(trx->xid);
ut_free(trx->detailed_error);
- mutex_free(&trx->mutex);
+ trx->mutex_destroy();
trx->mod_tables.~trx_mod_tables_t();
@@ -256,49 +245,47 @@ struct TrxFactory {
};
/** The lock strategy for TrxPool */
-struct TrxPoolLock {
- TrxPoolLock() = default;
-
- /** Create the mutex */
- void create()
- {
- mutex_create(LATCH_ID_TRX_POOL, &m_mutex);
- }
+class TrxPoolLock
+{
+ mysql_mutex_t mutex;
- /** Acquire the mutex */
- void enter() { mutex_enter(&m_mutex); }
+public:
+ /** Create the mutex */
+ void create()
+ {
+ mysql_mutex_init(trx_pool_mutex_key, &mutex, nullptr);
+ }
- /** Release the mutex */
- void exit() { mutex_exit(&m_mutex); }
+ /** Acquire the mutex */
+ void enter() { mysql_mutex_lock(&mutex); }
- /** Free the mutex */
- void destroy() { mutex_free(&m_mutex); }
+ /** Release the mutex */
+ void exit() { mysql_mutex_unlock(&mutex); }
- /** Mutex to use */
- ib_mutex_t m_mutex;
+ /** Free the mutex */
+ void destroy() { mysql_mutex_destroy(&mutex); }
};
/** The lock strategy for the TrxPoolManager */
-struct TrxPoolManagerLock {
- TrxPoolManagerLock() = default;
-
- /** Create the mutex */
- void create()
- {
- mutex_create(LATCH_ID_TRX_POOL_MANAGER, &m_mutex);
- }
+class TrxPoolManagerLock
+{
+ mysql_mutex_t mutex;
- /** Acquire the mutex */
- void enter() { mutex_enter(&m_mutex); }
+public:
+ /** Create the mutex */
+ void create()
+ {
+ mysql_mutex_init(trx_pool_manager_mutex_key, &mutex, nullptr);
+ }
- /** Release the mutex */
- void exit() { mutex_exit(&m_mutex); }
+ /** Acquire the mutex */
+ void enter() { mysql_mutex_lock(&mutex); }
- /** Free the mutex */
- void destroy() { mutex_free(&m_mutex); }
+ /** Release the mutex */
+ void exit() { mysql_mutex_unlock(&mutex); }
- /** Mutex to use */
- ib_mutex_t m_mutex;
+ /** Free the mutex */
+ void destroy() { mysql_mutex_destroy(&mutex); }
};
/** Use explicit mutexes for the trx_t pool and its manager. */
@@ -370,10 +357,6 @@ trx_t *trx_create()
ut_ad(trx->lock.rec_cached == 0);
ut_ad(UT_LIST_GET_LEN(trx->lock.evicted_tables) == 0);
-#ifdef WITH_WSREP
- ut_ad(!trx->wsrep_UK_scan);
-#endif /* WITH_WSREP */
-
trx_sys.register_trx(trx);
return(trx);
@@ -382,23 +365,30 @@ trx_t *trx_create()
/** Free the memory to trx_pools */
void trx_t::free()
{
+#ifdef HAVE_MEM_CHECK
+ if (xid.is_null())
+ MEM_MAKE_DEFINED(&xid, sizeof xid);
+ else
+ MEM_MAKE_DEFINED(&xid.data[xid.gtrid_length + xid.bqual_length],
+ sizeof xid.data - (xid.gtrid_length + xid.bqual_length));
+#endif
MEM_CHECK_DEFINED(this, sizeof *this);
ut_ad(!n_mysql_tables_in_use);
ut_ad(!mysql_log_file_name);
ut_ad(!mysql_n_tables_locked);
- ut_ad(!internal);
ut_ad(!will_lock);
ut_ad(error_state == DB_SUCCESS);
ut_ad(magic_n == TRX_MAGIC_N);
ut_ad(!read_only);
ut_ad(!lock.wait_lock);
- dict_operation= TRX_DICT_OP_NONE;
+ dict_operation= false;
trx_sys.deregister_trx(this);
+ check_unique_secondary= true;
+ check_foreigns= true;
assert_freed();
trx_sys.rw_trx_hash.put_pins(this);
-
mysql_thd= nullptr;
// FIXME: We need to avoid this heap free/alloc for each commit.
@@ -410,12 +400,11 @@ void trx_t::free()
autoinc_locks= NULL;
}
- mod_tables.clear();
-
MEM_NOACCESS(&skip_lock_inheritance_and_n_ref,
sizeof skip_lock_inheritance_and_n_ref);
/* do not poison mutex */
MEM_NOACCESS(&id, sizeof id);
+ MEM_NOACCESS(&max_inactive_id, sizeof id);
MEM_NOACCESS(&state, sizeof state);
MEM_NOACCESS(&is_recovered, sizeof is_recovered);
#ifdef WITH_WSREP
@@ -437,7 +426,6 @@ void trx_t::free()
MEM_NOACCESS(&start_time, sizeof start_time);
MEM_NOACCESS(&start_time_micro, sizeof start_time_micro);
MEM_NOACCESS(&commit_lsn, sizeof commit_lsn);
- MEM_NOACCESS(&table_id, sizeof table_id);
MEM_NOACCESS(&mysql_thd, sizeof mysql_thd);
MEM_NOACCESS(&mysql_log_file_name, sizeof mysql_log_file_name);
MEM_NOACCESS(&mysql_log_offset, sizeof mysql_log_offset);
@@ -462,8 +450,6 @@ void trx_t::free()
MEM_NOACCESS(&fts_trx, sizeof fts_trx);
MEM_NOACCESS(&fts_next_doc_id, sizeof fts_next_doc_id);
MEM_NOACCESS(&flush_tables, sizeof flush_tables);
- MEM_NOACCESS(&ddl, sizeof ddl);
- MEM_NOACCESS(&internal, sizeof internal);
#ifdef UNIV_DEBUG
MEM_NOACCESS(&start_line, sizeof start_line);
MEM_NOACCESS(&start_file, sizeof start_file);
@@ -471,16 +457,13 @@ void trx_t::free()
MEM_NOACCESS(&xid, sizeof xid);
MEM_NOACCESS(&mod_tables, sizeof mod_tables);
MEM_NOACCESS(&detailed_error, sizeof detailed_error);
-#ifdef WITH_WSREP
- ut_ad(!wsrep_UK_scan);
- MEM_NOACCESS(&wsrep_UK_scan, sizeof wsrep_UK_scan);
-#endif /* WITH_WSREP */
MEM_NOACCESS(&magic_n, sizeof magic_n);
+ MEM_NOACCESS(&apply_online_log, sizeof apply_online_log);
trx_pools->mem_free(this);
}
/** Transition to committed state, to release implicit locks. */
-inline void trx_t::commit_state()
+TRANSACTIONAL_INLINE inline void trx_t::commit_state()
{
ut_ad(state == TRX_STATE_PREPARED
|| state == TRX_STATE_PREPARED_RECOVERED
@@ -497,22 +480,22 @@ inline void trx_t::commit_state()
makes modifications to the database, will get an lsn larger than the
committing transaction T. In the case where the log flush fails, and
T never gets committed, also T2 will never get committed. */
- trx_mutex_enter(this);
+ TMTrxGuard tg{*this};
state= TRX_STATE_COMMITTED_IN_MEMORY;
- trx_mutex_exit(this);
ut_ad(id || !is_referenced());
}
/** Release any explicit locks of a committing transaction. */
inline void trx_t::release_locks()
{
+ DEBUG_SYNC_C("trx_t_release_locks_enter");
DBUG_ASSERT(state == TRX_STATE_COMMITTED_IN_MEMORY);
DBUG_ASSERT(!is_referenced());
if (UT_LIST_GET_LEN(lock.trx_locks))
{
lock_release(this);
- lock.n_rec_locks = 0;
+ ut_ad(!lock.n_rec_locks);
ut_ad(UT_LIST_GET_LEN(lock.trx_locks) == 0);
ut_ad(ib_vector_is_empty(autoinc_locks));
mem_heap_empty(lock.lock_heap);
@@ -520,11 +503,17 @@ inline void trx_t::release_locks()
lock.table_locks.clear();
reset_skip_lock_inheritance();
+ id= 0;
+ while (dict_table_t *table= UT_LIST_GET_FIRST(lock.evicted_tables))
+ {
+ UT_LIST_REMOVE(lock.evicted_tables, table);
+ dict_mem_table_free(table);
+ }
+ DEBUG_SYNC_C("after_trx_committed_in_memory");
}
/** At shutdown, frees a transaction object. */
-void
-trx_free_at_shutdown(trx_t *trx)
+TRANSACTIONAL_TARGET void trx_free_at_shutdown(trx_t *trx)
{
ut_ad(trx->is_recovered);
ut_a(trx_state_eq(trx, TRX_STATE_PREPARED)
@@ -539,8 +528,10 @@ trx_free_at_shutdown(trx_t *trx)
&& !srv_undo_sources && srv_fast_shutdown))));
ut_a(trx->magic_n == TRX_MAGIC_N);
+ ut_d(trx->apply_online_log = false);
trx->commit_state();
trx->release_locks();
+ trx->mod_tables.clear();
trx_undo_free_at_shutdown(trx);
ut_a(!trx->read_only);
@@ -548,7 +539,6 @@ trx_free_at_shutdown(trx_t *trx)
DBUG_LOG("trx", "Free prepared: " << trx);
trx->state = TRX_STATE_NOT_STARTED;
ut_ad(!UT_LIST_GET_LEN(trx->lock.trx_locks));
- trx->id = 0;
trx->free();
}
@@ -572,97 +562,95 @@ void trx_disconnect_prepared(trx_t *trx)
trx_sys.rw_trx_hash.put_pins(trx);
}
-/****************************************************************//**
-Resurrect the table locks for a resurrected transaction. */
-static
-void
-trx_resurrect_table_locks(
-/*======================*/
- trx_t* trx, /*!< in/out: transaction */
- const trx_undo_t* undo) /*!< in: undo log */
+MY_ATTRIBUTE((nonnull, warn_unused_result))
+/** Resurrect the table locks for a resurrected transaction. */
+static dberr_t trx_resurrect_table_locks(trx_t *trx, const trx_undo_t &undo)
{
- mtr_t mtr;
- table_id_set tables;
+ ut_ad(trx_state_eq(trx, TRX_STATE_ACTIVE) ||
+ trx_state_eq(trx, TRX_STATE_PREPARED));
+ ut_ad(undo.rseg == trx->rsegs.m_redo.rseg);
+
+ if (undo.empty())
+ return DB_SUCCESS;
+
+ mtr_t mtr;
+ std::map<table_id_t, bool> tables;
+ mtr.start();
+
+ dberr_t err;
+ if (buf_block_t *block=
+ buf_page_get_gen(page_id_t(trx->rsegs.m_redo.rseg->space->id,
+ undo.top_page_no), 0, RW_S_LATCH, nullptr,
+ BUF_GET, &mtr, &err))
+ {
+ buf_block_t *undo_block= block;
+ const trx_undo_rec_t *undo_rec= block->page.frame + undo.top_offset;
- ut_ad(trx_state_eq(trx, TRX_STATE_ACTIVE) ||
- trx_state_eq(trx, TRX_STATE_PREPARED));
- ut_ad(undo->rseg == trx->rsegs.m_redo.rseg);
+ do
+ {
+ ulint type;
+ undo_no_t undo_no;
+ table_id_t table_id;
+ ulint cmpl_info;
+ bool updated_extern;
- if (undo->empty()) {
- return;
- }
+ if (undo_block != block)
+ {
+ mtr.release(*undo_block);
+ undo_block= block;
+ }
+ trx_undo_rec_get_pars(undo_rec, &type, &cmpl_info,
+ &updated_extern, &undo_no, &table_id);
+ tables.emplace(table_id, type == TRX_UNDO_EMPTY);
+ undo_rec= trx_undo_get_prev_rec(block, page_offset(undo_rec),
+ undo.hdr_page_no, undo.hdr_offset,
+ true, &mtr);
+ }
+ while (undo_rec);
+ }
- mtr_start(&mtr);
+ mtr.commit();
- /* trx_rseg_mem_create() may have acquired an X-latch on this
- page, so we cannot acquire an S-latch. */
- buf_block_t* block = trx_undo_page_get(
- page_id_t(trx->rsegs.m_redo.rseg->space->id,
- undo->top_page_no), &mtr);
- buf_block_t* undo_block = block;
- trx_undo_rec_t* undo_rec = block->frame + undo->top_offset;
+ if (err != DB_SUCCESS)
+ return err;
- do {
- ulint type;
- undo_no_t undo_no;
- table_id_t table_id;
- ulint cmpl_info;
- bool updated_extern;
-
- if (undo_block != block) {
- mtr.memo_release(undo_block, MTR_MEMO_PAGE_X_FIX);
- undo_block = block;
- }
+ for (auto p : tables)
+ {
+ if (dict_table_t *table=
+ dict_table_open_on_id(p.first, FALSE, DICT_TABLE_OP_LOAD_TABLESPACE))
+ {
+ if (!table->is_readable())
+ {
+ dict_sys.lock(SRW_LOCK_CALL);
+ table->release();
+ dict_sys.remove(table);
+ dict_sys.unlock();
+ continue;
+ }
- trx_undo_rec_get_pars(
- undo_rec, &type, &cmpl_info,
- &updated_extern, &undo_no, &table_id);
- tables.insert(table_id);
-
- undo_rec = trx_undo_get_prev_rec(
- block, page_offset(undo_rec), undo->hdr_page_no,
- undo->hdr_offset, false, &mtr);
- } while (undo_rec);
-
- mtr_commit(&mtr);
-
- for (table_id_set::const_iterator i = tables.begin();
- i != tables.end(); i++) {
- if (dict_table_t* table = dict_table_open_on_id(
- *i, FALSE, DICT_TABLE_OP_LOAD_TABLESPACE)) {
- if (!table->is_readable()) {
- mutex_enter(&dict_sys.mutex);
- dict_table_close(table, TRUE, FALSE);
- dict_sys.remove(table);
- mutex_exit(&dict_sys.mutex);
- continue;
- }
+ if (trx->state == TRX_STATE_PREPARED)
+ trx->mod_tables.emplace(table, 0);
- if (trx->state == TRX_STATE_PREPARED) {
- trx->mod_tables.insert(
- trx_mod_tables_t::value_type(table,
- 0));
- }
- lock_table_ix_resurrect(table, trx);
+ lock_table_resurrect(table, trx, p.second ? LOCK_X : LOCK_IX);
- DBUG_LOG("ib_trx",
- "resurrect " << ib::hex(trx->id)
- << " IX lock on " << table->name);
+ DBUG_LOG("ib_trx",
+ "resurrect " << ib::hex(trx->id) << " lock on " << table->name);
+ table->release();
+ }
+ }
- dict_table_close(table, FALSE, FALSE);
- }
- }
+ return DB_SUCCESS;
}
+MY_ATTRIBUTE((nonnull, warn_unused_result))
/**
Resurrect the transactions that were doing inserts/updates the time of the
crash, they need to be undone.
*/
-
-static void trx_resurrect(trx_undo_t *undo, trx_rseg_t *rseg,
- time_t start_time, ulonglong start_time_micro,
- uint64_t *rows_to_undo)
+static dberr_t trx_resurrect(trx_undo_t *undo, trx_rseg_t *rseg,
+ time_t start_time, ulonglong start_time_micro,
+ uint64_t *rows_to_undo)
{
trx_state_t state;
ut_ad(rseg->needs_purge >= undo->trx_id);
@@ -680,16 +668,15 @@ static void trx_resurrect(trx_undo_t *undo, trx_rseg_t *rseg,
Prepared transactions are left in the prepared state
waiting for a commit or abort decision from MySQL
*/
- ib::info() << "Transaction " << undo->trx_id
- << " was in the XA prepared state.";
-
state= TRX_STATE_PREPARED;
+ sql_print_information("InnoDB: Transaction " TRX_ID_FMT
+ " was in the XA prepared state.", undo->trx_id);
break;
default:
- return;
+ return DB_SUCCESS;
}
- ++rseg->trx_ref_count;
+ rseg->acquire();
trx_t *trx= trx_create();
trx->state= state;
ut_d(trx->start_file= __FILE__);
@@ -698,24 +685,18 @@ static void trx_resurrect(trx_undo_t *undo, trx_rseg_t *rseg,
trx->rsegs.m_redo.undo= undo;
trx->undo_no= undo->top_undo_no + 1;
trx->rsegs.m_redo.rseg= rseg;
- *trx->xid= undo->xid;
+ trx->xid= undo->xid;
trx->id= undo->trx_id;
trx->is_recovered= true;
trx->start_time= start_time;
trx->start_time_micro= start_time_micro;
-
- if (undo->dict_operation)
- {
- trx_set_dict_operation(trx, TRX_DICT_OP_TABLE);
- if (!trx->table_id)
- trx->table_id= undo->table_id;
- }
+ trx->dict_operation= undo->dict_operation;
trx_sys.rw_trx_hash.insert(trx);
trx_sys.rw_trx_hash.put_pins(trx);
- trx_resurrect_table_locks(trx, undo);
if (trx_state_eq(trx, TRX_STATE_ACTIVE))
*rows_to_undo+= trx->undo_no;
+ return trx_resurrect_table_locks(trx, *undo);
}
@@ -737,7 +718,10 @@ dberr_t trx_lists_init_at_db_start()
}
purge_sys.create();
- if (dberr_t err = trx_rseg_array_init()) {
+ dberr_t err = trx_rseg_array_init();
+
+ if (err != DB_SUCCESS) {
+corrupted:
ib::info() << "Retry with innodb_force_recovery=5";
return err;
}
@@ -748,32 +732,32 @@ dberr_t trx_lists_init_at_db_start()
const ulonglong start_time_micro= microsecond_interval_timer();
uint64_t rows_to_undo = 0;
- for (ulint i = 0; i < TRX_SYS_N_RSEGS; ++i) {
+ for (auto& rseg : trx_sys.rseg_array) {
trx_undo_t* undo;
- trx_rseg_t* rseg = trx_sys.rseg_array[i];
/* Some rollback segment may be unavailable,
especially if the server was previously run with a
non-default value of innodb_undo_logs. */
- if (rseg == NULL) {
+ if (!rseg.space) {
continue;
}
- /* Ressurrect other transactions. */
- for (undo = UT_LIST_GET_FIRST(rseg->undo_list);
+ /* Resurrect other transactions. */
+ for (undo = UT_LIST_GET_FIRST(rseg.undo_list);
undo != NULL;
undo = UT_LIST_GET_NEXT(undo_list, undo)) {
trx_t *trx = trx_sys.find(0, undo->trx_id, false);
if (!trx) {
- trx_resurrect(undo, rseg, start_time,
- start_time_micro, &rows_to_undo);
+ err = trx_resurrect(undo, &rseg, start_time,
+ start_time_micro,
+ &rows_to_undo);
} else {
ut_ad(trx_state_eq(trx, TRX_STATE_ACTIVE) ||
trx_state_eq(trx, TRX_STATE_PREPARED));
ut_ad(trx->start_time == start_time);
ut_ad(trx->is_recovered);
- ut_ad(trx->rsegs.m_redo.rseg == rseg);
- ut_ad(rseg->trx_ref_count);
- ut_ad(rseg->needs_purge);
+ ut_ad(trx->rsegs.m_redo.rseg == &rseg);
+ ut_ad(rseg.is_referenced());
+ ut_ad(rseg.needs_purge);
trx->rsegs.m_redo.undo = undo;
if (undo->top_undo_no >= trx->undo_no) {
@@ -786,7 +770,11 @@ dberr_t trx_lists_init_at_db_start()
trx->undo_no = undo->top_undo_no + 1;
}
- trx_resurrect_table_locks(trx, undo);
+ err = trx_resurrect_table_locks(trx, *undo);
+ }
+
+ if (err != DB_SUCCESS) {
+ goto corrupted;
}
}
}
@@ -799,7 +787,7 @@ dberr_t trx_lists_init_at_db_start()
ib::info() << "Trx id counter is " << trx_sys.get_max_trx_id();
}
- purge_sys.clone_oldest_view();
+ purge_sys.clone_oldest_view<true>();
return DB_SUCCESS;
}
@@ -812,7 +800,7 @@ static void trx_assign_rseg_low(trx_t *trx)
ut_ad(srv_available_undo_logs == TRX_SYS_N_RSEGS);
/* The first slot is always assigned to the system tablespace. */
- ut_ad(trx_sys.rseg_array[0]->space == fil_system.sys_space);
+ ut_ad(trx_sys.rseg_array[0].space == fil_system.sys_space);
trx_sys.register_rw(trx);
ut_ad(trx->id);
@@ -823,45 +811,35 @@ static void trx_assign_rseg_low(trx_t *trx)
static Atomic_counter<unsigned> rseg_slot;
unsigned slot = rseg_slot++ % TRX_SYS_N_RSEGS;
ut_d(if (trx_rseg_n_slots_debug) slot = 0);
+ ut_d(const auto start_scan_slot = slot);
trx_rseg_t* rseg;
-#ifdef UNIV_DEBUG
- ulint start_scan_slot = slot;
- bool look_for_rollover = false;
-#endif /* UNIV_DEBUG */
-
- bool skip_allocation;
+ bool allocated;
do {
for (;;) {
- rseg = trx_sys.rseg_array[slot];
+ rseg = &trx_sys.rseg_array[slot];
-#ifdef UNIV_DEBUG
- /* Ensure that we are not revisiting the same
- slot that we have already inspected. */
- if (look_for_rollover) {
+ do {
+ ut_d(if (!trx_rseg_n_slots_debug) continue);
+ slot = (slot + 1) % TRX_SYS_N_RSEGS;
ut_ad(start_scan_slot != slot);
- }
- look_for_rollover = true;
-#endif /* UNIV_DEBUG */
-
- ut_d(if (!trx_rseg_n_slots_debug))
- slot = (slot + 1) % TRX_SYS_N_RSEGS;
+ } while (0);
- if (rseg == NULL) {
+ if (!rseg->space) {
continue;
}
ut_ad(rseg->is_persistent());
if (rseg->space != fil_system.sys_space) {
- if (rseg->skip_allocation
+ if (rseg->skip_allocation()
|| !srv_undo_tablespaces) {
continue;
}
- } else if (trx_rseg_t* next
- = trx_sys.rseg_array[slot]) {
- if (next->space != fil_system.sys_space
+ } else if (const fil_space_t *space =
+ trx_sys.rseg_array[slot].space) {
+ if (space != fil_system.sys_space
&& srv_undo_tablespaces > 0) {
/** If dedicated
innodb_undo_tablespaces have
@@ -874,16 +852,11 @@ static void trx_assign_rseg_low(trx_t *trx)
break;
}
- mutex_enter(&rseg->mutex);
- ut_ad(rseg->is_persistent());
- skip_allocation = rseg->skip_allocation;
- if (!skip_allocation) {
- /* Ensure that the allocation remains valid until
- trx_undo_reuse_cached() is invoked. */
- ++rseg->trx_ref_count;
- }
- mutex_exit(&rseg->mutex);
- } while (skip_allocation);
+ /* By now we have only selected the rseg but not marked it
+ allocated. By marking it allocated we are ensuring that it will
+ never be selected for UNDO truncate purge. */
+ allocated = rseg->acquire_if_available();
+ } while (!allocated);
trx->rsegs.m_redo.rseg = rseg;
}
@@ -899,7 +872,7 @@ trx_rseg_t *trx_t::assign_temp_rseg()
/* Choose a temporary rollback segment between 0 and 127
in a round-robin fashion. */
static Atomic_counter<unsigned> rseg_slot;
- trx_rseg_t* rseg = trx_sys.temp_rsegs[
+ trx_rseg_t* rseg = &trx_sys.temp_rsegs[
rseg_slot++ & (TRX_SYS_N_RSEGS - 1)];
ut_ad(!rseg->is_persistent());
rsegs.m_noredo.rseg = rseg;
@@ -908,7 +881,6 @@ trx_rseg_t *trx_t::assign_temp_rseg()
trx_sys.register_rw(this);
}
- ut_ad(!rseg->is_persistent());
return(rseg);
}
@@ -936,7 +908,7 @@ trx_start_low(
trx->auto_commit = thd_trx_is_auto_commit(trx->mysql_thd);
trx->read_only = srv_read_only_mode
- || (!trx->ddl && !trx->internal
+ || (!trx->dict_operation
&& thd_trx_is_read_only(trx->mysql_thd));
if (!trx->auto_commit) {
@@ -946,7 +918,7 @@ trx_start_low(
}
#ifdef WITH_WSREP
- trx->xid->null();
+ trx->xid.null();
#endif /* WITH_WSREP */
ut_a(ib_vector_is_empty(trx->autoinc_locks));
@@ -968,7 +940,7 @@ trx_start_low(
list too. */
if (!trx->read_only
- && (trx->mysql_thd == 0 || read_write || trx->ddl)) {
+ && (!trx->mysql_thd || read_write || trx->dict_operation)) {
/* Temporary rseg is assigned only if the transaction
updates a temporary table */
if (!high_level_read_only) {
@@ -992,12 +964,10 @@ trx_start_low(
trx->start_time = time(NULL);
trx->start_time_micro = trx->mysql_thd
- ? thd_query_start_micro(trx->mysql_thd)
+ ? thd_start_utime(trx->mysql_thd)
: microsecond_interval_timer();
ut_a(trx->error_state == DB_SUCCESS);
-
- MONITOR_INC(MONITOR_TRX_ACTIVE);
}
/** Set the serialisation number for a persistent committed transaction.
@@ -1008,10 +978,9 @@ trx_serialise(trx_t* trx)
{
trx_rseg_t *rseg = trx->rsegs.m_redo.rseg;
ut_ad(rseg);
- ut_ad(mutex_own(&rseg->mutex));
if (rseg->last_page_no == FIL_NULL) {
- mutex_enter(&purge_sys.pq_mutex);
+ mysql_mutex_lock(&purge_sys.pq_mutex);
}
trx_sys.assign_new_trx_no(trx);
@@ -1023,7 +992,7 @@ trx_serialise(trx_t* trx)
if (rseg->last_page_no == FIL_NULL) {
purge_sys.purge_queue.push(TrxUndoRsegs(trx->rw_trx_hash_element->no,
*rseg));
- mutex_exit(&purge_sys.pq_mutex);
+ mysql_mutex_unlock(&purge_sys.pq_mutex);
}
}
@@ -1055,10 +1024,7 @@ trx_write_serialisation_history(
mtr_t temp_mtr;
temp_mtr.start();
temp_mtr.set_log_mode(MTR_LOG_NO_REDO);
-
- mutex_enter(&trx->rsegs.m_noredo.rseg->mutex);
trx_undo_set_state_at_finish(undo, &temp_mtr);
- mutex_exit(&trx->rsegs.m_noredo.rseg->mutex);
temp_mtr.commit();
}
@@ -1071,21 +1037,20 @@ trx_write_serialisation_history(
trx_undo_t*& undo = trx->rsegs.m_redo.undo;
ut_ad(!trx->read_only);
- mutex_enter(&rseg->mutex);
- ut_ad(rseg->trx_ref_count);
- --rseg->trx_ref_count;
/* Assign the transaction serialisation number and add any
undo log to the purge queue. */
if (undo) {
+ rseg->latch.wr_lock(SRW_LOCK_CALL);
ut_ad(undo->rseg == rseg);
trx_serialise(trx);
UT_LIST_REMOVE(rseg->undo_list, undo);
trx_purge_add_undo_to_history(trx, undo, mtr);
MONITOR_INC(MONITOR_TRX_COMMIT_UNDO);
+ rseg->latch.wr_unlock();
}
- mutex_exit(&rseg->mutex);
+ rseg->release();
}
/********************************************************************
@@ -1146,35 +1111,46 @@ trx_finalize_for_fts(
trx->fts_trx = NULL;
}
-/**********************************************************************//**
-If required, flushes the log to disk based on the value of
-innodb_flush_log_at_trx_commit. */
-static
-void
-trx_flush_log_if_needed_low(
-/*========================*/
- lsn_t lsn) /*!< in: lsn up to which logs are to be
- flushed. */
+static MYSQL_THD thd_increment_pending_ops(MYSQL_THD) {return nullptr;}
+static void thd_decrement_pending_ops(MYSQL_THD) {}
+
+#include "../log/log0sync.h"
+
+/*
+ If required, initiates write and optionally flush of the log to
+ disk
+ @param lsn LSN up to which logs are to be flushed.
+ @param trx transaction; if trx->state is PREPARED, the function will
+ also wait for the flush to complete.
+*/
+static void trx_flush_log_if_needed_low(lsn_t lsn, const trx_t *trx)
{
- bool flush = srv_file_flush_method != SRV_NOSYNC;
+ if (!srv_flush_log_at_trx_commit)
+ return;
- switch (srv_flush_log_at_trx_commit) {
- case 2:
- /* Write the log but do not flush it to disk */
- flush = false;
- /* fall through */
- case 1:
- case 3:
- /* Write the log and optionally flush it to disk */
- log_write_up_to(lsn, flush);
- srv_inc_activity_count();
- return;
- case 0:
- /* Do nothing */
- return;
- }
+ if (log_sys.get_flushed_lsn() > lsn)
+ return;
- ut_error;
+ const bool flush= srv_file_flush_method != SRV_NOSYNC &&
+ (srv_flush_log_at_trx_commit & 1);
+
+ if (trx->state == TRX_STATE_PREPARED)
+ {
+ /* XA, which is used with binlog as well.
+ Be conservative, use synchronous wait.*/
+sync:
+ log_write_up_to(lsn, flush);
+ return;
+ }
+
+ completion_callback cb;
+ if ((cb.m_param = thd_increment_pending_ops(trx->mysql_thd)))
+ {
+ cb.m_callback = (void (*)(void *)) thd_decrement_pending_ops;
+ log_write_up_to(lsn, flush, false, &cb);
+ }
+ else
+ goto sync;
}
/**********************************************************************//**
@@ -1189,66 +1165,54 @@ trx_flush_log_if_needed(
trx_t* trx) /*!< in/out: transaction */
{
trx->op_info = "flushing log";
- trx_flush_log_if_needed_low(lsn);
+ trx_flush_log_if_needed_low(lsn, trx);
trx->op_info = "";
}
-/**********************************************************************//**
-For each table that has been modified by the given transaction: update
-its dict_table_t::update_time with the current timestamp. Clear the list
-of the modified tables at the end. */
-static
-void
-trx_update_mod_tables_timestamp(
-/*============================*/
- trx_t* trx) /*!< in: transaction */
+/** Process tables that were modified by the committing transaction. */
+inline void trx_t::commit_tables()
{
- /* consider using trx->start_time if calling time() is too
- expensive here */
- const time_t now = time(NULL);
-
- trx_mod_tables_t::const_iterator end = trx->mod_tables.end();
-
- for (trx_mod_tables_t::const_iterator it = trx->mod_tables.begin();
- it != end;
- ++it) {
-
- /* This could be executed by multiple threads concurrently
- on the same table object. This is fine because time_t is
- word size or less. And _purely_ _theoretically_, even if
- time_t write is not atomic, likely the value of 'now' is
- the same in all threads and even if it is not, getting a
- "garbage" in table->update_time is justified because
- protecting it with a latch here would be too performance
- intrusive. */
- dict_table_t* table = it->first;
- table->update_time = now;
- }
+ if (undo_no && !mod_tables.empty())
+ {
+ const trx_id_t max_trx_id= trx_sys.get_max_trx_id();
+ const auto now= start_time;
- trx->mod_tables.clear();
+ for (const auto &p : mod_tables)
+ {
+ dict_table_t *table= p.first;
+ table->update_time= now;
+ table->query_cache_inv_trx_id= max_trx_id;
+ }
+ }
}
/** Evict a table definition due to the rollback of ALTER TABLE.
-@param[in] table_id table identifier */
-void trx_t::evict_table(table_id_t table_id)
+@param table_id table identifier
+@param reset_only whether to only reset dict_table_t::def_trx_id */
+void trx_t::evict_table(table_id_t table_id, bool reset_only)
{
ut_ad(in_rollback);
- dict_table_t* table = dict_table_open_on_id(
- table_id, true, DICT_TABLE_OP_OPEN_ONLY_IF_CACHED);
+ dict_table_t* table = dict_sys.find_table(table_id);
if (!table) {
return;
}
- if (!table->release()) {
+ table->def_trx_id = 0;
+
+ if (auto ref_count = table->get_ref_count()) {
/* This must be a DDL operation that is being rolled
back in an active connection. */
- ut_a(table->get_ref_count() == 1);
+ ut_a(ref_count == 1);
ut_ad(!is_recovered);
ut_ad(mysql_thd);
return;
}
+ if (reset_only) {
+ return;
+ }
+
/* This table should only be locked by this transaction, if at all. */
ut_ad(UT_LIST_GET_LEN(table->locks) <= 1);
const bool locked = UT_LIST_GET_LEN(table->locks);
@@ -1259,9 +1223,10 @@ void trx_t::evict_table(table_id_t table_id)
}
}
-/** Mark a transaction committed in the main memory data structures. */
-inline void trx_t::commit_in_memory(const mtr_t *mtr)
+TRANSACTIONAL_INLINE inline void trx_t::commit_in_memory(const mtr_t *mtr)
{
+ /* We already detached from rseg in trx_write_serialisation_history() */
+ ut_ad(!rsegs.m_redo.undo);
must_flush_log_later= false;
read_view.close();
@@ -1272,14 +1237,14 @@ inline void trx_t::commit_in_memory(const mtr_t *mtr)
ut_ad(!will_lock);
ut_a(!is_recovered);
ut_ad(!rsegs.m_redo.rseg);
+ ut_ad(!rsegs.m_redo.undo);
ut_ad(mysql_thd);
ut_ad(state == TRX_STATE_ACTIVE);
- /* Note: We are asserting without holding the lock mutex. But
- that is OK because this transaction is not waiting and cannot
- be rolled back and no new locks can (or should) be added
- because it is flagged as a non-locking read-only transaction. */
+ /* Note: We do not have to hold any lock_sys latch here, because
+ this is a non-locking transaction. */
ut_a(UT_LIST_GET_LEN(lock.trx_locks) == 0);
+ ut_ad(UT_LIST_GET_LEN(lock.evicted_tables) == 0);
/* This state change is not protected by any mutex, therefore
there is an inherent race here around state transition during
@@ -1309,7 +1274,7 @@ inline void trx_t::commit_in_memory(const mtr_t *mtr)
/* Wait for any implicit-to-explicit lock conversions to cease,
so that there will be no race condition in lock_release(). */
while (UNIV_UNLIKELY(is_referenced()))
- ut_delay(srv_spin_wait_delay);
+ LF_BACKOFF();
}
else
ut_ad(read_only || !rsegs.m_redo.rseg);
@@ -1320,25 +1285,15 @@ inline void trx_t::commit_in_memory(const mtr_t *mtr)
}
else
{
- trx_update_mod_tables_timestamp(this);
+ commit_tables();
MONITOR_INC(MONITOR_TRX_RW_COMMIT);
is_recovered= false;
}
- release_locks();
- id= 0;
- DEBUG_SYNC_C("after_trx_committed_in_memory");
-
- while (dict_table_t *table= UT_LIST_GET_FIRST(lock.evicted_tables))
- {
- UT_LIST_REMOVE(lock.evicted_tables, table);
- dict_mem_table_free(table);
- }
+ if (UNIV_LIKELY(!dict_operation))
+ release_locks();
}
- ut_ad(!rsegs.m_redo.undo);
- ut_ad(UT_LIST_GET_LEN(lock.evicted_tables) == 0);
-
if (mtr)
{
if (trx_undo_t *&undo= rsegs.m_noredo.undo)
@@ -1349,7 +1304,7 @@ inline void trx_t::commit_in_memory(const mtr_t *mtr)
}
/* NOTE that we could possibly make a group commit more efficient
- here: call os_thread_yield here to allow also other trxs to come
+ here: call std::this_thread::yield() here to allow also other trxs to come
to commit! */
/*-------------------------------------*/
@@ -1375,7 +1330,7 @@ inline void trx_t::commit_in_memory(const mtr_t *mtr)
serialize all commits and prevent a group of transactions from
gathering. */
- commit_lsn= mtr->commit_lsn();
+ commit_lsn= undo_no || !xid.is_null() ? mtr->commit_lsn() : 0;
if (!commit_lsn)
/* Nothing to be done. */;
else if (flush_log_later)
@@ -1387,10 +1342,7 @@ inline void trx_t::commit_in_memory(const mtr_t *mtr)
ut_ad(!rsegs.m_noredo.undo);
- /* Free all savepoints, starting from the first. */
- trx_named_savept_t *savep= UT_LIST_GET_FIRST(trx_savepoints);
-
- trx_roll_savepoints_free(this, savep);
+ savepoints_discard();
if (fts_trx)
trx_finalize_for_fts(this, undo_no != 0);
@@ -1404,50 +1356,60 @@ inline void trx_t::commit_in_memory(const mtr_t *mtr)
wsrep= false;
wsrep_commit_ordered(mysql_thd);
}
- lock.was_chosen_as_wsrep_victim= false;
#endif /* WITH_WSREP */
- trx_mutex_enter(this);
- dict_operation= TRX_DICT_OP_NONE;
+ lock.was_chosen_as_deadlock_victim= false;
+}
+
+void trx_t::commit_cleanup()
+{
+ ut_ad(!dict_operation);
+ ut_ad(!was_dict_operation);
- DBUG_LOG("trx", "Commit in memory: " << this);
+ mutex.wr_lock();
state= TRX_STATE_NOT_STARTED;
+ mod_tables.clear();
+ check_foreigns= true;
+ check_unique_secondary= true;
assert_freed();
trx_init(this);
- trx_mutex_exit(this);
+ mutex.wr_unlock();
ut_a(error_state == DB_SUCCESS);
- if (!srv_read_only_mode)
- srv_wake_purge_thread_if_not_active();
}
/** Commit the transaction in a mini-transaction.
@param mtr mini-transaction (if there are any persistent modifications) */
-void trx_t::commit_low(mtr_t *mtr)
+TRANSACTIONAL_TARGET void trx_t::commit_low(mtr_t *mtr)
{
ut_ad(!mtr || mtr->is_active());
- ut_d(bool aborted = in_rollback && error_state == DB_DEADLOCK);
+ ut_d(bool aborted= in_rollback && error_state == DB_DEADLOCK);
ut_ad(!mtr == (aborted || !has_logged()));
ut_ad(!mtr || !aborted);
- /* undo_no is non-zero if we're doing the final commit. */
if (fts_trx && undo_no)
{
ut_a(!is_autocommit_non_locking());
- /* FTS-FIXME: Temporarily tolerate DB_DUPLICATE_KEY instead of
- dying. This is a possible scenario if there is a crash between
+ /* MDEV-24088 FIXME: Invoke fts_commit() earlier (before possible
+ XA PREPARE), so that we will be able to return an error and rollback
+ the transaction, instead of violating consistency!
+
+ The original claim about DB_DUPLICATE KEY was:
+ This is a possible scenario if there is a crash between
insert to DELETED table committing and transaction committing. The
fix would be able to return error from this function */
- if (dberr_t error= fts_commit(this))
- ut_a(error == DB_DUPLICATE_KEY);
+ if (ut_d(dberr_t error=) fts_commit(this))
+ ut_ad(error == DB_DUPLICATE_KEY || error == DB_LOCK_WAIT_TIMEOUT);
}
-#ifndef DBUG_OFF
+#ifdef ENABLED_DEBUG_SYNC
const bool debug_sync= mysql_thd && has_logged_persistent();
#endif
if (mtr)
{
+ if (UNIV_UNLIKELY(apply_online_log))
+ apply_log();
trx_write_serialisation_history(this, mtr);
/* The following call commits the mini-transaction, making the
@@ -1471,12 +1433,10 @@ void trx_t::commit_low(mtr_t *mtr)
{
ut_ad(id);
ut_ad(!rsegs.m_redo.undo);
- mutex_enter(&rseg->mutex);
- --rseg->trx_ref_count;
- mutex_exit(&rseg->mutex);
+ rseg->release();
}
-#ifndef DBUG_OFF
+#ifdef ENABLED_DEBUG_SYNC
if (debug_sync)
DEBUG_SYNC_C("before_trx_state_committed_in_memory");
#endif
@@ -1485,7 +1445,7 @@ void trx_t::commit_low(mtr_t *mtr)
}
-void trx_t::commit()
+void trx_t::commit_persist()
{
mtr_t *mtr= nullptr;
mtr_t local_mtr;
@@ -1498,6 +1458,19 @@ void trx_t::commit()
commit_low(mtr);
}
+
+void trx_t::commit()
+{
+ ut_ad(!was_dict_operation);
+ ut_d(was_dict_operation= dict_operation);
+ dict_operation= false;
+ commit_persist();
+ ut_d(was_dict_operation= false);
+ ut_d(for (const auto &p : mod_tables) ut_ad(!p.second.is_dropped()));
+ commit_cleanup();
+}
+
+
/****************************************************************//**
Prepares a transaction for commit/rollback. */
void
@@ -1518,19 +1491,7 @@ trx_commit_or_rollback_prepare(
case TRX_STATE_ACTIVE:
case TRX_STATE_PREPARED:
case TRX_STATE_PREPARED_RECOVERED:
- /* If the trx is in a lock wait state, moves the waiting
- query thread to the suspended state */
-
- if (trx->lock.que_state == TRX_QUE_LOCK_WAIT) {
-
- ut_a(trx->lock.wait_thr != NULL);
- trx->lock.wait_thr->state = QUE_THR_SUSPENDED;
- trx->lock.wait_thr = NULL;
-
- trx->lock.que_state = TRX_QUE_RUNNING;
- }
-
- ut_ad(trx->lock.n_active_thrs == 1);
+ trx->lock.wait_thr = NULL;
return;
case TRX_STATE_COMMITTED_IN_MEMORY:
@@ -1583,14 +1544,11 @@ trx_commit_step(
trx = thr_get_trx(thr);
ut_a(trx->lock.wait_thr == NULL);
- ut_a(trx->lock.que_state != TRX_QUE_LOCK_WAIT);
trx_commit_or_rollback_prepare(trx);
- trx->lock.que_state = TRX_QUE_COMMITTING;
trx->commit();
ut_ad(trx->lock.wait_thr == NULL);
- trx->lock.que_state = TRX_QUE_RUNNING;
thr = NULL;
} else {
@@ -1618,17 +1576,12 @@ trx_commit_for_mysql(
switch (trx->state) {
case TRX_STATE_NOT_STARTED:
- ut_d(trx->start_file = __FILE__);
- ut_d(trx->start_line = __LINE__);
-
- trx_start_low(trx, true);
- /* fall through */
+ return DB_SUCCESS;
case TRX_STATE_ACTIVE:
case TRX_STATE_PREPARED:
case TRX_STATE_PREPARED_RECOVERED:
trx->op_info = "committing";
trx->commit();
- MONITOR_DEC(MONITOR_TRX_ACTIVE);
trx->op_info = "";
return(DB_SUCCESS);
case TRX_STATE_COMMITTED_IN_MEMORY:
@@ -1676,12 +1629,18 @@ trx_mark_sql_stat_end(
trx->undo_no = 0;
/* fall through */
case TRX_STATE_ACTIVE:
- trx->last_sql_stat_start.least_undo_no = trx->undo_no;
-
if (trx->fts_trx != NULL) {
fts_savepoint_laststmt_refresh(trx);
}
+ if (trx->is_bulk_insert()) {
+ /* Allow a subsequent INSERT into an empty table
+ if !unique_checks && !foreign_key_checks. */
+ return;
+ }
+
+ trx->last_sql_stat_start.least_undo_no = trx->undo_no;
+ trx->end_bulk_insert();
return;
}
@@ -1701,15 +1660,17 @@ trx_print_low(
/*!< in: max query length to print,
or 0 to use the default max length */
ulint n_rec_locks,
- /*!< in: lock_number_of_rows_locked(&trx->lock) */
+ /*!< in: trx->lock.n_rec_locks */
ulint n_trx_locks,
/*!< in: length of trx->lock.trx_locks */
ulint heap_size)
/*!< in: mem_heap_get_size(trx->lock.lock_heap) */
{
- ibool newline;
-
- fprintf(f, "TRANSACTION " TRX_ID_FMT, trx_get_id_for_print(trx));
+ if (const trx_id_t id = trx->id) {
+ fprintf(f, "TRANSACTION " TRX_ID_FMT, trx->id);
+ } else {
+ fprintf(f, "TRANSACTION (%p)", trx);
+ }
switch (trx->state) {
case TRX_STATE_NOT_STARTED:
@@ -1750,27 +1711,18 @@ state_ok:
(ulong) trx->mysql_n_tables_locked);
}
- newline = TRUE;
-
- /* trx->lock.que_state of an ACTIVE transaction may change
- while we are not holding trx->mutex. We perform a dirty read
- for performance reasons. */
-
- switch (trx->lock.que_state) {
- case TRX_QUE_RUNNING:
- newline = FALSE; break;
- case TRX_QUE_LOCK_WAIT:
- fputs("LOCK WAIT ", f); break;
- case TRX_QUE_ROLLING_BACK:
- fputs("ROLLING BACK ", f); break;
- case TRX_QUE_COMMITTING:
- fputs("COMMITTING ", f); break;
- default:
- fprintf(f, "que state %lu ", (ulong) trx->lock.que_state);
+ bool newline = true;
+
+ if (trx->in_rollback) { /* dirty read for performance reasons */
+ fputs("ROLLING BACK ", f);
+ } else if (trx->lock.wait_lock) {
+ fputs("LOCK WAIT ", f);
+ } else {
+ newline = false;
}
if (n_trx_locks > 0 || heap_size > 400) {
- newline = TRUE;
+ newline = true;
fprintf(f, "%lu lock struct(s), heap size %lu,"
" %lu row lock(s)",
@@ -1780,7 +1732,7 @@ state_ok:
}
if (trx->undo_no != 0) {
- newline = TRUE;
+ newline = true;
fprintf(f, ", undo log entries " TRX_ID_FMT, trx->undo_no);
}
@@ -1796,7 +1748,7 @@ state_ok:
/**********************************************************************//**
Prints info about a transaction.
-The caller must hold lock_sys.mutex.
+The caller must hold lock_sys.latch.
When possible, use trx_print() instead. */
void
trx_print_latched(
@@ -1806,17 +1758,18 @@ trx_print_latched(
ulint max_query_len) /*!< in: max query length to print,
or 0 to use the default max length */
{
- ut_ad(lock_mutex_own());
+ lock_sys.assert_locked();
trx_print_low(f, trx, max_query_len,
- lock_number_of_rows_locked(&trx->lock),
+ trx->lock.n_rec_locks,
UT_LIST_GET_LEN(trx->lock.trx_locks),
mem_heap_get_size(trx->lock.lock_heap));
}
/**********************************************************************//**
Prints info about a transaction.
-Acquires and releases lock_sys.mutex. */
+Acquires and releases lock_sys.latch. */
+TRANSACTIONAL_TARGET
void
trx_print(
/*======*/
@@ -1825,53 +1778,15 @@ trx_print(
ulint max_query_len) /*!< in: max query length to print,
or 0 to use the default max length */
{
- ulint n_rec_locks;
- ulint n_trx_locks;
- ulint heap_size;
-
- lock_mutex_enter();
- n_rec_locks = lock_number_of_rows_locked(&trx->lock);
- n_trx_locks = UT_LIST_GET_LEN(trx->lock.trx_locks);
- heap_size = mem_heap_get_size(trx->lock.lock_heap);
- lock_mutex_exit();
-
- trx_print_low(f, trx, max_query_len,
- n_rec_locks, n_trx_locks, heap_size);
-}
-
-/*******************************************************************//**
-Compares the "weight" (or size) of two transactions. Transactions that
-have edited non-transactional tables are considered heavier than ones
-that have not.
-@return TRUE if weight(a) >= weight(b) */
-bool
-trx_weight_ge(
-/*==========*/
- const trx_t* a, /*!< in: transaction to be compared */
- const trx_t* b) /*!< in: transaction to be compared */
-{
- ibool a_notrans_edit;
- ibool b_notrans_edit;
-
- /* If mysql_thd is NULL for a transaction we assume that it has
- not edited non-transactional tables. */
-
- a_notrans_edit = a->mysql_thd != NULL
- && thd_has_edited_nontrans_tables(a->mysql_thd);
-
- b_notrans_edit = b->mysql_thd != NULL
- && thd_has_edited_nontrans_tables(b->mysql_thd);
-
- if (a_notrans_edit != b_notrans_edit) {
-
- return(a_notrans_edit);
- }
-
- /* Either both had edited non-transactional tables or both had
- not, we fall back to comparing the number of altered/locked
- rows. */
+ ulint n_rec_locks, n_trx_locks, heap_size;
+ {
+ TMLockMutexGuard g{SRW_LOCK_CALL};
+ n_rec_locks= trx->lock.n_rec_locks;
+ n_trx_locks= UT_LIST_GET_LEN(trx->lock.trx_locks);
+ heap_size= mem_heap_get_size(trx->lock.lock_heap);
+ }
- return(TRX_WEIGHT(a) >= TRX_WEIGHT(b));
+ trx_print_low(f, trx, max_query_len, n_rec_locks, n_trx_locks, heap_size);
}
/** Prepare a transaction.
@@ -1888,11 +1803,7 @@ static lsn_t trx_prepare_low(trx_t *trx)
mtr.start();
mtr.set_log_mode(MTR_LOG_NO_REDO);
-
- mutex_enter(&undo->rseg->mutex);
trx_undo_set_state_at_prepare(trx, undo, false, &mtr);
- mutex_exit(&undo->rseg->mutex);
-
mtr.commit();
}
@@ -1903,8 +1814,7 @@ static lsn_t trx_prepare_low(trx_t *trx)
return(0);
}
- trx_rseg_t* rseg = trx->rsegs.m_redo.rseg;
- ut_ad(undo->rseg == rseg);
+ ut_ad(undo->rseg == trx->rsegs.m_redo.rseg);
mtr.start();
@@ -1912,10 +1822,7 @@ static lsn_t trx_prepare_low(trx_t *trx)
TRX_UNDO_PREPARED: these modifications to the file data
structure define the transaction as prepared in the file-based
world, at the serialization point of lsn. */
-
- mutex_enter(&rseg->mutex);
trx_undo_set_state_at_prepare(trx, undo, false, &mtr);
- mutex_exit(&rseg->mutex);
/* Make the XA PREPARE durable. */
mtr.commit();
@@ -1925,6 +1832,7 @@ static lsn_t trx_prepare_low(trx_t *trx)
/****************************************************************//**
Prepares a transaction. */
+TRANSACTIONAL_TARGET
static
void
trx_prepare(
@@ -1940,9 +1848,10 @@ trx_prepare(
DBUG_EXECUTE_IF("ib_trx_crash_during_xa_prepare_step", DBUG_SUICIDE(););
ut_a(trx->state == TRX_STATE_ACTIVE);
- trx_mutex_enter(trx);
- trx->state = TRX_STATE_PREPARED;
- trx_mutex_exit(trx);
+ {
+ TMTrxGuard tg{*trx};
+ trx->state = TRX_STATE_PREPARED;
+ }
if (lsn) {
/* Depending on the my.cnf options, we may now write the log
@@ -2005,7 +1914,7 @@ static my_bool trx_recover_for_mysql_callback(rw_trx_hash_element_t *element,
trx_recover_for_mysql_callback_arg *arg)
{
DBUG_ASSERT(arg->len > 0);
- mutex_enter(&element->mutex);
+ element->mutex.wr_lock();
if (trx_t *trx= element->trx)
{
/*
@@ -2027,11 +1936,11 @@ static my_bool trx_recover_for_mysql_callback(rw_trx_hash_element_t *element,
<< " in prepared state after recovery";
ib::info() << "Transaction contains changes to " << trx->undo_no
<< " rows";
- xid= *trx->xid;
+ xid= trx->xid;
}
}
}
- mutex_exit(&element->mutex);
+ element->mutex.wr_unlock();
/* Do not terminate upon reaching arg->len; count all transactions */
return false;
}
@@ -2040,13 +1949,13 @@ static my_bool trx_recover_for_mysql_callback(rw_trx_hash_element_t *element,
static my_bool trx_recover_reset_callback(rw_trx_hash_element_t *element,
void*)
{
- mutex_enter(&element->mutex);
+ element->mutex.wr_lock();
if (trx_t *trx= element->trx)
{
if (trx_state_eq(trx, TRX_STATE_PREPARED_RECOVERED))
trx->state= TRX_STATE_PREPARED;
}
- mutex_exit(&element->mutex);
+ element->mutex.wr_unlock();
return false;
}
@@ -2095,29 +2004,29 @@ static my_bool trx_get_trx_by_xid_callback(rw_trx_hash_element_t *element,
trx_get_trx_by_xid_callback_arg *arg)
{
my_bool found= 0;
- mutex_enter(&element->mutex);
+ element->mutex.wr_lock();
if (trx_t *trx= element->trx)
{
- trx_mutex_enter(trx);
+ trx->mutex_lock();
if (trx->is_recovered &&
(trx_state_eq(trx, TRX_STATE_PREPARED) ||
trx_state_eq(trx, TRX_STATE_PREPARED_RECOVERED)) &&
- arg->xid->eq(reinterpret_cast<XID*>(trx->xid)))
+ arg->xid->eq(&trx->xid))
{
#ifdef WITH_WSREP
/* The commit of a prepared recovered Galera
transaction needs a valid trx->xid for
invoking trx_sys_update_wsrep_checkpoint(). */
- if (!wsrep_is_wsrep_xid(trx->xid))
+ if (!wsrep_is_wsrep_xid(&trx->xid))
#endif /* WITH_WSREP */
/* Invalidate the XID, so that subsequent calls will not find it. */
- trx->xid->null();
+ trx->xid.null();
arg->trx= trx;
found= 1;
}
- trx_mutex_exit(trx);
+ trx->mutex_unlock();
}
- mutex_exit(&element->mutex);
+ element->mutex.wr_unlock();
return found;
}
@@ -2198,65 +2107,24 @@ trx_start_if_not_started_low(
ut_error;
}
-/*************************************************************//**
-Starts a transaction for internal processing. */
-void
-trx_start_internal_low(
-/*===================*/
- trx_t* trx) /*!< in/out: transaction */
-{
- /* Ensure it is not flagged as an auto-commit-non-locking
- transaction. */
-
- trx->will_lock = true;
-
- trx->internal = true;
-
- trx_start_low(trx, true);
-}
-
-/** Starts a read-only transaction for internal processing.
-@param[in,out] trx transaction to be started */
-void
-trx_start_internal_read_only_low(
- trx_t* trx)
+/**
+Start a transaction for internal processing.
+@param trx transaction
+@param read_write whether writes may be performed */
+void trx_start_internal_low(trx_t *trx, bool read_write)
{
- /* Ensure it is not flagged as an auto-commit-non-locking
- transaction. */
-
- trx->will_lock = true;
-
- trx->internal = true;
-
- trx_start_low(trx, false);
+ trx->will_lock= true;
+ trx_start_low(trx, read_write);
}
-/*************************************************************//**
-Starts the transaction for a DDL operation. */
-void
-trx_start_for_ddl_low(
-/*==================*/
- trx_t* trx, /*!< in/out: transaction */
- trx_dict_op_t op) /*!< in: dictionary operation type */
+/** Start a transaction for a DDL operation.
+@param trx transaction */
+void trx_start_for_ddl_low(trx_t *trx)
{
- switch (trx->state) {
- case TRX_STATE_NOT_STARTED:
- /* Flag this transaction as a dictionary operation, so that
- the data dictionary will be locked in crash recovery. */
-
- trx_set_dict_operation(trx, op);
- trx->ddl= true;
- trx_start_internal_low(trx);
- return;
-
- case TRX_STATE_ACTIVE:
- case TRX_STATE_PREPARED:
- case TRX_STATE_PREPARED_RECOVERED:
- case TRX_STATE_COMMITTED_IN_MEMORY:
- break;
- }
-
- ut_error;
+ /* Flag this transaction as a dictionary operation, so that
+ the data dictionary will be locked in crash recovery. */
+ trx->dict_operation= true;
+ trx_start_internal_low(trx, true);
}
/*************************************************************//**
@@ -2287,16 +2155,3 @@ trx_set_rw_mode(
trx->read_view.set_creator_trx_id(trx->id);
}
}
-
-bool trx_t::has_stats_table_lock() const
-{
- for (lock_list::const_iterator it= lock.table_locks.begin(),
- end= lock.table_locks.end(); it != end; ++it)
- {
- const lock_t *lock= *it;
- if (lock && lock->un_member.tab_lock.table->is_stats_table())
- return true;
- }
-
- return false;
-}
diff --git a/storage/innobase/trx/trx0undo.cc b/storage/innobase/trx/trx0undo.cc
index a69b748d78b..33b1f93ff65 100644
--- a/storage/innobase/trx/trx0undo.cc
+++ b/storage/innobase/trx/trx0undo.cc
@@ -119,7 +119,7 @@ uint16_t trx_undo_page_get_start(const buf_block_t *block, uint32_t page_no,
uint16_t offset)
{
return page_no == block->page.id().page_no()
- ? mach_read_from_2(offset + TRX_UNDO_LOG_START + block->frame)
+ ? mach_read_from_2(offset + TRX_UNDO_LOG_START + block->page.frame)
: TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_HDR_SIZE;
}
@@ -135,7 +135,7 @@ trx_undo_page_get_first_rec(const buf_block_t *block, uint32_t page_no,
{
uint16_t start= trx_undo_page_get_start(block, page_no, offset);
return start == trx_undo_page_get_end(block, page_no, offset)
- ? nullptr : block->frame + start;
+ ? nullptr : block->page.frame + start;
}
/** Get the last undo log record on a page.
@@ -151,7 +151,8 @@ trx_undo_page_get_last_rec(const buf_block_t *block, uint32_t page_no,
{
uint16_t end= trx_undo_page_get_end(block, page_no, offset);
return trx_undo_page_get_start(block, page_no, offset) == end
- ? nullptr : block->frame + mach_read_from_2(block->frame + end - 2);
+ ? nullptr
+ : block->page.frame + mach_read_from_2(block->page.frame + end - 2);
}
/** Get the previous record in an undo log from the previous page.
@@ -167,18 +168,18 @@ trx_undo_get_prev_rec_from_prev_page(buf_block_t *&block, uint16_t rec,
uint32_t page_no, uint16_t offset,
bool shared, mtr_t *mtr)
{
- uint32_t prev_page_no= flst_get_prev_addr(TRX_UNDO_PAGE_HDR +
- TRX_UNDO_PAGE_NODE +
- block->frame).page;
+ uint32_t prev_page_no= mach_read_from_4(TRX_UNDO_PAGE_HDR +
+ TRX_UNDO_PAGE_NODE +
+ FLST_PREV + FIL_ADDR_PAGE +
+ block->page.frame);
if (prev_page_no == FIL_NULL)
return nullptr;
block= buf_page_get(page_id_t(block->page.id().space(), prev_page_no),
0, shared ? RW_S_LATCH : RW_X_LATCH, mtr);
- buf_block_dbg_add_level(block, SYNC_TRX_UNDO_PAGE);
- return trx_undo_page_get_last_rec(block, page_no, offset);
+ return block ? trx_undo_page_get_last_rec(block, page_no, offset) : nullptr;
}
/** Get the previous undo log record.
@@ -193,10 +194,11 @@ trx_undo_rec_t*
trx_undo_page_get_prev_rec(const buf_block_t *block, trx_undo_rec_t *rec,
uint32_t page_no, uint16_t offset)
{
- ut_ad(block->frame == page_align(rec));
- return rec == block->frame + trx_undo_page_get_start(block, page_no, offset)
+ ut_ad(block->page.frame == page_align(rec));
+ return
+ rec == block->page.frame + trx_undo_page_get_start(block, page_no, offset)
? nullptr
- : block->frame + mach_read_from_2(rec - 2);
+ : block->page.frame + mach_read_from_2(rec - 2);
}
/** Get the previous record in an undo log.
@@ -212,7 +214,7 @@ trx_undo_get_prev_rec(buf_block_t *&block, uint16_t rec, uint32_t page_no,
uint16_t offset, bool shared, mtr_t *mtr)
{
if (trx_undo_rec_t *prev= trx_undo_page_get_prev_rec(block,
- block->frame + rec,
+ block->page.frame + rec,
page_no, offset))
return prev;
@@ -231,22 +233,24 @@ trx_undo_get_prev_rec(buf_block_t *&block, uint16_t rec, uint32_t page_no,
@param[in,out] mtr mini-transaction
@return undo log record, the page latched, NULL if none */
static trx_undo_rec_t*
-trx_undo_get_next_rec_from_next_page(buf_block_t *&block, uint32_t page_no,
- uint16_t offset, ulint mode, mtr_t *mtr)
+trx_undo_get_next_rec_from_next_page(const buf_block_t *&block,
+ uint32_t page_no, uint16_t offset,
+ ulint mode, mtr_t *mtr)
{
if (page_no == block->page.id().page_no() &&
- mach_read_from_2(block->frame + offset + TRX_UNDO_NEXT_LOG))
- return NULL;
+ mach_read_from_2(block->page.frame + offset + TRX_UNDO_NEXT_LOG))
+ return nullptr;
- uint32_t next= flst_get_next_addr(TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_NODE +
- block->frame).page;
+ uint32_t next= mach_read_from_4(TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_NODE +
+ FLST_NEXT + FIL_ADDR_PAGE +
+ block->page.frame);
if (next == FIL_NULL)
- return NULL;
+ return nullptr;
- block= buf_page_get(page_id_t(block->page.id().space(), next), 0, mode, mtr);
- buf_block_dbg_add_level(block, SYNC_TRX_UNDO_PAGE);
+ block= buf_page_get_gen(page_id_t(block->page.id().space(), next), 0, mode,
+ nullptr, BUF_GET_POSSIBLY_FREED, mtr);
- return trx_undo_page_get_first_rec(block, page_no, offset);
+ return block ? trx_undo_page_get_first_rec(block, page_no, offset) : nullptr;
}
/** Get the next record in an undo log.
@@ -257,8 +261,8 @@ trx_undo_get_next_rec_from_next_page(buf_block_t *&block, uint32_t page_no,
@param[in,out] mtr mini-transaction
@return undo log record, the page latched, NULL if none */
trx_undo_rec_t*
-trx_undo_get_next_rec(buf_block_t *&block, uint16_t rec, uint32_t page_no,
- uint16_t offset, mtr_t *mtr)
+trx_undo_get_next_rec(const buf_block_t *&block, uint16_t rec,
+ uint32_t page_no, uint16_t offset, mtr_t *mtr)
{
if (trx_undo_rec_t *next= trx_undo_page_get_next_rec(block, rec, page_no,
offset))
@@ -275,14 +279,18 @@ trx_undo_get_next_rec(buf_block_t *&block, uint16_t rec, uint32_t page_no,
@param[in] mode latching mode: RW_S_LATCH or RW_X_LATCH
@param[out] block undo log page
@param[in,out] mtr mini-transaction
-@return undo log record, the page latched, NULL if none */
+@param[out] err error code
+@return undo log record, the page latched
+@retval nullptr if none */
trx_undo_rec_t*
trx_undo_get_first_rec(const fil_space_t &space, uint32_t page_no,
- uint16_t offset, ulint mode, buf_block_t*& block,
- mtr_t *mtr)
+ uint16_t offset, ulint mode, const buf_block_t*& block,
+ mtr_t *mtr, dberr_t *err)
{
- block = buf_page_get(page_id_t(space.id, page_no), 0, mode, mtr);
- buf_block_dbg_add_level(block, SYNC_TRX_UNDO_PAGE);
+ block= buf_page_get_gen(page_id_t{space.id, page_no}, 0, mode,
+ nullptr, BUF_GET, mtr, err);
+ if (!block)
+ return nullptr;
if (trx_undo_rec_t *rec= trx_undo_page_get_first_rec(block, page_no, offset))
return rec;
@@ -291,37 +299,151 @@ trx_undo_get_first_rec(const fil_space_t &space, uint32_t page_no,
mtr);
}
+inline void UndorecApplier::assign_rec(const buf_block_t &block,
+ uint16_t offset)
+{
+ ut_ad(block.page.lock.have_s());
+ this->offset= offset;
+ this->undo_rec= trx_undo_rec_copy(block.page.frame + offset, heap);
+}
+
+inline void UndorecApplier::apply_undo_rec()
+{
+ if (!undo_rec)
+ return;
+ bool updated_extern= false;
+ undo_no_t undo_no= 0;
+ table_id_t table_id= 0;
+ undo_rec= trx_undo_rec_get_pars(undo_rec, &type,
+ &cmpl_info,
+ &updated_extern, &undo_no, &table_id);
+ dict_sys.freeze(SRW_LOCK_CALL);
+ dict_table_t *table= dict_sys.find_table(table_id);
+ dict_sys.unfreeze();
+
+ ut_ad(table);
+ if (!table->is_active_ddl())
+ return;
+
+ dict_index_t *index= dict_table_get_first_index(table);
+ const dtuple_t *undo_tuple;
+ switch (type) {
+ default:
+ ut_ad("invalid type" == 0);
+ MY_ASSERT_UNREACHABLE();
+ case TRX_UNDO_INSERT_REC:
+ undo_rec= trx_undo_rec_get_row_ref(undo_rec, index, &undo_tuple, heap);
+ insert:
+ log_insert(*undo_tuple, index);
+ break;
+ case TRX_UNDO_UPD_EXIST_REC:
+ case TRX_UNDO_UPD_DEL_REC:
+ case TRX_UNDO_DEL_MARK_REC:
+ trx_id_t trx_id;
+ roll_ptr_t roll_ptr;
+ byte info_bits;
+ undo_rec= trx_undo_update_rec_get_sys_cols(
+ undo_rec, &trx_id, &roll_ptr, &info_bits);
+
+ undo_rec= trx_undo_rec_get_row_ref(undo_rec, index, &undo_tuple, heap);
+ undo_rec= trx_undo_update_rec_get_update(undo_rec, index, type, trx_id,
+ roll_ptr, info_bits,
+ heap, &update);
+ if (type == TRX_UNDO_UPD_DEL_REC)
+ goto insert;
+ log_update(*undo_tuple, index);
+ }
+
+ clear_undo_rec();
+}
+
+/** Apply any changes to tables for which online DDL is in progress. */
+ATTRIBUTE_COLD void trx_t::apply_log()
+{
+ const trx_undo_t *undo= rsegs.m_redo.undo;
+ if (!undo || !undo_no)
+ return;
+ page_id_t page_id{rsegs.m_redo.rseg->space->id, undo->hdr_page_no};
+ page_id_t next_page_id(page_id);
+ mtr_t mtr;
+ mtr.start();
+ buf_block_t *block= buf_page_get(page_id, 0, RW_S_LATCH, &mtr);
+ if (UNIV_UNLIKELY(!block))
+ {
+ mtr.commit();
+ return;
+ }
+
+ UndorecApplier log_applier(page_id, id);
+
+ for (;;)
+ {
+ trx_undo_rec_t *rec= trx_undo_page_get_first_rec(block, page_id.page_no(),
+ undo->hdr_offset);
+ while (rec)
+ {
+ log_applier.assign_rec(*block, page_offset(rec));
+ mtr.commit();
+ log_applier.apply_undo_rec();
+ mtr.start();
+ block= buf_page_get(log_applier.get_page_id(), 0, RW_S_LATCH, &mtr);
+ if (UNIV_UNLIKELY(!block))
+ goto func_exit;
+ rec= trx_undo_page_get_next_rec(block, log_applier.get_offset(),
+ page_id.page_no(), undo->hdr_offset);
+ }
+
+ uint32_t next= mach_read_from_4(TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_NODE +
+ FLST_NEXT + FIL_ADDR_PAGE +
+ block->page.frame);
+ if (next == FIL_NULL)
+ break;
+ next_page_id.set_page_no(next);
+ mtr.commit();
+ mtr.start();
+ block= buf_page_get_gen(next_page_id, 0, RW_S_LATCH, block, BUF_GET, &mtr);
+ if (UNIV_UNLIKELY(!block))
+ break;
+ log_applier.assign_next(next_page_id);
+ }
+func_exit:
+ mtr.commit();
+ apply_online_log= false;
+}
+
/*============== UNDO LOG FILE COPY CREATION AND FREEING ==================*/
/** Initialize an undo log page.
NOTE: This corresponds to a redo log record and must not be changed!
@see mtr_t::undo_create()
-@param[in,out] block undo log page */
+@param block undo log page */
void trx_undo_page_init(const buf_block_t &block)
{
- mach_write_to_2(my_assume_aligned<2>(FIL_PAGE_TYPE + block.frame),
+ mach_write_to_2(my_assume_aligned<2>(FIL_PAGE_TYPE + block.page.frame),
FIL_PAGE_UNDO_LOG);
static_assert(TRX_UNDO_PAGE_HDR == FIL_PAGE_DATA, "compatibility");
- memset_aligned<2>(TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_TYPE + block.frame,
+ memset_aligned<2>(TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_TYPE + block.page.frame,
0, 2);
mach_write_to_2(my_assume_aligned<2>
- (TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_START + block.frame),
+ (TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_START + block.page.frame),
TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_HDR_SIZE);
- memcpy_aligned<2>(TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_FREE + block.frame,
- TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_START + block.frame, 2);
+ memcpy_aligned<2>(TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_FREE + block.page.frame,
+ TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_START + block.page.frame,
+ 2);
/* The following corresponds to flst_zero_both(), but without writing log. */
memset_aligned<4>(TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_NODE + FLST_PREV +
- FIL_ADDR_PAGE + block.frame, 0xff, 4);
+ FIL_ADDR_PAGE + block.page.frame, 0xff, 4);
memset_aligned<2>(TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_NODE + FLST_PREV +
- FIL_ADDR_BYTE + block.frame, 0, 2);
+ FIL_ADDR_BYTE + block.page.frame, 0, 2);
memset_aligned<2>(TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_NODE + FLST_NEXT +
- FIL_ADDR_PAGE + block.frame, 0xff, 4);
+ FIL_ADDR_PAGE + block.page.frame, 0xff, 4);
memset_aligned<2>(TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_NODE + FLST_NEXT +
- FIL_ADDR_BYTE + block.frame, 0, 2);
+ FIL_ADDR_BYTE + block.page.frame, 0, 2);
static_assert(TRX_UNDO_PAGE_NODE + FLST_NEXT + FIL_ADDR_BYTE + 2 ==
TRX_UNDO_PAGE_HDR_SIZE, "compatibility");
/* Preserve TRX_UNDO_SEG_HDR, but clear the rest of the page. */
- memset_aligned<2>(TRX_UNDO_SEG_HDR + TRX_UNDO_SEG_HDR_SIZE + block.frame, 0,
+ memset_aligned<2>(TRX_UNDO_SEG_HDR + TRX_UNDO_SEG_HDR_SIZE +
+ block.page.frame, 0,
srv_page_size - (TRX_UNDO_SEG_HDR + TRX_UNDO_SEG_HDR_SIZE +
FIL_PAGE_DATA_END));
}
@@ -361,7 +483,6 @@ trx_undo_seg_create(fil_space_t *space, buf_block_t *rseg_hdr, ulint *id,
{
buf_block_t* block;
uint32_t n_reserved;
- bool success;
const ulint slot_no = trx_rsegf_undo_find_free(rseg_hdr);
@@ -376,45 +497,42 @@ trx_undo_seg_create(fil_space_t *space, buf_block_t *rseg_hdr, ulint *id,
ut_ad(slot_no < TRX_RSEG_N_SLOTS);
- success = fsp_reserve_free_extents(&n_reserved, space, 2, FSP_UNDO,
+ *err = fsp_reserve_free_extents(&n_reserved, space, 2, FSP_UNDO,
mtr);
- if (!success) {
- *err = DB_OUT_OF_FILE_SPACE;
+ if (UNIV_UNLIKELY(*err != DB_SUCCESS)) {
return NULL;
}
/* Allocate a new file segment for the undo log */
block = fseg_create(space, TRX_UNDO_SEG_HDR + TRX_UNDO_FSEG_HEADER,
- mtr, true);
+ mtr, err, true);
space->release_free_extents(n_reserved);
- if (block == NULL) {
- *err = DB_OUT_OF_FILE_SPACE;
- return NULL;
+ if (!block) {
+ return block;
}
- buf_block_dbg_add_level(block, SYNC_TRX_UNDO_PAGE);
-
mtr->undo_create(*block);
trx_undo_page_init(*block);
mtr->write<2>(*block, TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_FREE
- + block->frame,
+ + block->page.frame,
TRX_UNDO_SEG_HDR + TRX_UNDO_SEG_HDR_SIZE);
mtr->write<2,mtr_t::MAYBE_NOP>(*block,
TRX_UNDO_SEG_HDR + TRX_UNDO_LAST_LOG
- + block->frame, 0U);
+ + block->page.frame, 0U);
- flst_init(*block, TRX_UNDO_SEG_HDR + TRX_UNDO_PAGE_LIST + block->frame,
- mtr);
+ flst_init(*block, TRX_UNDO_SEG_HDR + TRX_UNDO_PAGE_LIST
+ + block->page.frame, mtr);
- flst_add_last(block, TRX_UNDO_SEG_HDR + TRX_UNDO_PAGE_LIST,
- block, TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_NODE, mtr);
+ *err = flst_add_last(block, TRX_UNDO_SEG_HDR + TRX_UNDO_PAGE_LIST,
+ block, TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_NODE,
+ mtr);
*id = slot_no;
mtr->write<4>(*rseg_hdr, TRX_RSEG + TRX_RSEG_UNDO_SLOTS
- + slot_no * TRX_RSEG_SLOT_SIZE + rseg_hdr->frame,
+ + slot_no * TRX_RSEG_SLOT_SIZE + rseg_hdr->page.frame,
block->page.id().page_no());
MONITOR_INC(MONITOR_NUM_UNDO_SLOT_USED);
@@ -434,11 +552,11 @@ static uint16_t trx_undo_header_create(buf_block_t *undo_page, trx_id_t trx_id,
/* Reset the TRX_UNDO_PAGE_TYPE in case this page is being
repurposed after upgrading to MariaDB 10.3. */
byte *undo_type= my_assume_aligned<2>
- (TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_TYPE + undo_page->frame);
+ (TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_TYPE + undo_page->page.frame);
ut_ad(mach_read_from_2(undo_type) <= 2);
mtr->write<2,mtr_t::MAYBE_NOP>(*undo_page, undo_type, 0U);
byte *start= my_assume_aligned<4>(TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_START +
- undo_page->frame);
+ undo_page->page.frame);
const uint16_t free= mach_read_from_2(start + 2);
static_assert(TRX_UNDO_PAGE_START + 2 == TRX_UNDO_PAGE_FREE,
"compatibility");
@@ -452,22 +570,22 @@ static uint16_t trx_undo_header_create(buf_block_t *undo_page, trx_id_t trx_id,
mtr->memset(*undo_page, TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_START, 4,
start, 2);
uint16_t prev_log= mach_read_from_2(TRX_UNDO_SEG_HDR + TRX_UNDO_LAST_LOG +
- undo_page->frame);
+ undo_page->page.frame);
alignas(4) byte buf[4];
mach_write_to_2(buf, TRX_UNDO_ACTIVE);
mach_write_to_2(buf + 2, free);
static_assert(TRX_UNDO_STATE + 2 == TRX_UNDO_LAST_LOG, "compatibility");
static_assert(!((TRX_UNDO_SEG_HDR + TRX_UNDO_STATE) % 4), "alignment");
mtr->memcpy(*undo_page, my_assume_aligned<4>
- (TRX_UNDO_SEG_HDR + TRX_UNDO_STATE + undo_page->frame),
+ (TRX_UNDO_SEG_HDR + TRX_UNDO_STATE + undo_page->page.frame),
buf, 4);
if (prev_log)
- mtr->write<2>(*undo_page, prev_log + TRX_UNDO_NEXT_LOG + undo_page->frame,
- free);
+ mtr->write<2>(*undo_page, prev_log + TRX_UNDO_NEXT_LOG +
+ undo_page->page.frame, free);
mtr->write<8,mtr_t::MAYBE_NOP>(*undo_page, free + TRX_UNDO_TRX_ID +
- undo_page->frame, trx_id);
+ undo_page->page.frame, trx_id);
if (UNIV_UNLIKELY(mach_read_from_8(free + TRX_UNDO_TRX_NO +
- undo_page->frame) != 0))
+ undo_page->page.frame) != 0))
mtr->memset(undo_page, free + TRX_UNDO_TRX_NO, 8, 0);
/* Write TRX_UNDO_NEEDS_PURGE=1 and TRX_UNDO_LOG_START. */
@@ -476,14 +594,14 @@ static uint16_t trx_undo_header_create(buf_block_t *undo_page, trx_id_t trx_id,
static_assert(TRX_UNDO_NEEDS_PURGE + 2 == TRX_UNDO_LOG_START,
"compatibility");
mtr->memcpy<mtr_t::MAYBE_NOP>(*undo_page, free + TRX_UNDO_NEEDS_PURGE +
- undo_page->frame, buf, 4);
+ undo_page->page.frame, buf, 4);
/* Initialize all fields TRX_UNDO_XID_EXISTS to TRX_UNDO_HISTORY_NODE. */
if (prev_log)
{
mtr->memset(undo_page, free + TRX_UNDO_XID_EXISTS,
TRX_UNDO_PREV_LOG - TRX_UNDO_XID_EXISTS, 0);
mtr->write<2,mtr_t::MAYBE_NOP>(*undo_page, free + TRX_UNDO_PREV_LOG +
- undo_page->frame, prev_log);
+ undo_page->page.frame, prev_log);
static_assert(TRX_UNDO_PREV_LOG + 2 == TRX_UNDO_HISTORY_NODE,
"compatibility");
mtr->memset(undo_page, free + TRX_UNDO_HISTORY_NODE, FLST_NODE_SIZE, 0);
@@ -511,9 +629,9 @@ static void trx_undo_write_xid(buf_block_t *block, uint16_t offset,
static_assert(MAXGTRIDSIZE + MAXBQUALSIZE == XIDDATASIZE,
"gtrid and bqual don't fit xid data");
DBUG_ASSERT(mach_read_from_2(TRX_UNDO_SEG_HDR + TRX_UNDO_LAST_LOG +
- block->frame) == offset);
+ block->page.frame) == offset);
- trx_ulogf_t* log_hdr= block->frame + offset;
+ trx_ulogf_t* log_hdr= block->page.frame + offset;
mtr->write<4,mtr_t::MAYBE_NOP>(*block, log_hdr + TRX_UNDO_XA_FORMAT,
static_cast<uint32_t>(xid.formatID));
@@ -523,7 +641,7 @@ static void trx_undo_write_xid(buf_block_t *block, uint16_t offset,
static_cast<uint32_t>(xid.bqual_length));
const ulint xid_length= static_cast<ulint>(xid.gtrid_length
+ xid.bqual_length);
- mtr->memcpy(*block, &block->frame[offset + TRX_UNDO_XA_XID],
+ mtr->memcpy(*block, &block->page.frame[offset + TRX_UNDO_XA_XID],
xid.data, xid_length);
if (UNIV_LIKELY(xid_length < XIDDATASIZE))
mtr->memset(block, offset + TRX_UNDO_XA_XID + xid_length,
@@ -551,54 +669,58 @@ trx_undo_read_xid(const trx_ulogf_t* log_hdr, XID* xid)
/** Allocate an undo log page.
@param[in,out] undo undo log
@param[in,out] mtr mini-transaction that does not hold any page latch
+@param[out] err error code
@return X-latched block if success
-@retval NULL on failure */
-buf_block_t* trx_undo_add_page(trx_undo_t* undo, mtr_t* mtr)
+@retval nullptr on failure */
+buf_block_t *trx_undo_add_page(trx_undo_t *undo, mtr_t *mtr, dberr_t *err)
{
- trx_rseg_t* rseg = undo->rseg;
- buf_block_t* new_block = NULL;
- uint32_t n_reserved;
-
- /* When we add a page to an undo log, this is analogous to
- a pessimistic insert in a B-tree, and we must reserve the
- counterpart of the tree latch, which is the rseg mutex. */
-
- mutex_enter(&rseg->mutex);
-
- buf_block_t* header_block = trx_undo_page_get(
- page_id_t(undo->rseg->space->id, undo->hdr_page_no), mtr);
-
- if (!fsp_reserve_free_extents(&n_reserved, undo->rseg->space, 1,
- FSP_UNDO, mtr)) {
- goto func_exit;
- }
-
- new_block = fseg_alloc_free_page_general(
- TRX_UNDO_SEG_HDR + TRX_UNDO_FSEG_HEADER
- + header_block->frame,
- undo->top_page_no + 1, FSP_UP, true, mtr, mtr);
-
- rseg->space->release_free_extents(n_reserved);
-
- if (!new_block) {
- goto func_exit;
- }
-
- ut_ad(rw_lock_get_x_lock_count(&new_block->lock) == 1);
- buf_block_dbg_add_level(new_block, SYNC_TRX_UNDO_PAGE);
- undo->last_page_no = new_block->page.id().page_no();
-
- mtr->undo_create(*new_block);
- trx_undo_page_init(*new_block);
-
- flst_add_last(header_block, TRX_UNDO_SEG_HDR + TRX_UNDO_PAGE_LIST,
- new_block, TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_NODE, mtr);
- undo->size++;
- rseg->curr_size++;
+ buf_block_t *new_block= nullptr;
+ uint32_t n_reserved;
+
+ /* When we add a page to an undo log, this is analogous to
+ a pessimistic insert in a B-tree, and we must reserve the
+ counterpart of the tree latch, which is the rseg mutex. */
+
+ trx_rseg_t *rseg= undo->rseg;
+ rseg->latch.wr_lock(SRW_LOCK_CALL);
+
+ buf_block_t *header_block=
+ buf_page_get_gen(page_id_t{rseg->space->id, undo->hdr_page_no},
+ 0, RW_X_LATCH, nullptr, BUF_GET, mtr, err);
+ if (!header_block)
+ goto func_exit;
+ *err= fsp_reserve_free_extents(&n_reserved, rseg->space, 1, FSP_UNDO, mtr);
+
+ if (UNIV_UNLIKELY(*err != DB_SUCCESS))
+ goto func_exit;
+
+ new_block=
+ fseg_alloc_free_page_general(TRX_UNDO_SEG_HDR + TRX_UNDO_FSEG_HEADER +
+ header_block->page.frame,
+ undo->top_page_no + 1, FSP_UP, true,
+ mtr, mtr, err);
+ rseg->space->release_free_extents(n_reserved);
+
+ if (!new_block)
+ goto func_exit;
+
+ undo->last_page_no= new_block->page.id().page_no();
+
+ mtr->undo_create(*new_block);
+ trx_undo_page_init(*new_block);
+ *err= flst_add_last(header_block, TRX_UNDO_SEG_HDR + TRX_UNDO_PAGE_LIST,
+ new_block, TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_NODE, mtr);
+ if (UNIV_UNLIKELY(*err != DB_SUCCESS))
+ new_block= nullptr;
+ else
+ {
+ undo->size++;
+ rseg->curr_size++;
+ }
func_exit:
- mutex_exit(&rseg->mutex);
- return(new_block);
+ rseg->latch.wr_unlock();
+ return new_block;
}
/********************************************************************//**
@@ -614,41 +736,58 @@ trx_undo_free_page(
uint32_t hdr_page_no, /*!< in: header page number */
uint32_t page_no, /*!< in: page number to free: must not be the
header page */
- mtr_t* mtr) /*!< in: mtr which does not have a latch to any
+ mtr_t* mtr, /*!< in: mtr which does not have a latch to any
undo log page; the caller must have reserved
the rollback segment mutex */
+ dberr_t* err) /*!< out: error code */
{
- const ulint space = rseg->space->id;
-
ut_a(hdr_page_no != page_no);
- ut_ad(mutex_own(&(rseg->mutex)));
- buf_block_t* undo_block = trx_undo_page_get(page_id_t(space, page_no),
- mtr);
- buf_block_t* header_block = trx_undo_page_get(page_id_t(space,
- hdr_page_no),
- mtr);
+ buf_block_t* undo_block = buf_page_get_gen(page_id_t(rseg->space->id,
+ page_no),
+ 0, RW_X_LATCH, nullptr,
+ BUF_GET, mtr, err);
+ if (UNIV_UNLIKELY(!undo_block)) {
+ return FIL_NULL;
+ }
+ buf_block_t* header_block = buf_page_get_gen(page_id_t(rseg->space->id,
+ hdr_page_no),
+ 0, RW_X_LATCH, nullptr,
+ BUF_GET, mtr, err);
+ if (UNIV_UNLIKELY(!header_block)) {
+ return FIL_NULL;
+ }
- flst_remove(header_block, TRX_UNDO_SEG_HDR + TRX_UNDO_PAGE_LIST,
- undo_block, TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_NODE, mtr);
+ *err = flst_remove(header_block, TRX_UNDO_SEG_HDR + TRX_UNDO_PAGE_LIST,
+ undo_block, TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_NODE,
+ mtr);
+
+ if (UNIV_UNLIKELY(*err != DB_SUCCESS)) {
+ return FIL_NULL;
+ }
- fseg_free_page(TRX_UNDO_SEG_HDR + TRX_UNDO_FSEG_HEADER
- + header_block->frame,
- rseg->space, page_no, mtr);
- buf_page_free(rseg->space, page_no, mtr, __FILE__, __LINE__);
+ *err = fseg_free_page(TRX_UNDO_SEG_HDR + TRX_UNDO_FSEG_HEADER
+ + header_block->page.frame,
+ rseg->space, page_no, mtr);
+ if (UNIV_UNLIKELY(*err != DB_SUCCESS)) {
+ return FIL_NULL;
+ }
+ buf_page_free(rseg->space, page_no, mtr);
const fil_addr_t last_addr = flst_get_last(
- TRX_UNDO_SEG_HDR + TRX_UNDO_PAGE_LIST + header_block->frame);
+ TRX_UNDO_SEG_HDR + TRX_UNDO_PAGE_LIST
+ + header_block->page.frame);
rseg->curr_size--;
- if (in_history) {
- buf_block_t* rseg_header = trx_rsegf_get(
- rseg->space, rseg->page_no, mtr);
+ if (!in_history) {
+ } else if (buf_block_t* rseg_header = rseg->get(mtr, err)) {
byte* rseg_hist_size = TRX_RSEG + TRX_RSEG_HISTORY_SIZE
- + rseg_header->frame;
+ + rseg_header->page.frame;
uint32_t hist_size = mach_read_from_4(rseg_hist_size);
ut_ad(hist_size > 0);
mtr->write<4>(*rseg_header, rseg_hist_size, hist_size - 1);
+ } else {
+ return FIL_NULL;
}
return(last_addr.page);
@@ -657,73 +796,101 @@ trx_undo_free_page(
/** Free the last undo log page. The caller must hold the rseg mutex.
@param[in,out] undo undo log
@param[in,out] mtr mini-transaction that does not hold any undo log page
- or that has allocated the undo log page */
-void
-trx_undo_free_last_page(trx_undo_t* undo, mtr_t* mtr)
+ or that has allocated the undo log page
+@return error code */
+dberr_t trx_undo_free_last_page(trx_undo_t *undo, mtr_t *mtr)
{
- ut_ad(undo->hdr_page_no != undo->last_page_no);
- ut_ad(undo->size > 0);
-
- undo->last_page_no = trx_undo_free_page(
- undo->rseg, false, undo->hdr_page_no, undo->last_page_no, mtr);
-
- undo->size--;
+ ut_ad(undo->hdr_page_no != undo->last_page_no);
+ ut_ad(undo->size > 0);
+ undo->size--;
+
+ dberr_t err;
+ undo->last_page_no= trx_undo_free_page(undo->rseg, false, undo->hdr_page_no,
+ undo->last_page_no, mtr, &err);
+ return err;
}
/** Truncate the tail of an undo log during rollback.
@param[in,out] undo undo log
@param[in] limit all undo logs after this limit will be discarded
-@param[in] is_temp whether this is temporary undo log */
-void trx_undo_truncate_end(trx_undo_t& undo, undo_no_t limit, bool is_temp)
+@param[in] is_temp whether this is temporary undo log
+@return error code */
+static dberr_t trx_undo_truncate_end(trx_undo_t &undo, undo_no_t limit,
+ bool is_temp)
{
- mtr_t mtr;
- ut_ad(is_temp == !undo.rseg->is_persistent());
+ ut_ad(is_temp == !undo.rseg->is_persistent());
- for (;;) {
- mtr.start();
- if (is_temp) {
- mtr.set_log_mode(MTR_LOG_NO_REDO);
- }
+ for (mtr_t mtr;;)
+ {
+ mtr.start();
+ if (is_temp)
+ mtr.set_log_mode(MTR_LOG_NO_REDO);
+
+ trx_undo_rec_t *trunc_here= nullptr;
+ undo.rseg->latch.wr_lock(SRW_LOCK_CALL);
+ dberr_t err;
+ buf_block_t *undo_block=
+ buf_page_get_gen(page_id_t{undo.rseg->space->id, undo.last_page_no},
+ 0, RW_X_LATCH, nullptr, BUF_GET, &mtr, &err);
+ if (UNIV_UNLIKELY(!undo_block))
+ goto func_exit;
+
+ for (trx_undo_rec_t *rec=
+ trx_undo_page_get_last_rec(undo_block,
+ undo.hdr_page_no, undo.hdr_offset);
+ rec; )
+ {
+ if (trx_undo_rec_get_undo_no(rec) < limit)
+ goto func_exit;
+ /* Truncate at least this record off, maybe more */
+ trunc_here= rec;
+ rec= trx_undo_page_get_prev_rec(undo_block, rec,
+ undo.hdr_page_no, undo.hdr_offset);
+ }
+
+ if (undo.last_page_no != undo.hdr_page_no)
+ {
+ err= trx_undo_free_last_page(&undo, &mtr);
+ if (UNIV_UNLIKELY(err != DB_SUCCESS))
+ goto func_exit;
+ undo.rseg->latch.wr_unlock();
+ mtr.commit();
+ continue;
+ }
- trx_undo_rec_t* trunc_here = NULL;
- mutex_enter(&undo.rseg->mutex);
- buf_block_t* undo_block = trx_undo_page_get(
- page_id_t(undo.rseg->space->id, undo.last_page_no),
- &mtr);
- trx_undo_rec_t* rec = trx_undo_page_get_last_rec(
- undo_block, undo.hdr_page_no, undo.hdr_offset);
- while (rec) {
- if (trx_undo_rec_get_undo_no(rec) < limit) {
- goto func_exit;
- }
- /* Truncate at least this record off, maybe more */
- trunc_here = rec;
-
- rec = trx_undo_page_get_prev_rec(undo_block, rec,
- undo.hdr_page_no,
- undo.hdr_offset);
- }
+func_exit:
+ undo.rseg->latch.wr_unlock();
- if (undo.last_page_no != undo.hdr_page_no) {
- trx_undo_free_last_page(&undo, &mtr);
- mutex_exit(&undo.rseg->mutex);
- mtr.commit();
- continue;
- }
+ if (trunc_here && err == DB_SUCCESS)
+ mtr.write<2>(*undo_block, TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_FREE +
+ undo_block->page.frame,
+ ulint(trunc_here - undo_block->page.frame));
-func_exit:
- mutex_exit(&undo.rseg->mutex);
+ mtr.commit();
+ return err;
+ }
+}
- if (trunc_here) {
- mtr.write<2>(*undo_block,
- TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_FREE
- + undo_block->frame,
- ulint(trunc_here - undo_block->frame));
- }
+/** Try to truncate the undo logs.
+@param trx transaction
+@return error code */
+dberr_t trx_undo_try_truncate(const trx_t &trx)
+{
+ if (trx_undo_t *undo= trx.rsegs.m_redo.undo)
+ {
+ ut_ad(undo->rseg == trx.rsegs.m_redo.rseg);
+ if (dberr_t err= trx_undo_truncate_end(*undo, trx.undo_no, false))
+ return err;
+ }
- mtr.commit();
- return;
- }
+ if (trx_undo_t *undo = trx.rsegs.m_noredo.undo)
+ {
+ ut_ad(undo->rseg == trx.rsegs.m_noredo.rseg);
+ if (dberr_t err= trx_undo_truncate_end(*undo, trx.undo_no, true))
+ return err;
+ }
+
+ return DB_SUCCESS;
}
/** Truncate the head of an undo log.
@@ -733,8 +900,9 @@ freed, but emptied, if all the records there are below the limit.
@param[in] hdr_page_no header page number
@param[in] hdr_offset header offset on the page
@param[in] limit first undo number to preserve
-(everything below the limit will be truncated) */
-void
+(everything below the limit will be truncated)
+@return error code */
+dberr_t
trx_undo_truncate_start(
trx_rseg_t* rseg,
uint32_t hdr_page_no,
@@ -745,10 +913,8 @@ trx_undo_truncate_start(
trx_undo_rec_t* last_rec;
mtr_t mtr;
- ut_ad(mutex_own(&(rseg->mutex)));
-
if (!limit) {
- return;
+ return DB_SUCCESS;
}
loop:
mtr_start(&mtr);
@@ -757,14 +923,15 @@ loop:
mtr.set_log_mode(MTR_LOG_NO_REDO);
}
- buf_block_t* undo_page;
+ dberr_t err;
+ const buf_block_t* undo_page;
rec = trx_undo_get_first_rec(*rseg->space, hdr_page_no, hdr_offset,
- RW_X_LATCH, undo_page, &mtr);
+ RW_X_LATCH, undo_page, &mtr, &err);
if (rec == NULL) {
/* Already empty */
done:
mtr.commit();
- return;
+ return err;
}
last_rec = trx_undo_page_get_last_rec(undo_page, hdr_page_no,
@@ -775,22 +942,24 @@ done:
if (undo_page->page.id().page_no() == hdr_page_no) {
uint16_t end = mach_read_from_2(hdr_offset + TRX_UNDO_NEXT_LOG
- + undo_page->frame);
+ + undo_page->page.frame);
if (end == 0) {
end = mach_read_from_2(TRX_UNDO_PAGE_HDR
+ TRX_UNDO_PAGE_FREE
- + undo_page->frame);
+ + undo_page->page.frame);
}
- mtr.write<2>(*undo_page, undo_page->frame + hdr_offset
+ mtr.write<2>(*undo_page, undo_page->page.frame + hdr_offset
+ TRX_UNDO_LOG_START, end);
} else {
trx_undo_free_page(rseg, true, hdr_page_no,
- undo_page->page.id().page_no(), &mtr);
+ undo_page->page.id().page_no(), &mtr, &err);
+ if (err != DB_SUCCESS) {
+ goto done;
+ }
}
- mtr_commit(&mtr);
-
+ mtr.commit();
goto loop;
}
@@ -798,37 +967,42 @@ done:
@param undo temporary undo log */
static void trx_undo_seg_free(const trx_undo_t *undo)
{
- ut_ad(undo->id < TRX_RSEG_N_SLOTS);
-
- trx_rseg_t* const rseg = undo->rseg;
- bool finished;
- mtr_t mtr;
- ut_ad(rseg->space == fil_system.temp_space);
-
- do {
- mtr.start();
- mtr.set_log_mode(MTR_LOG_NO_REDO);
+ ut_ad(undo->id < TRX_RSEG_N_SLOTS);
- buf_block_t* block = trx_undo_page_get(
- page_id_t(SRV_TMP_SPACE_ID, undo->hdr_page_no), &mtr);
+ trx_rseg_t *const rseg= undo->rseg;
+ bool finished;
+ mtr_t mtr;
+ ut_ad(rseg->space == fil_system.temp_space);
- fseg_header_t* file_seg = TRX_UNDO_SEG_HDR
- + TRX_UNDO_FSEG_HEADER + block->frame;
-
- finished = fseg_free_step(file_seg, &mtr);
-
- if (finished) {
- /* Update the rseg header */
- buf_block_t* rseg_header = trx_rsegf_get(
- rseg->space, rseg->page_no, &mtr);
- compile_time_assert(FIL_NULL == 0xffffffff);
- mtr.memset(rseg_header, TRX_RSEG + TRX_RSEG_UNDO_SLOTS
- + undo->id * TRX_RSEG_SLOT_SIZE, 4, 0xff);
- MONITOR_DEC(MONITOR_NUM_UNDO_SLOT_USED);
- }
-
- mtr.commit();
- } while (!finished);
+ do
+ {
+ mtr.start();
+ mtr.set_log_mode(MTR_LOG_NO_REDO);
+
+ finished= true;
+
+ if (buf_block_t *block=
+ buf_page_get(page_id_t(SRV_TMP_SPACE_ID, undo->hdr_page_no), 0,
+ RW_X_LATCH, &mtr))
+ {
+ fseg_header_t *file_seg= TRX_UNDO_SEG_HDR + TRX_UNDO_FSEG_HEADER +
+ block->page.frame;
+
+ finished= fseg_free_step(file_seg, &mtr);
+
+ if (!finished);
+ else if (buf_block_t* rseg_header = rseg->get(&mtr, nullptr))
+ {
+ static_assert(FIL_NULL == 0xffffffff, "compatibility");
+ mtr.memset(rseg_header, TRX_RSEG + TRX_RSEG_UNDO_SLOTS +
+ undo->id * TRX_RSEG_SLOT_SIZE, 4, 0xff);
+ MONITOR_DEC(MONITOR_NUM_UNDO_SLOT_USED);
+ }
+ }
+
+ mtr.commit();
+ }
+ while (!finished);
}
/*========== UNDO LOG MEMORY COPY INITIALIZATION =====================*/
@@ -848,22 +1022,26 @@ trx_undo_mem_create_at_db_start(trx_rseg_t *rseg, ulint id, uint32_t page_no)
ut_ad(id < TRX_RSEG_N_SLOTS);
mtr.start();
- const buf_block_t* block = trx_undo_page_get(
- page_id_t(rseg->space->id, page_no), &mtr);
+ const buf_block_t* block = buf_page_get(
+ page_id_t(rseg->space->id, page_no), 0, RW_X_LATCH, &mtr);
+ if (UNIV_UNLIKELY(!block)) {
+corrupted:
+ mtr.commit();
+ return nullptr;
+ }
+
const uint16_t type = mach_read_from_2(TRX_UNDO_PAGE_HDR
+ TRX_UNDO_PAGE_TYPE
- + block->frame);
+ + block->page.frame);
if (UNIV_UNLIKELY(type > 2)) {
corrupted_type:
sql_print_error("InnoDB: unsupported undo header type %u",
type);
-corrupted:
- mtr.commit();
- return nullptr;
+ goto corrupted;
}
uint16_t offset = mach_read_from_2(TRX_UNDO_SEG_HDR + TRX_UNDO_LAST_LOG
- + block->frame);
+ + block->page.frame);
if (offset < TRX_UNDO_SEG_HDR + TRX_UNDO_SEG_HDR_SIZE ||
offset >= srv_page_size - TRX_UNDO_LOG_OLD_HDR_SIZE) {
sql_print_error("InnoDB: invalid undo header offset %u",
@@ -871,9 +1049,10 @@ corrupted:
goto corrupted;
}
- const trx_ulogf_t* const undo_header = block->frame + offset;
+ const trx_ulogf_t* const undo_header = block->page.frame + offset;
uint16_t state = mach_read_from_2(TRX_UNDO_SEG_HDR + TRX_UNDO_STATE
- + block->frame);
+ + block->page.frame);
+
const trx_id_t trx_id= mach_read_from_8(undo_header + TRX_UNDO_TRX_ID);
if (trx_id >> 48) {
sql_print_error("InnoDB: corrupted TRX_ID %llx", trx_id);
@@ -930,34 +1109,39 @@ corrupted:
xid.null();
}
- mutex_enter(&rseg->mutex);
if (trx_no > rseg->needs_purge) {
rseg->needs_purge = trx_no;
}
+
trx_undo_t* undo = trx_undo_mem_create(
rseg, id, trx_id, &xid, page_no, offset);
- mutex_exit(&rseg->mutex);
if (!undo) {
return undo;
}
undo->dict_operation = undo_header[TRX_UNDO_DICT_TRANS];
- undo->table_id = mach_read_from_8(undo_header + TRX_UNDO_TABLE_ID);
undo->size = flst_get_len(TRX_UNDO_SEG_HDR + TRX_UNDO_PAGE_LIST
- + block->frame);
+ + block->page.frame);
fil_addr_t last_addr = flst_get_last(
- TRX_UNDO_SEG_HDR + TRX_UNDO_PAGE_LIST + block->frame);
+ TRX_UNDO_SEG_HDR + TRX_UNDO_PAGE_LIST + block->page.frame);
undo->last_page_no = last_addr.page;
undo->top_page_no = last_addr.page;
- const buf_block_t* last = trx_undo_page_get(
- page_id_t(rseg->space->id, undo->last_page_no), &mtr);
+ const buf_block_t* last = buf_page_get(
+ page_id_t(rseg->space->id, undo->last_page_no), 0,
+ RW_X_LATCH, &mtr);
+
+ if (UNIV_UNLIKELY(!last)) {
+ ut_free(undo);
+ goto corrupted;
+ }
if (const trx_undo_rec_t* rec = trx_undo_page_get_last_rec(
last, page_no, offset)) {
- undo->top_offset = static_cast<uint16_t>(rec - last->frame);
+ undo->top_offset = static_cast<uint16_t>(
+ rec - last->page.frame);
undo->top_undo_no = trx_undo_rec_get_undo_no(rec);
ut_ad(!undo->empty());
} else {
@@ -995,8 +1179,6 @@ trx_undo_mem_create(
{
trx_undo_t* undo;
- ut_ad(mutex_own(&(rseg->mutex)));
-
ut_a(id < TRX_RSEG_N_SLOTS);
undo = static_cast<trx_undo_t*>(ut_malloc_nokey(sizeof(*undo)));
@@ -1040,8 +1222,6 @@ trx_undo_mem_init_for_reuse(
const XID* xid, /*!< in: X/Open XA transaction identification*/
uint16_t offset) /*!< in: undo log header byte offset on page */
{
- ut_ad(mutex_own(&((undo->rseg)->mutex)));
-
ut_a(undo->id < TRX_RSEG_N_SLOTS);
undo->state = TRX_UNDO_ACTIVE;
@@ -1069,12 +1249,11 @@ trx_undo_create(trx_t* trx, trx_rseg_t* rseg, trx_undo_t** undo,
dberr_t* err, mtr_t* mtr)
{
ulint id;
+ buf_block_t* block = rseg->get(mtr, err);
- ut_ad(mutex_own(&(rseg->mutex)));
-
- buf_block_t* block = trx_undo_seg_create(
- rseg->space,
- trx_rsegf_get(rseg->space, rseg->page_no, mtr), &id, err, mtr);
+ if (block) {
+ block = trx_undo_seg_create(rseg->space, block, &id, err, mtr);
+ }
if (!block) {
return NULL;
@@ -1084,7 +1263,7 @@ trx_undo_create(trx_t* trx, trx_rseg_t* rseg, trx_undo_t** undo,
uint16_t offset = trx_undo_header_create(block, trx->id, mtr);
- *undo = trx_undo_mem_create(rseg, id, trx->id, trx->xid,
+ *undo = trx_undo_mem_create(rseg, id, trx->id, &trx->xid,
block->page.id().page_no(), offset);
if (*undo == NULL) {
*err = DB_OUT_OF_MEMORY;
@@ -1094,21 +1273,14 @@ trx_undo_create(trx_t* trx, trx_rseg_t* rseg, trx_undo_t** undo,
return block;
}
- switch (trx_get_dict_operation(trx)) {
- case TRX_DICT_OP_NONE:
- break;
- case TRX_DICT_OP_INDEX:
- /* Do not discard the table on recovery. */
- trx->table_id = 0;
- /* fall through */
- case TRX_DICT_OP_TABLE:
- (*undo)->table_id = trx->table_id;
- (*undo)->dict_operation = TRUE;
- mtr->write<1,mtr_t::MAYBE_NOP>(*block, block->frame + offset
+ if (trx->dict_operation) {
+ (*undo)->dict_operation = true;
+ mtr->write<1,mtr_t::MAYBE_NOP>(*block,
+ block->page.frame + offset
+ TRX_UNDO_DICT_TRANS, 1U);
- mtr->write<8,mtr_t::MAYBE_NOP>(*block, block->frame + offset
- + TRX_UNDO_TABLE_ID,
- trx->table_id);
+ mtr->write<8,mtr_t::MAYBE_NOP>(*block,
+ block->page.frame + offset
+ + TRX_UNDO_TABLE_ID, 0U);
}
*err = DB_SUCCESS;
@@ -1129,10 +1301,8 @@ buf_block_t*
trx_undo_reuse_cached(trx_t* trx, trx_rseg_t* rseg, trx_undo_t** pundo,
mtr_t* mtr)
{
- ut_ad(mutex_own(&rseg->mutex));
-
if (rseg->is_persistent()) {
- ut_ad(rseg->trx_ref_count);
+ ut_ad(rseg->is_referenced());
if (rseg->needs_purge <= trx->id) {
/* trx_purge_truncate_history() compares
rseg->needs_purge <= head.trx_no
@@ -1144,7 +1314,7 @@ trx_undo_reuse_cached(trx_t* trx, trx_rseg_t* rseg, trx_undo_t** pundo,
rseg->needs_purge = trx->id + 1;
}
} else {
- ut_ad(!rseg->trx_ref_count);
+ ut_ad(!rseg->is_referenced());
}
trx_undo_t* undo = UT_LIST_GET_FIRST(rseg->undo_cached);
@@ -1162,8 +1332,6 @@ trx_undo_reuse_cached(trx_t* trx, trx_rseg_t* rseg, trx_undo_t** pundo,
return NULL;
}
- buf_block_dbg_add_level(block, SYNC_TRX_UNDO_PAGE);
-
UT_LIST_REMOVE(rseg->undo_cached, undo);
MONITOR_DEC(MONITOR_NUM_UNDO_SLOT_CACHED);
@@ -1171,27 +1339,20 @@ trx_undo_reuse_cached(trx_t* trx, trx_rseg_t* rseg, trx_undo_t** pundo,
uint16_t offset = trx_undo_header_create(block, trx->id, mtr);
- trx_undo_mem_init_for_reuse(undo, trx->id, trx->xid, offset);
+ trx_undo_mem_init_for_reuse(undo, trx->id, &trx->xid, offset);
if (rseg != trx->rsegs.m_redo.rseg) {
return block;
}
- switch (trx_get_dict_operation(trx)) {
- case TRX_DICT_OP_NONE:
- return block;
- case TRX_DICT_OP_INDEX:
- /* Do not discard the table on recovery. */
- trx->table_id = 0;
- /* fall through */
- case TRX_DICT_OP_TABLE:
- undo->table_id = trx->table_id;
+ if (trx->dict_operation) {
undo->dict_operation = TRUE;
- mtr->write<1,mtr_t::MAYBE_NOP>(*block, block->frame + offset
+ mtr->write<1,mtr_t::MAYBE_NOP>(*block,
+ block->page.frame + offset
+ TRX_UNDO_DICT_TRANS, 1U);
- mtr->write<8,mtr_t::MAYBE_NOP>(*block, block->frame + offset
- + TRX_UNDO_TABLE_ID,
- trx->table_id);
+ mtr->write<8,mtr_t::MAYBE_NOP>(*block,
+ block->page.frame + offset
+ + TRX_UNDO_TABLE_ID, 0U);
}
return block;
@@ -1215,12 +1376,12 @@ trx_undo_assign(trx_t* trx, dberr_t* err, mtr_t* mtr)
return buf_page_get_gen(
page_id_t(undo->rseg->space->id, undo->last_page_no),
0, RW_X_LATCH, undo->guess_block,
- BUF_GET, __FILE__, __LINE__, mtr, err);
+ BUF_GET, mtr, err);
}
trx_rseg_t* rseg = trx->rsegs.m_redo.rseg;
- mutex_enter(&rseg->mutex);
+ rseg->latch.wr_lock(SRW_LOCK_CALL);
buf_block_t* block = trx_undo_reuse_cached(
trx, rseg, &trx->rsegs.m_redo.undo, mtr);
@@ -1238,7 +1399,7 @@ trx_undo_assign(trx_t* trx, dberr_t* err, mtr_t* mtr)
UT_LIST_ADD_FIRST(rseg->undo_list, trx->rsegs.m_redo.undo);
func_exit:
- mutex_exit(&rseg->mutex);
+ rseg->latch.wr_unlock();
return block;
}
@@ -1267,7 +1428,7 @@ trx_undo_assign_low(trx_t* trx, trx_rseg_t* rseg, trx_undo_t** undo,
return buf_page_get_gen(
page_id_t(rseg->space->id, (*undo)->last_page_no),
0, RW_X_LATCH, (*undo)->guess_block,
- BUF_GET, __FILE__, __LINE__, mtr, err);
+ BUF_GET, mtr, err);
}
DBUG_EXECUTE_IF(
@@ -1275,7 +1436,7 @@ trx_undo_assign_low(trx_t* trx, trx_rseg_t* rseg, trx_undo_t** undo,
*err = DB_TOO_MANY_CONCURRENT_TRXS; return NULL;
);
- mutex_enter(&rseg->mutex);
+ rseg->latch.wr_lock(SRW_LOCK_CALL);
buf_block_t* block = trx_undo_reuse_cached(trx, rseg, undo, mtr);
if (!block) {
@@ -1291,7 +1452,7 @@ trx_undo_assign_low(trx_t* trx, trx_rseg_t* rseg, trx_undo_t** undo,
UT_LIST_ADD_FIRST(rseg->undo_list, *undo);
func_exit:
- mutex_exit(&rseg->mutex);
+ rseg->latch.wr_unlock();
return block;
}
@@ -1304,22 +1465,25 @@ trx_undo_set_state_at_finish(
trx_undo_t* undo, /*!< in: undo log memory copy */
mtr_t* mtr) /*!< in: mtr */
{
- ut_a(undo->id < TRX_RSEG_N_SLOTS);
-
- buf_block_t* block = trx_undo_page_get(
- page_id_t(undo->rseg->space->id, undo->hdr_page_no), mtr);
-
- const uint16_t state = undo->size == 1
- && TRX_UNDO_PAGE_REUSE_LIMIT
- > mach_read_from_2(TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_FREE
- + block->frame)
- ? TRX_UNDO_CACHED
- : TRX_UNDO_TO_PURGE;
-
- undo->state = state;
- mtr->write<2>(*block, TRX_UNDO_SEG_HDR + TRX_UNDO_STATE
- + block->frame, state);
- return block;
+ ut_ad(undo->id < TRX_RSEG_N_SLOTS);
+
+ buf_block_t *block=
+ buf_page_get(page_id_t(undo->rseg->space->id, undo->hdr_page_no), 0,
+ RW_X_LATCH, mtr);
+ /* This function is invoked during transaction commit, which is not
+ allowed to fail. If we get a corrupted undo header, we will crash here. */
+ ut_a(block);
+ const uint16_t state = undo->size == 1 &&
+ TRX_UNDO_PAGE_REUSE_LIMIT >
+ mach_read_from_2(TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_FREE +
+ block->page.frame)
+ ? TRX_UNDO_CACHED
+ : TRX_UNDO_TO_PURGE;
+
+ undo->state= state;
+ mtr->write<2>(*block, TRX_UNDO_SEG_HDR + TRX_UNDO_STATE + block->page.frame,
+ state);
+ return block;
}
/** Set the state of the undo log segment at a XA PREPARE or XA ROLLBACK.
@@ -1333,27 +1497,36 @@ void trx_undo_set_state_at_prepare(trx_t *trx, trx_undo_t *undo, bool rollback,
{
ut_a(undo->id < TRX_RSEG_N_SLOTS);
- buf_block_t* block = trx_undo_page_get(
- page_id_t(undo->rseg->space->id, undo->hdr_page_no), mtr);
+ buf_block_t* block = buf_page_get(
+ page_id_t(undo->rseg->space->id, undo->hdr_page_no), 0,
+ RW_X_LATCH, mtr);
+ if (UNIV_UNLIKELY(!block)) {
+ /* In case of !rollback the undo header page
+ corruption would leave the transaction object in an
+ unexpected (active) state. */
+ ut_a(rollback);
+ return;
+ }
if (rollback) {
ut_ad(undo->state == TRX_UNDO_PREPARED);
mtr->write<2>(*block, TRX_UNDO_SEG_HDR + TRX_UNDO_STATE
- + block->frame, TRX_UNDO_ACTIVE);
+ + block->page.frame, TRX_UNDO_ACTIVE);
return;
}
/*------------------------------*/
ut_ad(undo->state == TRX_UNDO_ACTIVE);
undo->state = TRX_UNDO_PREPARED;
- undo->xid = *trx->xid;
+ undo->xid = trx->xid;
/*------------------------------*/
- mtr->write<2>(*block, TRX_UNDO_SEG_HDR + TRX_UNDO_STATE + block->frame,
- undo->state);
+ mtr->write<2>(*block, TRX_UNDO_SEG_HDR + TRX_UNDO_STATE
+ + block->page.frame, undo->state);
uint16_t offset = mach_read_from_2(TRX_UNDO_SEG_HDR + TRX_UNDO_LAST_LOG
- + block->frame);
- mtr->write<1>(*block, block->frame + offset + TRX_UNDO_XID_EXISTS, 1U);
+ + block->page.frame);
+ mtr->write<1>(*block, block->page.frame + offset + TRX_UNDO_XID_EXISTS,
+ 1U);
trx_undo_write_xid(block, offset, undo->xid, mtr);
}
@@ -1367,13 +1540,14 @@ void trx_undo_commit_cleanup(trx_undo_t *undo)
trx_rseg_t* rseg = undo->rseg;
ut_ad(rseg->space == fil_system.temp_space);
- mutex_enter(&rseg->mutex);
+ rseg->latch.wr_lock(SRW_LOCK_CALL);
UT_LIST_REMOVE(rseg->undo_list, undo);
if (undo->state == TRX_UNDO_CACHED) {
UT_LIST_ADD_FIRST(rseg->undo_cached, undo);
MONITOR_INC(MONITOR_NUM_UNDO_SLOT_CACHED);
+ undo = nullptr;
} else {
ut_ad(undo->state == TRX_UNDO_TO_PURGE);
@@ -1382,11 +1556,10 @@ void trx_undo_commit_cleanup(trx_undo_t *undo)
ut_ad(rseg->curr_size > undo->size);
rseg->curr_size -= undo->size;
-
- ut_free(undo);
}
- mutex_exit(&rseg->mutex);
+ rseg->latch.wr_unlock();
+ ut_free(undo);
}
/** At shutdown, frees the undo logs of a transaction. */
diff --git a/storage/innobase/unittest/CMakeLists.txt b/storage/innobase/unittest/CMakeLists.txt
index df98cddf73e..7dd7c111baa 100644
--- a/storage/innobase/unittest/CMakeLists.txt
+++ b/storage/innobase/unittest/CMakeLists.txt
@@ -14,9 +14,21 @@
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1335 USA
INCLUDE_DIRECTORIES(${CMAKE_SOURCE_DIR}/include
- ${CMAKE_SOURCE_DIR}/unittest/mytap
- ${CMAKE_SOURCE_DIR}/storage/innobase/include)
+ ${CMAKE_SOURCE_DIR}/unittest/mytap
+ ${CMAKE_SOURCE_DIR}/storage/innobase/include
+ ${CMAKE_SOURCE_DIR}/tpool)
ADD_EXECUTABLE(innodb_fts-t innodb_fts-t.cc)
TARGET_LINK_LIBRARIES(innodb_fts-t mysys mytap)
ADD_DEPENDENCIES(innodb_fts-t GenError)
MY_ADD_TEST(innodb_fts)
+# See explanation in innobase/CmakeLists.txt
+IF(CMAKE_SYSTEM_PROCESSOR MATCHES "ppc64|powerpc64|s390x")
+ ADD_COMPILE_FLAGS(
+ ../sync/srw_lock.cc
+ COMPILE_FLAGS "-mhtm"
+ )
+ENDIF()
+ADD_EXECUTABLE(innodb_sync-t innodb_sync-t.cc ../sync/srw_lock.cc)
+TARGET_LINK_LIBRARIES(innodb_sync-t mysys mytap)
+ADD_DEPENDENCIES(innodb_sync-t GenError)
+MY_ADD_TEST(innodb_sync)
diff --git a/storage/innobase/unittest/innodb_sync-t.cc b/storage/innobase/unittest/innodb_sync-t.cc
new file mode 100644
index 00000000000..d0289086b24
--- /dev/null
+++ b/storage/innobase/unittest/innodb_sync-t.cc
@@ -0,0 +1,185 @@
+/* Copyright (c) 2021, MariaDB Corporation.
+
+ This program is free software; you can redistribute it and/or
+ modify it under the terms of the GNU General Public License as
+ published by the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1335 USA */
+
+#include <thread>
+#include "tap.h"
+#include "my_sys.h"
+#include "sux_lock.h"
+
+static std::atomic<bool> critical;
+
+ulong srv_n_spin_wait_rounds= 30;
+uint srv_spin_wait_delay= 4;
+
+constexpr unsigned N_THREADS= 30;
+constexpr unsigned N_ROUNDS= 100;
+constexpr unsigned M_ROUNDS= 100;
+
+static srw_mutex m;
+
+static void test_srw_mutex()
+{
+ for (auto i= N_ROUNDS * M_ROUNDS; i--; )
+ {
+ m.wr_lock();
+ assert(!critical);
+ critical= true;
+ critical= false;
+ m.wr_unlock();
+ }
+}
+
+static srw_lock_low l;
+
+static void test_srw_lock()
+{
+ for (auto i= N_ROUNDS; i--; )
+ {
+ l.wr_lock();
+ assert(!critical);
+ critical= true;
+ critical= false;
+ l.wr_unlock();
+
+ for (auto j= M_ROUNDS; j--; )
+ {
+ l.rd_lock();
+ assert(!critical);
+ l.rd_unlock();
+ }
+ }
+}
+
+static ssux_lock_impl<false> ssux;
+
+static void test_ssux_lock()
+{
+ for (auto i= N_ROUNDS; i--; )
+ {
+ ssux.wr_lock();
+ assert(!critical);
+ critical= true;
+ critical= false;
+ ssux.wr_unlock();
+
+ for (auto j= M_ROUNDS; j--; )
+ {
+ ssux.rd_lock();
+ assert(!critical);
+ ssux.rd_unlock();
+ }
+
+ for (auto j= M_ROUNDS; j--; )
+ {
+ ssux.u_lock();
+ assert(!critical);
+ ssux.u_wr_upgrade();
+ assert(!critical);
+ critical= true;
+ critical= false;
+ ssux.wr_u_downgrade();
+ ssux.u_unlock();
+ }
+ }
+}
+
+static sux_lock<ssux_lock_impl<true>> sux;
+
+static void test_sux_lock()
+{
+ for (auto i= N_ROUNDS; i--; )
+ {
+ sux.x_lock();
+ assert(!critical);
+ critical= true;
+ for (auto j= M_ROUNDS; j--; )
+ sux.x_lock();
+ critical= false;
+ for (auto j= M_ROUNDS + 1; j--; )
+ sux.x_unlock();
+
+ for (auto j= M_ROUNDS; j--; )
+ {
+ sux.s_lock();
+ assert(!critical);
+ sux.s_unlock();
+ }
+
+ for (auto j= M_ROUNDS / 2; j--; )
+ {
+ sux.u_lock();
+ assert(!critical);
+ sux.u_lock();
+ sux.u_x_upgrade();
+ assert(!critical);
+ critical= true;
+ sux.x_unlock();
+ critical= false;
+ sux.x_u_downgrade();
+ sux.u_unlock();
+ }
+ }
+}
+
+int main(int argc __attribute__((unused)), char **argv)
+{
+ std::thread t[N_THREADS];
+
+ MY_INIT(argv[0]);
+
+ plan(4);
+
+ m.init();
+ for (auto i= N_THREADS; i--; )
+ t[i]= std::thread(test_srw_mutex);
+
+ for (auto i= N_THREADS; i--; )
+ t[i].join();
+
+ m.destroy();
+ ok(true, "srw_mutex");
+
+ l.init();
+
+ for (auto i= N_THREADS; i--; )
+ t[i]= std::thread(test_srw_lock);
+
+ for (auto i= N_THREADS; i--; )
+ t[i].join();
+
+ ok(true, "srw_lock");
+
+ l.destroy();
+
+ ssux.init();
+ for (auto i= N_THREADS; i--; )
+ t[i]= std::thread(test_ssux_lock);
+
+ for (auto i= N_THREADS; i--; )
+ t[i].join();
+
+ ok(true, "ssux_lock");
+ ssux.destroy();
+
+ sux.init();
+ for (auto i= N_THREADS; i--; )
+ t[i]= std::thread(test_sux_lock);
+
+ for (auto i= N_THREADS; i--; )
+ t[i].join();
+
+ ok(true, "sux_lock");
+ sux.free();
+}
diff --git a/storage/innobase/ut/ut0mem.cc b/storage/innobase/ut/ut0mem.cc
index faade827283..15d4da12a76 100644
--- a/storage/innobase/ut/ut0mem.cc
+++ b/storage/innobase/ut/ut0mem.cc
@@ -1,7 +1,7 @@
/*****************************************************************************
Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2019, MariaDB Corporation.
+Copyright (c) 2019, 2020, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -25,6 +25,7 @@ Created 5/11/1994 Heikki Tuuri
*************************************************************************/
#include "ut0mem.h"
+#include "ut0new.h"
/********************************************************************
Concatenate 3 strings.*/
diff --git a/storage/innobase/ut/ut0new.cc b/storage/innobase/ut/ut0new.cc
index 5e00a4ca0ea..a3ce1bdf3c7 100644
--- a/storage/innobase/ut/ut0new.cc
+++ b/storage/innobase/ut/ut0new.cc
@@ -25,7 +25,7 @@ Created May 26, 2014 Vasil Dimov
*******************************************************/
#include "univ.i"
-#include <algorithm>
+#include "ut0new.h"
/** The total amount of memory currently allocated from the operating
system with allocate_large(). */
Atomic_counter<ulint> os_total_large_mem_allocated;
diff --git a/storage/innobase/ut/ut0rbt.cc b/storage/innobase/ut/ut0rbt.cc
index cdd1ef06775..7ba6693cbc1 100644
--- a/storage/innobase/ut/ut0rbt.cc
+++ b/storage/innobase/ut/ut0rbt.cc
@@ -1,6 +1,7 @@
/***************************************************************************//**
Copyright (c) 2007, 2015, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2020, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -24,6 +25,7 @@ Created 2007-03-20 Sunny Bains
***********************************************************************/
#include "ut0rbt.h"
+#include "ut0new.h"
/**********************************************************************//**
Definition of a red-black tree
diff --git a/storage/innobase/ut/ut0ut.cc b/storage/innobase/ut/ut0ut.cc
index da99e87caa5..9c214e65bbd 100644
--- a/storage/innobase/ut/ut0ut.cc
+++ b/storage/innobase/ut/ut0ut.cc
@@ -32,7 +32,6 @@ Created 5/11/1994 Heikki Tuuri
#ifndef UNIV_INNOCHECKSUM
#include <mysql_com.h>
-#include "os0thread.h"
#include "ut0ut.h"
#include "trx0trx.h"
#include <string>
@@ -92,7 +91,7 @@ ut_print_timestamp(
#ifdef UNIV_INNOCHECKSUM
ulint{0}
#else
- ulint(os_thread_get_curr_id())
+ ulint(pthread_self())
#endif
);
}
@@ -364,10 +363,6 @@ ut_strerr(
return("Cluster not found");
case DB_TABLE_NOT_FOUND:
return("Table not found");
- case DB_MUST_GET_MORE_FILE_SPACE:
- return("More file space needed");
- case DB_TABLE_IS_BEING_USED:
- return("Table is being used");
case DB_TOO_BIG_RECORD:
return("Record too big");
case DB_TOO_BIG_INDEX_COL:
@@ -432,8 +427,6 @@ ut_strerr(
return("End of index");
case DB_IO_ERROR:
return("I/O error");
- case DB_TABLE_IN_FK_CHECK:
- return("Table is being used in foreign key check");
case DB_NOT_FOUND:
return("not found");
case DB_ONLINE_LOG_TOO_BIG:
@@ -479,57 +472,6 @@ ut_strerr(
return("Unknown error");
}
-#ifdef UNIV_PFS_MEMORY
-
-/** Extract the basename of a file without its extension.
-For example, extract "foo0bar" out of "/path/to/foo0bar.cc".
-@param[in] file file path, e.g. "/path/to/foo0bar.cc"
-@param[out] base result, e.g. "foo0bar"
-@param[in] base_size size of the output buffer 'base', if there
-is not enough space, then the result will be truncated, but always
-'\0'-terminated
-@return number of characters that would have been printed if the size
-were unlimited (not including the final ‘\0’) */
-size_t
-ut_basename_noext(
- const char* file,
- char* base,
- size_t base_size)
-{
- /* Assuming 'file' contains something like the following,
- extract the file name without the extenstion out of it by
- setting 'beg' and 'len'.
- ...mysql-trunk/storage/innobase/dict/dict0dict.cc:302
- ^-- beg, len=9
- */
-
- const char* beg = strrchr(file, OS_PATH_SEPARATOR);
-
- if (beg == NULL) {
- beg = file;
- } else {
- beg++;
- }
-
- size_t len = strlen(beg);
-
- const char* end = strrchr(beg, '.');
-
- if (end != NULL) {
- len = end - beg;
- }
-
- const size_t copy_len = std::min(len, base_size - 1);
-
- memcpy(base, beg, copy_len);
-
- base[copy_len] = '\0';
-
- return(len);
-}
-
-#endif /* UNIV_PFS_MEMORY */
-
namespace ib {
ATTRIBUTE_COLD logger& logger::operator<<(dberr_t err)
diff --git a/storage/innobase/ut/ut0wqueue.cc b/storage/innobase/ut/ut0wqueue.cc
index af56bb5cba0..f4af031f96c 100644
--- a/storage/innobase/ut/ut0wqueue.cc
+++ b/storage/innobase/ut/ut0wqueue.cc
@@ -38,10 +38,7 @@ ib_wqueue_create(void)
ib_wqueue_t* wq = static_cast<ib_wqueue_t*>(
ut_malloc_nokey(sizeof(*wq)));
- /* Function ib_wqueue_create() has not been used anywhere,
- not necessary to instrument this mutex */
-
- mutex_create(LATCH_ID_WORK_QUEUE, &wq->mutex);
+ mysql_mutex_init(0, &wq->mutex, nullptr);
wq->items = ib_list_create();
wq->length = 0;
@@ -56,7 +53,7 @@ ib_wqueue_free(
/*===========*/
ib_wqueue_t* wq) /*!< in: work queue */
{
- mutex_free(&wq->mutex);
+ mysql_mutex_destroy(&wq->mutex);
ib_list_free(wq->items);
ut_free(wq);
@@ -71,7 +68,7 @@ void
ib_wqueue_add(ib_wqueue_t* wq, void* item, mem_heap_t* heap, bool wq_locked)
{
if (!wq_locked) {
- mutex_enter(&wq->mutex);
+ mysql_mutex_lock(&wq->mutex);
}
ib_list_add_last(wq->items, item, heap);
@@ -79,7 +76,7 @@ ib_wqueue_add(ib_wqueue_t* wq, void* item, mem_heap_t* heap, bool wq_locked)
ut_ad(wq->length == ib_list_len(wq->items));
if (!wq_locked) {
- mutex_exit(&wq->mutex);
+ mysql_mutex_unlock(&wq->mutex);
}
}
@@ -93,7 +90,7 @@ ib_wqueue_nowait(
{
ib_list_node_t* node = NULL;
- mutex_enter(&wq->mutex);
+ mysql_mutex_lock(&wq->mutex);
if(!ib_list_is_empty(wq->items)) {
node = ib_list_get_first(wq->items);
@@ -105,7 +102,7 @@ ib_wqueue_nowait(
}
}
- mutex_exit(&wq->mutex);
+ mysql_mutex_unlock(&wq->mutex);
return (node ? node->data : NULL);
}
@@ -114,8 +111,8 @@ ib_wqueue_nowait(
@return whether the queue is empty */
bool ib_wqueue_is_empty(ib_wqueue_t* wq)
{
- mutex_enter(&wq->mutex);
+ mysql_mutex_lock(&wq->mutex);
bool is_empty = ib_list_is_empty(wq->items);
- mutex_exit(&wq->mutex);
+ mysql_mutex_unlock(&wq->mutex);
return is_empty;
}
diff --git a/tpool/CMakeLists.txt b/tpool/CMakeLists.txt
index 3e3f8e0b42a..3a49ea22837 100644
--- a/tpool/CMakeLists.txt
+++ b/tpool/CMakeLists.txt
@@ -1,17 +1,42 @@
-INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR} ${PROJECT_SOURCE_DIR}/include)
IF(WIN32)
SET(EXTRA_SOURCES tpool_win.cc aio_win.cc)
-ELSE()
- SET(EXTRA_SOURCES aio_linux.cc)
-ENDIF()
-
-IF(CMAKE_SYSTEM_NAME STREQUAL "Linux")
- CHECK_INCLUDE_FILES (libaio.h HAVE_LIBAIO_H)
- CHECK_LIBRARY_EXISTS(aio io_queue_init "" HAVE_LIBAIO)
- IF(HAVE_LIBAIO_H AND HAVE_LIBAIO)
- ADD_DEFINITIONS(-DLINUX_NATIVE_AIO=1)
- LINK_LIBRARIES(aio)
- ENDIF()
+ELSEIF(CMAKE_SYSTEM_NAME STREQUAL "Linux")
+ OPTION(WITH_URING "Require that io_uring be used" OFF)
+ OPTION(WITH_LIBAIO "Require that libaio is used, unless uring is there" OFF)
+ IF(WITH_URING)
+ SET(URING_REQUIRED REQUIRED)
+ ELSEIF(WITH_LIBAIO)
+ SET(LIBAIO_REQIRED REQUIRED)
+ ENDIF()
+ FIND_PACKAGE(URING QUIET ${URING_REQUIRED})
+ IF(URING_FOUND)
+ SET(URING_FOUND ${URING_FOUND} PARENT_SCOPE)
+ SET(TPOOL_DEFINES "-DHAVE_URING" PARENT_SCOPE)
+ ADD_DEFINITIONS(-DHAVE_URING)
+ LINK_LIBRARIES(${URING_LIBRARIES})
+ INCLUDE_DIRECTORIES(${URING_INCLUDE_DIR})
+ SET(EXTRA_SOURCES aio_liburing.cc)
+ SET(CMAKE_REQUIRED_INCLUDES_SAVE ${CMAKE_REQUIRED_INCLUDES})
+ SET(CMAKE_REQUIRED_LIBRARIES_SAVE ${CMAKE_REQUIRED_LIBRARIES})
+ SET(CMAKE_REQUIRED_INCLUDES ${URING_INCLUDE_DIR})
+ SET(CMAKE_REQUIRED_LIBRARIES ${URING_LIBRARIES})
+ CHECK_SYMBOL_EXISTS(io_uring_mlock_size "liburing.h" HAVE_IO_URING_MLOCK_SIZE)
+ SET(CMAKE_REQUIRED_INCLUDES ${CMAKE_REQUIRED_INCLUDES_SAVE})
+ SET(CMAKE_REQUIRED_LIBRARIES ${CMAKE_REQUIRED_LIBRARIES_SAVE})
+ IF(HAVE_IO_URING_MLOCK_SIZE)
+ SET_SOURCE_FILES_PROPERTIES(aio_liburing.cc PROPERTIES COMPILE_FLAGS "-DHAVE_IO_URING_MLOCK_SIZE")
+ ENDIF()
+ ELSE()
+ FIND_PACKAGE(LIBAIO QUIET ${LIBAIO_REQUIRED})
+ IF(LIBAIO_FOUND)
+ SET(TPOOL_DEFINES "-DLINUX_NATIVE_AIO" PARENT_SCOPE)
+ ADD_DEFINITIONS(-DLINUX_NATIVE_AIO)
+ INCLUDE_DIRECTORIES(${LIBAIO_INCLUDE_DIR})
+ LINK_LIBRARIES(${LIBAIO_LIBRARIES})
+ SET(EXTRA_SOURCES aio_linux.cc)
+ ENDIF()
+ ENDIF()
ENDIF()
ADD_LIBRARY(tpool STATIC
@@ -26,4 +51,6 @@ ADD_LIBRARY(tpool STATIC
${EXTRA_SOURCES}
)
-INCLUDE_DIRECTORIES(${PROJECT_SOURCE_DIR}/include) \ No newline at end of file
+IF(URING_FOUND)
+ ADD_DEPENDENCIES(tpool GenError)
+ENDIF()
diff --git a/tpool/aio_liburing.cc b/tpool/aio_liburing.cc
new file mode 100644
index 00000000000..447c2335c74
--- /dev/null
+++ b/tpool/aio_liburing.cc
@@ -0,0 +1,209 @@
+/* Copyright (C) 2021, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute itand /or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111 - 1301 USA*/
+
+#include "tpool_structs.h"
+#include "tpool.h"
+#include "mysql/service_my_print_error.h"
+#include "mysqld_error.h"
+
+#include <liburing.h>
+
+#include <algorithm>
+#include <vector>
+#include <thread>
+#include <mutex>
+#include <stdexcept>
+
+namespace
+{
+
+class aio_uring final : public tpool::aio
+{
+public:
+ aio_uring(tpool::thread_pool *tpool, int max_aio) : tpool_(tpool)
+ {
+ if (io_uring_queue_init(max_aio, &uring_, 0) != 0)
+ {
+ switch (const auto e= errno) {
+ case ENOMEM:
+ my_printf_error(ER_UNKNOWN_ERROR,
+ "io_uring_queue_init() failed with ENOMEM:"
+ " try larger memory locked limit, ulimit -l"
+ ", or https://mariadb.com/kb/en/systemd/#configuring-limitmemlock"
+ " under systemd"
+#ifdef HAVE_IO_URING_MLOCK_SIZE
+ " (%zd bytes required)", ME_ERROR_LOG | ME_WARNING,
+ io_uring_mlock_size(max_aio, 0));
+#else
+ , ME_ERROR_LOG | ME_WARNING);
+#endif
+ break;
+ case ENOSYS:
+ my_printf_error(ER_UNKNOWN_ERROR,
+ "io_uring_queue_init() failed with ENOSYS:"
+ " check seccomp filters, and the kernel version "
+ "(newer than 5.1 required)",
+ ME_ERROR_LOG | ME_WARNING);
+ break;
+ default:
+ my_printf_error(ER_UNKNOWN_ERROR,
+ "io_uring_queue_init() failed with errno %d",
+ ME_ERROR_LOG | ME_WARNING, e);
+ }
+ throw std::runtime_error("aio_uring()");
+ }
+ if (io_uring_ring_dontfork(&uring_) != 0)
+ {
+ my_printf_error(ER_UNKNOWN_ERROR,
+ "io_uring_dontfork() failed with errno %d (continuing)",
+ ME_ERROR_LOG | ME_WARNING, errno);
+ }
+
+ thread_= std::thread(thread_routine, this);
+ }
+
+ ~aio_uring() noexcept
+ {
+ {
+ std::lock_guard<std::mutex> _(mutex_);
+ io_uring_sqe *sqe= io_uring_get_sqe(&uring_);
+ io_uring_prep_nop(sqe);
+ io_uring_sqe_set_data(sqe, nullptr);
+ auto ret= io_uring_submit(&uring_);
+ if (ret != 1)
+ {
+ my_printf_error(ER_UNKNOWN_ERROR,
+ "io_uring_submit() returned %d during shutdown:"
+ " this may cause a hang\n",
+ ME_ERROR_LOG | ME_FATAL, ret);
+ abort();
+ }
+ }
+ thread_.join();
+ io_uring_queue_exit(&uring_);
+ }
+
+ int submit_io(tpool::aiocb *cb) final
+ {
+ cb->iov_base= cb->m_buffer;
+ cb->iov_len= cb->m_len;
+
+ // The whole operation since io_uring_get_sqe() and till io_uring_submit()
+ // must be atomical. This is because liburing provides thread-unsafe calls.
+ std::lock_guard<std::mutex> _(mutex_);
+
+ io_uring_sqe *sqe= io_uring_get_sqe(&uring_);
+ if (cb->m_opcode == tpool::aio_opcode::AIO_PREAD)
+ io_uring_prep_readv(sqe, cb->m_fh, static_cast<struct iovec *>(cb), 1,
+ cb->m_offset);
+ else
+ io_uring_prep_writev(sqe, cb->m_fh, static_cast<struct iovec *>(cb), 1,
+ cb->m_offset);
+ io_uring_sqe_set_data(sqe, cb);
+
+ return io_uring_submit(&uring_) == 1 ? 0 : -1;
+ }
+
+ int bind(native_file_handle &fd) final
+ {
+ std::lock_guard<std::mutex> _(files_mutex_);
+ auto it= std::lower_bound(files_.begin(), files_.end(), fd);
+ assert(it == files_.end() || *it != fd);
+ files_.insert(it, fd);
+ return io_uring_register_files_update(&uring_, 0, files_.data(),
+ files_.size());
+ }
+
+ int unbind(const native_file_handle &fd) final
+ {
+ std::lock_guard<std::mutex> _(files_mutex_);
+ auto it= std::lower_bound(files_.begin(), files_.end(), fd);
+ assert(*it == fd);
+ files_.erase(it);
+ return io_uring_register_files_update(&uring_, 0, files_.data(),
+ files_.size());
+ }
+
+private:
+ static void thread_routine(aio_uring *aio)
+ {
+ for (;;)
+ {
+ io_uring_cqe *cqe;
+ if (int ret= io_uring_wait_cqe(&aio->uring_, &cqe))
+ {
+ if (ret == -EINTR)
+ continue;
+ my_printf_error(ER_UNKNOWN_ERROR,
+ "io_uring_wait_cqe() returned %d\n",
+ ME_ERROR_LOG | ME_FATAL, ret);
+ abort();
+ }
+
+ auto *iocb= static_cast<tpool::aiocb*>(io_uring_cqe_get_data(cqe));
+ if (!iocb)
+ break; // ~aio_uring() told us to terminate
+
+ int res= cqe->res;
+ if (res < 0)
+ {
+ iocb->m_err= -res;
+ iocb->m_ret_len= 0;
+ }
+ else
+ {
+ iocb->m_err= 0;
+ iocb->m_ret_len= res;
+ }
+
+ io_uring_cqe_seen(&aio->uring_, cqe);
+ finish_synchronous(iocb);
+
+ // If we need to resubmit the IO operation, but the ring is full,
+ // we will follow the same path as for any other error codes.
+ if (res == -EAGAIN && !aio->submit_io(iocb))
+ continue;
+
+ iocb->m_internal_task.m_func= iocb->m_callback;
+ iocb->m_internal_task.m_arg= iocb;
+ iocb->m_internal_task.m_group= iocb->m_group;
+ aio->tpool_->submit_task(&iocb->m_internal_task);
+ }
+ }
+
+ io_uring uring_;
+ std::mutex mutex_;
+ tpool::thread_pool *tpool_;
+ std::thread thread_;
+
+ std::vector<native_file_handle> files_;
+ std::mutex files_mutex_;
+};
+
+} // namespace
+
+namespace tpool
+{
+
+aio *create_linux_aio(thread_pool *pool, int max_aio)
+{
+ try {
+ return new aio_uring(pool, max_aio);
+ } catch (std::runtime_error& error) {
+ return nullptr;
+ }
+}
+
+} // namespace tpool
diff --git a/tpool/aio_linux.cc b/tpool/aio_linux.cc
index 6997cbcccab..507c6b9264f 100644
--- a/tpool/aio_linux.cc
+++ b/tpool/aio_linux.cc
@@ -16,7 +16,6 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111 - 1301 USA*/
#include "tpool_structs.h"
#include "tpool.h"
-#ifdef LINUX_NATIVE_AIO
# include <thread>
# include <atomic>
# include <cstdio>
@@ -70,7 +69,6 @@ static int my_getevents(io_context_t ctx, long min_nr, long nr, io_event *ev)
}
return ret;
}
-#endif
/*
@@ -85,7 +83,6 @@ static int my_getevents(io_context_t ctx, long min_nr, long nr, io_event *ev)
*/
namespace tpool
{
-#ifdef LINUX_NATIVE_AIO
class aio_linux final : public aio
{
@@ -132,8 +129,7 @@ class aio_linux final : public aio
{
iocb->m_ret_len= event.res;
iocb->m_err= 0;
- if (iocb->m_ret_len != iocb->m_len)
- finish_synchronous(iocb);
+ finish_synchronous(iocb);
}
iocb->m_internal_task.m_func= iocb->m_callback;
iocb->m_internal_task.m_arg= iocb;
@@ -190,7 +186,4 @@ aio *create_linux_aio(thread_pool *pool, int max_io)
}
return new aio_linux(ctx, pool);
}
-#else
-aio *create_linux_aio(thread_pool*, int) { return nullptr; }
-#endif
}
diff --git a/tpool/task.cc b/tpool/task.cc
index 0b5253bc725..81ec88590ce 100644
--- a/tpool/task.cc
+++ b/tpool/task.cc
@@ -1,4 +1,4 @@
-/* Copyright (C) 2019, 2020, MariaDB Corporation.
+/* Copyright (C) 2019, 2021, MariaDB Corporation.
This program is free software; you can redistribute itand /or modify
it under the terms of the GNU General Public License as published by
@@ -21,21 +21,6 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111 - 1301 USA*/
namespace tpool
{
-
-#ifndef DBUG_OFF
-static callback_func_np after_task_callback;
-void set_after_task_callback(callback_func_np cb)
-{
- after_task_callback= cb;
-}
-
-void execute_after_task_callback()
-{
- if (after_task_callback)
- after_task_callback();
-}
-#endif
-
task::task(callback_func func, void* arg, task_group* group) :
m_func(func), m_arg(arg), m_group(group) {}
@@ -50,7 +35,6 @@ void execute_after_task_callback()
{
/* Execute directly. */
m_func(m_arg);
- dbug_execute_after_task_callback();
release();
}
}
diff --git a/tpool/task_group.cc b/tpool/task_group.cc
index 97fbb0911c8..b52fe7c0f67 100644
--- a/tpool/task_group.cc
+++ b/tpool/task_group.cc
@@ -53,7 +53,6 @@ namespace tpool
if (t)
{
t->m_func(t->m_arg);
- dbug_execute_after_task_callback();
t->release();
}
lk.lock();
diff --git a/tpool/tpool.h b/tpool/tpool.h
index 7ac6763ae23..87a0122adce 100644
--- a/tpool/tpool.h
+++ b/tpool/tpool.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2019, 2020, MariaDB Corporation.
+/* Copyright (C) 2019, 2021, MariaDB Corporation.
This program is free software; you can redistribute itand /or modify
it under the terms of the GNU General Public License as published by
@@ -22,6 +22,9 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111 - 1301 USA*/
#ifdef LINUX_NATIVE_AIO
#include <libaio.h>
#endif
+#ifdef HAVE_URING
+#include <sys/uio.h>
+#endif
#ifdef _WIN32
#ifndef NOMINMAX
#define NOMINMAX
@@ -114,7 +117,7 @@ enum class aio_opcode
AIO_PREAD,
AIO_PWRITE
};
-constexpr size_t MAX_AIO_USERDATA_LEN= 3 * sizeof(void*);
+constexpr size_t MAX_AIO_USERDATA_LEN= 4 * sizeof(void*);
/** IO control block, includes parameters for the IO, and the callback*/
@@ -123,6 +126,8 @@ struct aiocb
:OVERLAPPED
#elif defined LINUX_NATIVE_AIO
:iocb
+#elif defined HAVE_URING
+ :iovec
#endif
{
native_file_handle m_fh;
@@ -168,7 +173,17 @@ public:
protected:
static void synchronous(aiocb *cb);
/** finish a partial read/write callback synchronously */
- static void finish_synchronous(aiocb *cb);
+ static inline void finish_synchronous(aiocb *cb)
+ {
+ if (!cb->m_err && cb->m_ret_len != cb->m_len)
+ {
+ /* partial read/write */
+ cb->m_buffer= (char *) cb->m_buffer + cb->m_ret_len;
+ cb->m_len-= (unsigned int) cb->m_ret_len;
+ cb->m_offset+= cb->m_ret_len;
+ synchronous(cb);
+ }
+ }
};
class timer
@@ -183,18 +198,6 @@ class thread_pool;
extern aio *create_simulated_aio(thread_pool *tp);
-#ifndef DBUG_OFF
-/*
- This function is useful for debugging to make sure all mutexes are released
- inside a task callback
-*/
-void set_after_task_callback(callback_func_np cb);
-void execute_after_task_callback();
-#define dbug_execute_after_task_callback() execute_after_task_callback()
-#else
-#define dbug_execute_after_task_callback() do{}while(0)
-#endif
-
class thread_pool
{
protected:
diff --git a/tpool/tpool_generic.cc b/tpool/tpool_generic.cc
index 8dbd7c94d30..5d99783e8b9 100644
--- a/tpool/tpool_generic.cc
+++ b/tpool/tpool_generic.cc
@@ -1,4 +1,4 @@
-/* Copyright (C) 2019, 2020, MariaDB Corporation.
+/* Copyright (C) 2019, 2022, MariaDB Corporation.
This program is free software; you can redistribute itand /or modify
it under the terms of the GNU General Public License as published by
@@ -33,12 +33,17 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111 - 1301 USA*/
#include <my_dbug.h>
#include <thr_timer.h>
#include <stdlib.h>
+#include "aligned.h"
namespace tpool
{
#ifdef __linux__
+#if defined(HAVE_URING) || defined(LINUX_NATIVE_AIO)
extern aio* create_linux_aio(thread_pool* tp, int max_io);
+#else
+ aio *create_linux_aio(thread_pool *, int) { return nullptr; };
+#endif
#endif
#ifdef _WIN32
extern aio* create_win_aio(thread_pool* tp, int max_io);
@@ -81,25 +86,12 @@ void aio::synchronous(aiocb *cb)
#endif
cb->m_ret_len = ret_len;
cb->m_err = err;
- if (!err && cb->m_ret_len != cb->m_len)
+ if (ret_len)
finish_synchronous(cb);
}
/**
- A partial read/write has occured, continue synchronously.
-*/
-void aio::finish_synchronous(aiocb *cb)
-{
- assert(cb->m_ret_len != (unsigned int) cb->m_len && !cb->m_err);
- /* partial read/write */
- cb->m_buffer= (char *) cb->m_buffer + cb->m_ret_len;
- cb->m_len-= (unsigned int) cb->m_ret_len;
- cb->m_offset+= cb->m_ret_len;
- synchronous(cb);
-}
-
-/**
Implementation of generic threadpool.
This threadpool consists of the following components
@@ -137,7 +129,7 @@ enum worker_wake_reason
/* A per-worker thread structure.*/
-struct MY_ALIGNED(CPU_LEVEL1_DCACHE_LINESIZE) worker_data
+struct alignas(CPU_LEVEL1_DCACHE_LINESIZE) worker_data
{
/** Condition variable to wakeup this worker.*/
std::condition_variable m_cv;
@@ -189,23 +181,13 @@ struct MY_ALIGNED(CPU_LEVEL1_DCACHE_LINESIZE) worker_data
{}
/*Define custom new/delete because of overaligned structure. */
- void* operator new(size_t size)
+ static void *operator new(size_t size)
{
-#ifdef _WIN32
- return _aligned_malloc(size, CPU_LEVEL1_DCACHE_LINESIZE);
-#else
- void* ptr;
- int ret = posix_memalign(&ptr, CPU_LEVEL1_DCACHE_LINESIZE, size);
- return ret ? 0 : ptr;
-#endif
+ return aligned_malloc(size, CPU_LEVEL1_DCACHE_LINESIZE);
}
- void operator delete(void* p)
+ static void operator delete(void* p)
{
-#ifdef _WIN32
- _aligned_free(p);
-#else
- free(p);
-#endif
+ aligned_free(p);
}
};
@@ -359,7 +341,6 @@ public:
do
{
m_callback(m_data);
- dbug_execute_after_task_callback();
}
while (m_running.fetch_sub(1, std::memory_order_release) != 1);
diff --git a/tpool/tpool_structs.h b/tpool/tpool_structs.h
index b49204f2d75..550a92d6e58 100644
--- a/tpool/tpool_structs.h
+++ b/tpool/tpool_structs.h
@@ -14,14 +14,13 @@ along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111 - 1301 USA*/
#pragma once
+#include <my_global.h>
+#include <my_pthread.h>
#include <vector>
#include <stack>
-#include <mutex>
-#include <condition_variable>
#include <assert.h>
#include <algorithm>
-
/* Suppress TSAN warnings, that we believe are not critical. */
#if defined(__has_feature)
#define TPOOL_HAS_FEATURE(...) __has_feature(__VA_ARGS__)
@@ -37,6 +36,11 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111 - 1301 USA*/
#define TPOOL_SUPPRESS_TSAN
#endif
+#ifdef HAVE_PSI_INTERFACE
+typedef unsigned int mysql_pfs_key_t;
+extern mysql_pfs_key_t tpool_cache_mutex_key;
+#endif
+
namespace tpool
{
@@ -55,13 +59,13 @@ namespace tpool
template<typename T> class cache
{
/** Protects updates of m_pos and m_cache members */
- std::mutex m_mtx;
+ mysql_mutex_t m_mtx;
/**
Notify waiting threads about "cache full" or "cache not empty" conditions
@see get() and wait()
*/
- std::condition_variable m_cv;
+ pthread_cond_t m_cv;
/** Cached items vector.Does not change after construction */
std::vector<T> m_base;
@@ -108,13 +112,22 @@ public:
Constructor
@param size - maximum number of items in cache
*/
- cache(size_t size) : m_mtx(), m_cv(), m_base(size), m_cache(size),
+ cache(size_t size) : m_base(size), m_cache(size),
m_waiters(), m_pos(0)
{
+ mysql_mutex_init(tpool_cache_mutex_key, &m_mtx, nullptr);
+ pthread_cond_init(&m_cv, nullptr);
+
for(size_t i= 0 ; i < size; i++)
m_cache[i]= &m_base[i];
}
+ ~cache()
+ {
+ mysql_mutex_destroy(&m_mtx);
+ pthread_cond_destroy(&m_cv);
+ }
+
/**
Retrieve an item from cache. Waits for free item, if cache is
currently empty.
@@ -122,21 +135,25 @@ public:
*/
T* get()
{
- std::unique_lock<std::mutex> lk(m_mtx);
- while(is_empty())
- m_cv.wait(lk);
+ mysql_mutex_lock(&m_mtx);
+ while (is_empty())
+ my_cond_wait(&m_cv, &m_mtx.m_mutex);
assert(m_pos < capacity());
// return last element
- return m_cache[m_pos++];
+ T *t= m_cache[m_pos++];
+ mysql_mutex_unlock(&m_mtx);
+ return t;
}
- /**
- Put back an item to cache.
- @param item - item to put back
+ mysql_mutex_t &mutex() { return m_mtx; }
+
+ /**
+ Put back an element to cache.
+ @param ele element to put back
*/
void put(T *ele)
{
- std::unique_lock<std::mutex> lk(m_mtx);
+ mysql_mutex_lock(&m_mtx);
assert(!is_full());
// put element to the logical end of the array
m_cache[--m_pos] = ele;
@@ -144,7 +161,8 @@ public:
/* Notify waiters when the cache becomes
not empty, or when it becomes full */
if (m_pos == 1 || (m_waiters && is_full()))
- m_cv.notify_all();
+ pthread_cond_broadcast(&m_cv);
+ mysql_mutex_unlock(&m_mtx);
}
/** Check if pointer represents cached element */
@@ -154,16 +172,25 @@ public:
return ele >= &m_base[0] && ele <= &m_base[capacity() - 1];
}
- /** Wait until cache is full.*/
- void wait()
+ /** Wait until cache is full
+ @param m cache mutex (locked) */
+ void wait(mysql_mutex_t &m)
{
- std::unique_lock<std::mutex> lk(m_mtx);
+ mysql_mutex_assert_owner(&m);
m_waiters++;
- while(!is_full())
- m_cv.wait(lk);
+ while (!is_full())
+ my_cond_wait(&m_cv, &m.m_mutex);
m_waiters--;
}
+ /* Wait until cache is full.*/
+ void wait()
+ {
+ mysql_mutex_lock(&m_mtx);
+ wait(m_mtx);
+ mysql_mutex_unlock(&m_mtx);
+ }
+
/**
@return approximate number of "borrowed" items.
A "dirty" read, not used in any critical functionality.
diff --git a/tpool/tpool_win.cc b/tpool/tpool_win.cc
index 09fd49d9411..88168b26eff 100644
--- a/tpool/tpool_win.cc
+++ b/tpool/tpool_win.cc
@@ -1,4 +1,4 @@
-/* Copyright(C) 2019 MariaDB
+/* Copyright (C) 2019, 2021, MariaDB Corporation.
This program is free software; you can redistribute itand /or modify
it under the terms of the GNU General Public License as published by
@@ -93,7 +93,6 @@ class thread_pool_win : public thread_pool
return;
}
timer->m_func(timer->m_data);
- dbug_execute_after_task_callback();
if (timer->m_period)
timer->set_time(timer->m_period, timer->m_period);
}